gateway/modules/services/serviceGeneration/renderers/rendererHtml.py
2025-12-30 11:11:31 +01:00

826 lines
39 KiB
Python

# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
"""
HTML renderer for report generation.
"""
from .rendererBaseTemplate import BaseRenderer
from modules.datamodels.datamodelDocument import RenderedDocument
from typing import Dict, Any, List
class RendererHtml(BaseRenderer):
    """Render structured JSON report content as an HTML document.

    The renderer turns the sections of the extracted content into HTML,
    applies a style set as inline CSS and returns images found in image
    sections as separate documents next to the HTML file.
    """
@classmethod
def getSupportedFormats(cls) -> List[str]:
    """List the output format identifiers this renderer can produce."""
    supportedFormats = ['html', 'htm']
    return supportedFormats
@classmethod
def getFormatAliases(cls) -> List[str]:
    """List alternative names under which this renderer can be requested."""
    aliases = ['web', 'webpage']
    return aliases
@classmethod
def getPriority(cls) -> int:
    """Report the priority value of the HTML renderer."""
    htmlRendererPriority = 100
    return htmlRendererPriority
async def render(self, extractedContent: Dict[str, Any], title: str, userPrompt: str = None, aiService=None) -> List[RenderedDocument]:
    """
    Produce the HTML document plus one extra document per extracted image.

    Args:
        extractedContent: Structured JSON content (metadata, documents, sections).
        title: Fallback title, also used to derive the HTML filename.
        userPrompt: Optional user prompt forwarded to the style resolution.
        aiService: Optional AI service forwarded to the style resolution.

    Returns:
        [HTML document, image1, image2, ...]. Images whose base64 payload
        is empty or cannot be decoded are left out.
    """
    import base64

    # Pull the images out first and keep them for getRenderedImages()
    extractedImages = self._extractImages(extractedContent)
    self._renderedImages = extractedImages

    markup = await self._generateHtmlFromJson(extractedContent, title, userPrompt, aiService)
    if extractedImages:
        # Swap the embedded data URIs for relative file references
        markup = self._replaceImageDataUris(markup, extractedImages)

    # The first document's filename wins; otherwise derive one from the title
    documentList = extractedContent.get("documents", [])
    firstDocument = documentList[0] if documentList else None
    htmlFilename = firstDocument.get("filename") if isinstance(firstDocument, dict) else None
    if not htmlFilename:
        htmlFilename = self._determineFilename(title, "text/html")

    metadata = extractedContent.get("metadata", {}) if extractedContent else {}
    metadataIsDict = isinstance(metadata, dict)

    htmlDocument = RenderedDocument(
        documentData=markup.encode('utf-8'),
        mimeType="text/html",
        filename=htmlFilename,
        documentType=metadata.get("documentType") if metadataIsDict else None,
        metadata=metadata if metadataIsDict else None
    )
    resultDocuments = [htmlDocument]

    for imageInfo in extractedImages:
        encodedImage = imageInfo.get("base64Data", "")
        if not encodedImage:
            continue
        filename = imageInfo.get("filename", f"image_{len(resultDocuments)}.png")
        mimeType = imageInfo.get("mimeType", "image/png")
        try:
            imageBytes = base64.b64decode(encodedImage)
            imageDocument = RenderedDocument(
                documentData=imageBytes,
                mimeType=mimeType,
                filename=filename
            )
            resultDocuments.append(imageDocument)
            self.logger.debug(f"Added image file: {filename} ({len(imageBytes)} bytes)")
        except Exception as e:
            # One broken image must not prevent delivery of the HTML document
            self.logger.warning(f"Error creating image file {filename}: {str(e)}")

    return resultDocuments
async def _generateHtmlFromJson(self, jsonContent: Dict[str, Any], title: str, userPrompt: str = None, aiService=None) -> str:
    """Generate the complete HTML page from the structured JSON document.

    Args:
        jsonContent: Content in the standardized schema
            {metadata: {...}, documents: [{sections: [...]}]}.
        title: Fallback title used when the metadata carries no title.
        userPrompt: Optional user prompt forwarded to the style resolution.
        aiService: Optional AI service forwarded to the style resolution.

    Returns:
        The HTML document as a single string.

    Raises:
        Exception: If validation or rendering fails; the original error is
            attached as ``__cause__``.
    """
    import html

    try:
        # Validate first so invalid input never triggers the (possibly AI-backed) style lookup
        if not self._validateJsonStructure(jsonContent):
            raise ValueError("JSON content must follow standardized schema: {metadata: {...}, documents: [{sections: [...]}]}")

        styles = await self._getStyleSet(jsonContent, userPrompt, aiService)
        sections = self._extractSections(jsonContent)
        metadata = self._extractMetadata(jsonContent)

        # Metadata title wins; an empty/None title falls back to the given one.
        # The title is escaped because it ends up in <title> and <h1>.
        documentTitle = html.escape(str(metadata.get("title") or title or ""))

        htmlParts = [
            '<!DOCTYPE html>',
            '<html lang="en">',
            '<head>',
            '<meta charset="UTF-8">',
            '<meta name="viewport" content="width=device-width, initial-scale=1.0">',
            f'<title>{documentTitle}</title>',
            '<style>',
            self._generateCssStyles(styles),
            '</style>',
            '</head>',
            '<body>',
            f'<header><h1 class="document-title">{documentTitle}</h1></header>',
            '<main>',
        ]

        # Sections that render to an empty string are skipped
        for section in sections:
            sectionHtml = self._renderJsonSection(section, styles)
            if sectionHtml:
                htmlParts.append(sectionHtml)

        htmlParts.extend([
            '</main>',
            '<footer>',
            f'<p class="generated-info">Generated: {self._formatTimestamp()}</p>',
            '</footer>',
            '</body>',
            '</html>',
        ])
        return '\n'.join(htmlParts)
    except Exception as e:
        self.logger.error(f"Error generating HTML from JSON: {str(e)}")
        raise Exception(f"HTML generation failed: {str(e)}") from e
async def _getStyleSet(self, extractedContent: Dict[str, Any] = None, userPrompt: str = None, aiService=None, templateName: str = None) -> Dict[str, Any]:
    """Resolve the style set used for rendering.

    Styles delivered in the document generation metadata take precedence.
    IMPORTANT: styling is expected to come from document generation; the
    renderer only asks the AI for styles when none were provided and both
    a user prompt and an AI service are available.

    Args:
        extractedContent: Document content whose metadata may contain styles.
        userPrompt: User prompt handed to the AI style enhancement.
        aiService: AI service, used only for the fallback enhancement.
        templateName: Name of a template style set (not evaluated here).

    Returns:
        Dict with style definitions for the document elements.
    """
    baseStyles = self._getDefaultStyleSet()

    # Preferred source: styles shipped with the document generation metadata
    providedMetadata = extractedContent.get("metadata", {}) if extractedContent else None
    if isinstance(providedMetadata, dict):
        providedStyles = providedMetadata.get("styles")
        if providedStyles and isinstance(providedStyles, dict):
            self.logger.debug("Using styles from document generation metadata")
            return self._validateStylesContrast(providedStyles)

    # Without prompt or AI service there is nothing to enhance
    if not (userPrompt and aiService):
        return baseStyles

    self.logger.info("Styles not in metadata, enhancing with AI based on user prompt...")
    aiStyles = await self._enhanceStylesWithAI(userPrompt, baseStyles, aiService)
    return self._validateStylesContrast(aiStyles)
async def _enhanceStylesWithAI(self, userPrompt: str, defaultStyleSet: Dict[str, Any], aiService) -> Dict[str, Any]:
    """Ask the AI service for a style set derived from the user prompt.

    Any failure is logged and answered with the unchanged default style set.
    """
    try:
        promptTemplate = self._createAiStyleTemplate("html", userPrompt, defaultStyleSet)
        return await self._getAiStyles(aiService, promptTemplate, defaultStyleSet)
    except Exception as e:
        self.logger.warning(f"AI style enhancement failed: {str(e)}, using default styles")
        return defaultStyleSet
def _validateStylesContrast(self, styles: Dict[str, Any]) -> Dict[str, Any]:
    """Repair table styles whose text color equals the background color.

    Text drawn in the same color as its background is unreadable, so for
    ``table_header`` and ``table_cell`` such a pair is replaced by a fixed
    readable combination. Colors are compared case-insensitively, with
    shorthand hex (``#FFF``) expanded and ``white``/``black`` mapped to hex.
    Entries that are not dicts and color values that are not strings are
    left untouched.

    Args:
        styles: Style set to check; it is modified in place.

    Returns:
        The (possibly corrected) style set, or the default style set if
        validation fails unexpectedly.
    """
    namedColors = {"WHITE": "#FFFFFF", "BLACK": "#000000"}

    def canonicalColor(value):
        """Return a comparable form of a CSS color, or None for non-strings."""
        if not isinstance(value, str):
            return None
        color = value.strip().upper()
        color = namedColors.get(color, color)
        # Expand shorthand hex notation: #ABC -> #AABBCC
        if len(color) == 4 and color.startswith("#"):
            color = "#" + "".join(ch * 2 for ch in color[1:])
        return color

    # (style key, replacement background, replacement text color)
    contrastFallbacks = (
        ("table_header", "#4F4F4F", "#FFFFFF"),
        ("table_cell", "#FFFFFF", "#2F2F2F"),
    )

    try:
        for styleKey, safeBackground, safeColor in contrastFallbacks:
            entry = styles.get(styleKey)
            if not isinstance(entry, dict):
                continue
            # Missing values default to white background / black text
            background = canonicalColor(entry.get("background", "#FFFFFF"))
            textColor = canonicalColor(entry.get("color", "#000000"))
            if background is not None and background == textColor:
                entry["background"] = safeBackground
                entry["color"] = safeColor
        return styles
    except Exception as e:
        self.logger.warning(f"Style validation failed: {str(e)}")
        return self._getDefaultStyleSet()
def _getDefaultStyleSet(self) -> Dict[str, Any]:
    """Build the built-in HTML style set used when no other styles apply.

    Returns:
        A fresh dict mapping element names (title, heading1, paragraph,
        table, ...) to dicts of CSS-like properties written with underscores.
    """
    # Values shared by several elements
    darkText = "#2F2F2F"
    midGray = "#4F4F4F"
    white = "#FFFFFF"
    thinBorder = "1px solid #ddd"
    blockMargin = "1em 0"
    flowMargin = "0 0 1em 0"

    defaults = {}
    defaults["title"] = dict(font_size="2.5em", color="#1F4E79", font_weight="bold", text_align="center", margin=flowMargin)
    defaults["heading1"] = dict(font_size="2em", color=darkText, font_weight="bold", text_align="left", margin="1.5em 0 0.5em 0")
    defaults["heading2"] = dict(font_size="1.5em", color=midGray, font_weight="bold", text_align="left", margin="1em 0 0.5em 0")
    defaults["paragraph"] = dict(font_size="1em", color=darkText, font_weight="normal", text_align="left", margin=flowMargin, line_height="1.6")
    defaults["table"] = dict(border=thinBorder, border_collapse="collapse", width="100%", margin=blockMargin)
    defaults["table_header"] = dict(background=midGray, color=white, font_weight="bold", text_align="center", padding="12px")
    defaults["table_cell"] = dict(background=white, color=darkText, font_weight="normal", text_align="left", padding="8px", border=thinBorder)
    defaults["bullet_list"] = dict(font_size="1em", color=darkText, margin=flowMargin, padding_left="20px")
    defaults["code_block"] = dict(font_family="Courier New, monospace", font_size="0.9em", color=darkText, background="#F5F5F5", padding="1em", border=thinBorder, border_radius="4px", margin=blockMargin)
    defaults["image"] = dict(max_width="100%", height="auto", margin=blockMargin, border_radius="4px")
    defaults["body"] = dict(font_family="Arial, sans-serif", background=white, color=darkText, margin="0", padding="20px")
    return defaults
def _generateCssStyles(self, styles: Dict[str, Any]) -> str:
    """Translate the style set into the CSS text placed in the <style> tag.

    Every known style key is written as one CSS rule (an empty rule if the
    key is missing); underscores in property names become hyphens. A fixed
    rule for the ".generated-info" footer line is appended at the end.

    Args:
        styles: Mapping of style key to a dict of property/value pairs.

    Returns:
        The CSS rules joined by newlines.
    """
    def ruleLines(selector, declarations):
        """Return the lines of a single CSS rule."""
        rule = [f"{selector} {{"]
        rule.extend(f" {name.replace('_', '-')}: {value};" for name, value in declarations.items())
        rule.append("}")
        return rule

    # Style key -> CSS selector, in output order
    selectorByStyleKey = (
        ("body", "body"),
        ("title", ".document-title"),
        ("heading1", "h1"),
        ("heading2", "h2"),
        ("paragraph", "p"),
        ("table", "table"),
        ("table_header", "th"),
        ("table_cell", "td"),
        ("bullet_list", "ul"),
        ("code_block", "pre"),
        ("image", "img"),
    )

    cssLines = []
    for styleKey, selector in selectorByStyleKey:
        cssLines.extend(ruleLines(selector, styles.get(styleKey, {})))

    # Footer line styling is fixed and not part of the style set
    cssLines.extend([
        ".generated-info {",
        " font-size: 0.9em;",
        " color: #666;",
        " text-align: center;",
        " margin-top: 2em;",
        " padding-top: 1em;",
        " border-top: 1px solid #ddd;",
        "}",
    ])
    return '\n'.join(cssLines)
def _renderJsonSection(self, section: Dict[str, Any], styles: Dict[str, Any]) -> str:
    """Render a single JSON section to HTML.

    IMPORTANT: the section type (content_type) decides how the section is
    rendered; element types are only looked at inside that decision.

    - table / bullet_list / heading / code_block / image: the first element
      of the section is passed to the dedicated renderer.
    - paragraph and unknown types: every element is rendered in order.
      ``reference`` and ``extracted_text`` elements get their own markup,
      other elements are rendered as paragraphs. Unknown types with list
      data therefore really fall back to paragraph rendering instead of
      producing no output.

    All text taken from the section is HTML-escaped.

    Args:
        section: The section to render.
        styles: Style set, forwarded to the element renderers.

    Returns:
        The HTML fragment, an empty string if there is nothing to render,
        or an error <div> if rendering raised.
    """
    import html

    # Section types whose first element is handed to a dedicated renderer
    singleElementRenderers = {
        "table": "_renderJsonTable",
        "bullet_list": "_renderJsonBulletList",
        "heading": "_renderJsonHeading",
        "code_block": "_renderJsonCodeBlock",
        "image": "_renderJsonImage",
    }

    def renderSpecialElement(element):
        """Return markup for reference/extracted_text elements, None for all others."""
        if not isinstance(element, dict):
            return None
        elementType = element.get("type", "")
        if elementType == "reference":
            label = html.escape(str(element.get("label", "Reference")))
            return f'<p class="reference"><em>[Reference: {label}]</em></p>'
        if elementType == "extracted_text":
            content = element.get("content", "")
            if not content:
                # Recognized element without text: nothing to emit
                return ""
            source = element.get("source", "")
            sourceHtml = f' <small><em>(Source: {html.escape(str(source))})</em></small>' if source else ''
            return f'<p>{html.escape(str(content))}{sourceHtml}</p>'
        return None

    def plainText(element):
        """Extract paragraph text from an element (nested content, plain content or str)."""
        if isinstance(element, str):
            return element
        if isinstance(element, dict):
            content = element.get("content", {})
            if isinstance(content, dict):
                return content.get("text", "")
            if isinstance(content, str):
                return content
        return ""

    try:
        sectionType = self._getSectionType(section)
        sectionData = self._getSectionData(section)

        rendererName = singleElementRenderers.get(sectionType) if isinstance(sectionType, str) else None
        if rendererName is not None:
            if isinstance(sectionData, list) and sectionData:
                element = sectionData[0] if isinstance(sectionData[0], dict) else {}
                return getattr(self, rendererName)(element, styles)
            return ""

        # Paragraph sections and unknown section types share one code path
        if isinstance(sectionData, list):
            htmlParts = []
            for element in sectionData:
                specialHtml = renderSpecialElement(element)
                if specialHtml is not None:
                    if specialHtml:
                        htmlParts.append(specialHtml)
                    continue
                text = plainText(element)
                if text:
                    htmlParts.append(f'<p>{html.escape(str(text))}</p>')
            if htmlParts:
                return '\n'.join(htmlParts)
        if isinstance(sectionData, dict):
            return self._renderJsonParagraph(sectionData, styles)
        return ""
    except Exception as e:
        self.logger.warning(f"Error rendering section {self._getSectionId(section)}: {str(e)}")
        return f'<div class="error">[Error rendering section: {html.escape(str(e))}]</div>'
def _renderJsonTable(self, tableData: Dict[str, Any], styles: Dict[str, Any]) -> str:
    """Render a table element to an HTML <table>.

    Expects the nested structure element.content.{headers, rows}. Header
    and cell values are HTML-escaped, ``None`` becomes an empty cell and a
    row that is not a list/tuple is rendered as a single cell.

    Args:
        tableData: The table element.
        styles: Style set (not used here; table styling comes from the CSS).

    Returns:
        The table markup, or an empty string if headers or rows are missing
        or rendering fails.
    """
    import html

    def cellText(value):
        """Escaped text of one header/cell value."""
        return "" if value is None else html.escape(str(value))

    try:
        content = tableData.get("content", {})
        if not isinstance(content, dict):
            return ""
        headers = content.get("headers", [])
        rows = content.get("rows", [])
        if not headers or not rows:
            return ""

        htmlParts = ['<table>', '<thead><tr>']
        htmlParts.extend(f'<th>{cellText(header)}</th>' for header in headers)
        htmlParts.append('</tr></thead>')

        htmlParts.append('<tbody>')
        for row in rows:
            # A scalar row would otherwise be iterated character by character
            cells = row if isinstance(row, (list, tuple)) else [row]
            htmlParts.append('<tr>')
            htmlParts.extend(f'<td>{cellText(cellData)}</td>' for cellData in cells)
            htmlParts.append('</tr>')
        htmlParts.append('</tbody>')
        htmlParts.append('</table>')
        return '\n'.join(htmlParts)
    except Exception as e:
        self.logger.warning(f"Error rendering table: {str(e)}")
        return ""
def _renderJsonBulletList(self, listData: Dict[str, Any], styles: Dict[str, Any]) -> str:
    """Render a bullet list element to an HTML <ul>.

    Expects the nested structure element.content.{items}. Items may be
    strings, numbers or dicts with a "text" key; their text is HTML-escaped.
    Items of any other shape are skipped.

    Args:
        listData: The list element.
        styles: Style set (not used here; list styling comes from the CSS).

    Returns:
        The list markup, or an empty string if there are no items or
        rendering fails.
    """
    import html

    try:
        content = listData.get("content", {})
        if not isinstance(content, dict):
            return ""
        items = content.get("items", [])
        if not items:
            return ""

        htmlParts = ['<ul>']
        for item in items:
            if isinstance(item, dict):
                if "text" not in item:
                    continue
                itemText = item["text"]
            elif isinstance(item, (str, int, float)):
                itemText = item
            else:
                continue
            htmlParts.append(f'<li>{html.escape(str(itemText))}</li>')
        htmlParts.append('</ul>')
        return '\n'.join(htmlParts)
    except Exception as e:
        self.logger.warning(f"Error rendering bullet list: {str(e)}")
        return ""
def _renderJsonHeading(self, headingData: Dict[str, Any], styles: Dict[str, Any]) -> str:
    """Render a heading element to an HTML <h1>..<h6> tag.

    Expects the nested structure element.content.{text, level}. The level
    is coerced to an integer and clamped to 1..6, so values such as "2" or
    2.0 produce a valid tag; an unusable level falls back to 1. The text
    is HTML-escaped.

    Args:
        headingData: The heading element.
        styles: Style set (not used here; heading styling comes from the CSS).

    Returns:
        The heading markup, or an empty string if there is no text or
        rendering fails.
    """
    import html

    try:
        content = headingData.get("content", {})
        if not isinstance(content, dict):
            return ""
        text = content.get("text", "")
        if not text:
            return ""
        try:
            level = int(content.get("level", 1))
        except (TypeError, ValueError, OverflowError):
            level = 1
        level = max(1, min(6, level))
        return f'<h{level}>{html.escape(str(text))}</h{level}>'
    except Exception as e:
        self.logger.warning(f"Error rendering heading: {str(e)}")
        return ""
def _renderJsonParagraph(self, paragraphData: Dict[str, Any], styles: Dict[str, Any]) -> str:
    """Render paragraph data to one or more HTML <p> tags.

    Accepts a list of elements (one <p> per element with text), a single
    element dict, or a plain string. Element text is read from
    element.content.text, or from element.content when that is a string.
    All text is HTML-escaped; empty text produces no output.

    Args:
        paragraphData: List of elements, a single element dict or a string.
        styles: Style set (not used here; paragraph styling comes from the CSS).

    Returns:
        The paragraph markup, or an empty string if there is no text or
        rendering fails.
    """
    import html

    def elementText(element):
        """Text of one element: plain string, nested content.text, or string content."""
        if isinstance(element, str):
            return element
        if isinstance(element, dict):
            content = element.get("content", {})
            if isinstance(content, dict):
                return content.get("text", "")
            if isinstance(content, str):
                return content
        return ""

    try:
        if isinstance(paragraphData, list):
            elements = paragraphData
        elif isinstance(paragraphData, (str, dict)):
            elements = [paragraphData]
        else:
            return ""

        texts = [text for text in map(elementText, elements) if text]
        return '\n'.join(f'<p>{html.escape(str(text))}</p>' for text in texts)
    except Exception as e:
        self.logger.warning(f"Error rendering paragraph: {str(e)}")
        return ""
def _renderJsonCodeBlock(self, codeData: Dict[str, Any], styles: Dict[str, Any]) -> str:
    """Render a code block element to <pre><code>.

    Expects the nested structure element.content.{code, language}. The code
    is HTML-escaped so characters such as "<" and "&" are displayed
    literally instead of being parsed as markup; the language is escaped
    for use inside the class attribute.

    Args:
        codeData: The code block element.
        styles: Style set (not used here; code styling comes from the CSS).

    Returns:
        The code block markup, or an empty string if there is no code or
        rendering fails.
    """
    import html

    try:
        content = codeData.get("content", {})
        if not isinstance(content, dict):
            return ""
        code = content.get("code", "")
        if not code:
            return ""

        # Quotes are harmless in element text, so only &, < and > are escaped
        escapedCode = html.escape(str(code), quote=False)
        language = content.get("language", "")
        if language:
            languageAttr = html.escape(str(language), quote=True)
            return f'<pre><code class="language-{languageAttr}">{escapedCode}</code></pre>'
        return f'<pre><code>{escapedCode}</code></pre>'
    except Exception as e:
        self.logger.warning(f"Error rendering code block: {str(e)}")
        return ""
def _renderJsonImage(self, imageData: Dict[str, Any], styles: Dict[str, Any]) -> str:
    """Render an image element as an <img> tag with an embedded data URI.

    Expects the nested structure element.content.{base64Data, altText,
    caption}. The tag is preceded by an IMAGE_MARKER comment; the data URI
    serves as a placeholder that _replaceImageDataUris can later swap for a
    file path. Alt text and caption are HTML-escaped.

    Args:
        imageData: The image element.
        styles: Style set (not used here; sizing is set inline on the tag).

    Returns:
        The image markup (wrapped in <figure> when a caption exists), an
        empty string if there is no image data, or an error <div> if
        rendering raised.
    """
    # Imported outside the try block so the error handler can always escape its message
    import html

    try:
        content = imageData.get("content", {})
        if not isinstance(content, dict):
            return ""
        base64Data = content.get("base64Data", "")
        if not base64Data:
            return ""

        altTextEscaped = html.escape(str(content.get("altText", "Image")))
        caption = content.get("caption", "")
        captionEscaped = html.escape(str(caption)) if caption else ""

        # Marker comment that precedes the tag; stripped again by _replaceImageDataUris
        imageMarker = f"<!--IMAGE_MARKER:{len(base64Data)}:{altTextEscaped[:50]}-->"
        # Size limits keep the image within the page width and a readable height
        imgTag = f'<img src="data:image/png;base64,{base64Data}" alt="{altTextEscaped}" style="max-width: 100%; max-height: 600px; width: auto; height: auto;">'
        if captionEscaped:
            return f'{imageMarker}<figure>{imgTag}<figcaption>{captionEscaped}</figcaption></figure>'
        return f'{imageMarker}{imgTag}'
    except Exception as e:
        self.logger.error(f"Error embedding image in HTML: {str(e)}")
        # imageData may be the very thing that is malformed, so read it defensively
        altText = "Image"
        if isinstance(imageData, dict):
            content = imageData.get("content")
            nestedAlt = content.get("altText") if isinstance(content, dict) else None
            altText = nestedAlt or imageData.get("altText") or "Image"
        errorMsg = html.escape(f"[Error: Could not embed image '{altText}'. {str(e)}]")
        return f'<div class="error" style="color: red; padding: 10px; border: 1px solid red;">{errorMsg}</div>'
def _extractImages(self, jsonContent: Dict[str, Any]) -> List[Dict[str, Any]]:
    """Collect all images from sections with content_type "image".

    The base64 payload is looked up in element.content.base64Data, then in
    element.base64Data, then in a ``data:image/...;base64,`` URL on the
    element or its content. The MIME type comes from the element/content,
    else from the data URI, else from the payload's magic number, else
    defaults to PNG. Filenames are derived from the section id; when a
    name is already taken a numeric suffix is added so several images of
    one section do not overwrite each other. Malformed documents, sections
    and elements are skipped instead of aborting the whole extraction.

    Args:
        jsonContent: Content in the standardized schema
            {metadata: {...}, documents: [{sections: [...]}]}.

    Returns:
        List of dicts with base64Data, altText, caption, sectionId,
        filename and mimeType.
    """
    import re

    dataUriPattern = re.compile(r'data:(image/[^;]+);base64,(.+)')
    extensionByMime = {
        "image/jpeg": "jpg",
        "image/jpg": "jpg",
        "image/png": "png",
        "image/gif": "gif",
        "image/webp": "webp",
    }
    # Base64 encodings of the leading magic bytes (JPEG, PNG, GIF8, RIFF)
    mimeByBase64Prefix = (
        ("/9j/", "image/jpeg"),
        ("iVBORw0KGgo", "image/png"),
        ("R0lGOD", "image/gif"),
        ("UklGR", "image/webp"),
    )

    images = []
    usedFilenames = set()
    try:
        documents = jsonContent.get("documents", [])
        if not documents or not isinstance(documents, list):
            return images

        for doc in documents:
            if not isinstance(doc, dict):
                continue
            for section in doc.get("sections", []) or []:
                if not isinstance(section, dict) or section.get("content_type") != "image":
                    continue
                sectionId = section.get("id", "unknown")

                for element in section.get("elements", []) or []:
                    if not isinstance(element, dict):
                        continue
                    content = element.get("content", {})
                    contentDict = content if isinstance(content, dict) else {}

                    # Payload: nested content first, element-level field as fallback
                    base64Data = contentDict.get("base64Data", "") or element.get("base64Data", "")
                    uriMimeType = ""
                    if not base64Data:
                        # Last resort: data URI in a url field
                        url = element.get("url", "") or contentDict.get("url", "")
                        uriMatch = dataUriPattern.match(url) if isinstance(url, str) else None
                        if uriMatch:
                            uriMimeType, base64Data = uriMatch.group(1), uriMatch.group(2)
                    if not base64Data or not isinstance(base64Data, str):
                        continue

                    mimeType = element.get("mimeType", "") or contentDict.get("mimeType", "")
                    if not mimeType or mimeType == "unknown":
                        sniffed = next((mime for prefix, mime in mimeByBase64Prefix if base64Data.startswith(prefix)), "")
                        mimeType = uriMimeType or sniffed or "image/png"
                    extension = extensionByMime.get(mimeType, "png")

                    # Filename from the section id, reduced to safe characters
                    filename = "".join(c if c.isalnum() or c in "._-" else "_" for c in f"{sectionId}.{extension}")
                    if filename in usedFilenames:
                        stem = filename[:-(len(extension) + 1)]
                        counter = 2
                        while f"{stem}_{counter}.{extension}" in usedFilenames:
                            counter += 1
                        filename = f"{stem}_{counter}.{extension}"
                    usedFilenames.add(filename)

                    images.append({
                        "base64Data": base64Data,
                        # Element-level alt text wins, nested content.altText is the fallback
                        "altText": element.get("altText") or contentDict.get("altText") or "Image",
                        # NOTE(review): caption is read from the element level only, while
                        # _renderJsonImage reads content.caption - confirm which is canonical
                        "caption": element.get("caption"),
                        "sectionId": sectionId,
                        "filename": filename,
                        "mimeType": mimeType
                    })
                    self.logger.debug(f"Extracted image from section {sectionId}: {filename}")

        self.logger.info(f"Extracted {len(images)} image(s) from JSON structure")
        return images
    except Exception as e:
        self.logger.warning(f"Error extracting images: {str(e)}")
        # Keep whatever was collected before the failure
        return images
def _replaceImageDataUris(self, htmlContent: str, images: List[Dict[str, Any]]) -> str:
    """Replace embedded base64 data URIs with relative image file paths.

    Each <img> tag carrying a data URI is matched against the extracted
    images: an exact payload match is preferred, a comparison of the first
    100 base64 characters is only the fallback. This keeps images that
    share a common prefix from all resolving to the first image. The
    alt text already present in the tag is kept unless the image carries
    its own non-default alt text, and a tag that is already followed by a
    <figcaption> is not wrapped in a second <figure>. Leftover
    IMAGE_MARKER comments are removed.

    Args:
        htmlContent: HTML content with data URIs.
        images: List of image data dictionaries (base64Data, filename, ...).

    Returns:
        HTML content with relative file paths; the unchanged input if the
        replacement fails.
    """
    import html
    import re

    imgTagPattern = r'<img\s+src="data:image/[^"]+"[^>]*>'
    prefixLength = 100

    def findImage(base64Data):
        """Return the image dict belonging to the payload, or None."""
        candidates = [img for img in images if isinstance(img, dict) and img.get("base64Data")]
        for img in candidates:
            if img["base64Data"] == base64Data:
                return img
        # Fallback for payloads that differ in length (e.g. padding): compare prefixes
        for img in candidates:
            stored = img["base64Data"]
            if stored.startswith(base64Data[:prefixLength]) or base64Data.startswith(stored[:prefixLength]):
                return img
        return None

    def replaceImgTag(match):
        imgTag = match.group(0)
        base64Match = re.search(r'data:image/[^;]+;base64,([A-Za-z0-9+/=]+)', imgTag)
        if not base64Match:
            return imgTag
        matchingImage = findImage(base64Match.group(1))
        if matchingImage is None:
            # Keep the original tag when no extracted image corresponds to it
            return imgTag

        filename = matchingImage.get("filename", f"image_{images.index(matchingImage) + 1}.png")

        altMatch = re.search(r'alt="([^"]*)"', imgTag)
        existingAlt = altMatch.group(1) if altMatch else ""
        storedAlt = matchingImage.get("altText")
        if storedAlt and str(storedAlt) != "Image":
            altText = html.escape(str(storedAlt))
        elif existingAlt:
            # Taken from the tag, hence already escaped
            altText = existingAlt
        else:
            altText = "Image"

        newTag = f'<img src="{html.escape(str(filename))}" alt="{altText}">'
        caption = html.escape(str(matchingImage.get("caption", ""))) if matchingImage.get("caption") else ""
        # A following <figcaption> means the tag already sits inside a <figure>
        alreadyCaptioned = match.string.startswith('<figcaption>', match.end())
        if caption and not alreadyCaptioned:
            return f'<figure>{newTag}<figcaption>{caption}</figcaption></figure>'
        return newTag

    try:
        updatedHtml = re.sub(imgTagPattern, replaceImgTag, htmlContent)
        # Remove the IMAGE_MARKER comments that remain in the markup
        updatedHtml = re.sub(r'<!--IMAGE_MARKER:[^>]+-->', '', updatedHtml)
        return updatedHtml
    except Exception as e:
        self.logger.warning(f"Error replacing image data URIs: {str(e)}")
        return htmlContent
def getRenderedImages(self) -> List[Dict[str, Any]]:
    """
    Return the images remembered by the most recent render() call.

    Yields an empty list when nothing has been rendered yet; otherwise the
    stored list of image dicts (base64Data, altText, caption, filename).
    """
    return getattr(self, '_renderedImages', [])