gateway/modules/serviceCenter/services/serviceGeneration/renderers/rendererHtml.py
2026-03-06 14:03:18 +01:00

841 lines
40 KiB
Python

# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
"""
HTML renderer for report generation.
"""
from .documentRendererBaseTemplate import BaseRenderer
from modules.datamodels.datamodelDocument import RenderedDocument
from typing import Dict, Any, List, Optional
class RendererHtml(BaseRenderer):
"""Renders content to HTML format with format-specific extraction."""
@classmethod
def getSupportedFormats(cls) -> List[str]:
"""Return supported HTML formats."""
return ['html', 'htm']
@classmethod
def getFormatAliases(cls) -> List[str]:
"""Return format aliases."""
return ['web', 'webpage']
@classmethod
def getPriority(cls) -> int:
"""Return priority for HTML renderer."""
return 100
@classmethod
def getOutputStyle(cls, formatName: Optional[str] = None) -> str:
"""Return output style classification: HTML web pages are rendered documents."""
return 'document'
@classmethod
def getAcceptedSectionTypes(cls, formatName: Optional[str] = None) -> List[str]:
"""
Return list of section content types that HTML renderer accepts.
HTML renderer accepts all section types (HTML pages can contain all content types including images).
"""
from modules.datamodels.datamodelJson import supportedSectionTypes
return list(supportedSectionTypes)
async def render(self, extractedContent: Dict[str, Any], title: str, userPrompt: str = None, aiService=None) -> List[RenderedDocument]:
"""
Render HTML document with images as separate files.
Returns list of documents: [HTML document, image1, image2, ...]
"""
import base64
# Extract images first
images = self._extractImages(extractedContent)
# Store images in instance for later retrieval
self._renderedImages = images
# Generate HTML using AI-analyzed styling
htmlContent = await self._generateHtmlFromJson(extractedContent, title, userPrompt, aiService)
# Replace base64 data URIs with relative file paths if images exist
if images:
htmlContent = self._replaceImageDataUris(htmlContent, images)
# Determine HTML filename from document or title
documents = extractedContent.get("documents", [])
if documents and isinstance(documents[0], dict):
htmlFilename = documents[0].get("filename")
if not htmlFilename:
htmlFilename = self._determineFilename(title, "text/html")
else:
htmlFilename = self._determineFilename(title, "text/html")
# Extract metadata for document type and other info
metadata = extractedContent.get("metadata", {}) if extractedContent else {}
documentType = metadata.get("documentType") if isinstance(metadata, dict) else None
# Start with HTML document
resultDocuments = [
RenderedDocument(
documentData=htmlContent.encode('utf-8'),
mimeType="text/html",
filename=htmlFilename,
documentType=documentType,
metadata=metadata if isinstance(metadata, dict) else None
)
]
# Add images as separate documents
for img in images:
base64Data = img.get("base64Data", "")
filename = img.get("filename", f"image_{len(resultDocuments)}.png")
mimeType = img.get("mimeType", "image/png")
if base64Data:
try:
# Decode base64 to bytes
imageBytes = base64.b64decode(base64Data)
resultDocuments.append(
RenderedDocument(
documentData=imageBytes,
mimeType=mimeType,
filename=filename
)
)
self.logger.debug(f"Added image file: {filename} ({len(imageBytes)} bytes)")
except Exception as e:
self.logger.warning(f"Error creating image file {filename}: {str(e)}")
return resultDocuments
async def _generateHtmlFromJson(self, jsonContent: Dict[str, Any], title: str, userPrompt: str = None, aiService=None) -> str:
"""Generate HTML content from structured JSON document using AI-generated styling."""
try:
# Get style set: use styles from metadata if available, otherwise enhance with AI
styles = await self._getStyleSet(jsonContent, userPrompt, aiService)
# Validate JSON structure
if not self._validateJsonStructure(jsonContent):
raise ValueError("JSON content must follow standardized schema: {metadata: {...}, documents: [{sections: [...]}]}")
# Extract sections and metadata from standardized schema
sections = self._extractSections(jsonContent)
metadata = self._extractMetadata(jsonContent)
# Use provided title (which comes from documents[].title) as primary source
# Fallback to metadata.title only if title parameter is empty
documentTitle = title if title else metadata.get("title", "Generated Document")
# Build HTML document
htmlParts = []
# HTML document structure
htmlParts.append('<!DOCTYPE html>')
htmlParts.append('<html lang="en">')
htmlParts.append('<head>')
htmlParts.append('<meta charset="UTF-8">')
htmlParts.append('<meta name="viewport" content="width=device-width, initial-scale=1.0">')
htmlParts.append(f'<title>{documentTitle}</title>')
htmlParts.append('<style>')
htmlParts.append(self._generateCssStyles(styles))
htmlParts.append('</style>')
htmlParts.append('</head>')
htmlParts.append('<body>')
# Document header
htmlParts.append(f'<header><h1 class="document-title">{documentTitle}</h1></header>')
# Main content
htmlParts.append('<main>')
# Process each section
for section in sections:
sectionHtml = self._renderJsonSection(section, styles)
if sectionHtml:
htmlParts.append(sectionHtml)
htmlParts.append('</main>')
# Footer
htmlParts.append('<footer>')
htmlParts.append(f'<p class="generated-info">Generated: {self._formatTimestamp()}</p>')
htmlParts.append('</footer>')
htmlParts.append('</body>')
htmlParts.append('</html>')
return '\n'.join(htmlParts)
except Exception as e:
self.logger.error(f"Error generating HTML from JSON: {str(e)}")
raise Exception(f"HTML generation failed: {str(e)}")
async def _getStyleSet(self, extractedContent: Dict[str, Any] = None, userPrompt: str = None, aiService=None, templateName: str = None) -> Dict[str, Any]:
"""Get style set - use styles from document generation metadata if available,
otherwise enhance default styles with AI if userPrompt provided.
WICHTIG: In a dynamic scalable AI system, styling should come from document generation,
not be generated separately by renderers. Only fall back to AI if styles not provided.
Args:
extractedContent: Document content with metadata (may contain styles)
userPrompt: User's prompt (AI will detect style instructions in any language)
aiService: AI service (used only if styles not in metadata and userPrompt provided)
templateName: Name of template style set (None = default)
Returns:
Dict with style definitions for all document styles
"""
# Get default style set
defaultStyleSet = self._getDefaultStyleSet()
# FIRST: Check if styles are provided in document generation metadata (preferred approach)
if extractedContent:
metadata = extractedContent.get("metadata", {})
if isinstance(metadata, dict):
styles = metadata.get("styles")
if styles and isinstance(styles, dict):
self.logger.debug("Using styles from document generation metadata")
return self._validateStylesContrast(styles)
# FALLBACK: Enhance with AI if userPrompt provided (only if styles not in metadata)
if userPrompt and aiService:
self.logger.info(f"Styles not in metadata, enhancing with AI based on user prompt...")
enhancedStyleSet = await self._enhanceStylesWithAI(userPrompt, defaultStyleSet, aiService)
return self._validateStylesContrast(enhancedStyleSet)
else:
# Use default styles only
return defaultStyleSet
async def _enhanceStylesWithAI(self, userPrompt: str, defaultStyleSet: Dict[str, Any], aiService) -> Dict[str, Any]:
"""Enhance default styles with AI based on user prompt."""
try:
style_template = self._createAiStyleTemplate("html", userPrompt, defaultStyleSet)
enhanced_styles = await self._getAiStyles(aiService, style_template, defaultStyleSet)
return enhanced_styles
except Exception as e:
self.logger.warning(f"AI style enhancement failed: {str(e)}, using default styles")
return defaultStyleSet
def _validateStylesContrast(self, styles: Dict[str, Any]) -> Dict[str, Any]:
"""Validate and fix contrast issues in AI-generated styles."""
try:
# Fix table header contrast
if "table_header" in styles:
header = styles["table_header"]
bgColor = header.get("background", "#FFFFFF")
textColor = header.get("color", "#000000")
# If both are white or both are dark, fix it
if bgColor.upper() == "#FFFFFF" and textColor.upper() == "#FFFFFF":
header["background"] = "#4F4F4F"
header["color"] = "#FFFFFF"
elif bgColor.upper() == "#000000" and textColor.upper() == "#000000":
header["background"] = "#4F4F4F"
header["color"] = "#FFFFFF"
# Fix table cell contrast
if "table_cell" in styles:
cell = styles["table_cell"]
bgColor = cell.get("background", "#FFFFFF")
textColor = cell.get("color", "#000000")
# If both are white or both are dark, fix it
if bgColor.upper() == "#FFFFFF" and textColor.upper() == "#FFFFFF":
cell["background"] = "#FFFFFF"
cell["color"] = "#2F2F2F"
elif bgColor.upper() == "#000000" and textColor.upper() == "#000000":
cell["background"] = "#FFFFFF"
cell["color"] = "#2F2F2F"
return styles
except Exception as e:
self.logger.warning(f"Style validation failed: {str(e)}")
return self._getDefaultStyleSet()
def _getDefaultStyleSet(self) -> Dict[str, Any]:
"""Default HTML style set - used when no style instructions present."""
return {
"title": {"font_size": "2.5em", "color": "#1F4E79", "font_weight": "bold", "text_align": "center", "margin": "0 0 1em 0"},
"heading1": {"font_size": "2em", "color": "#2F2F2F", "font_weight": "bold", "text_align": "left", "margin": "1.5em 0 0.5em 0"},
"heading2": {"font_size": "1.5em", "color": "#4F4F4F", "font_weight": "bold", "text_align": "left", "margin": "1em 0 0.5em 0"},
"paragraph": {"font_size": "1em", "color": "#2F2F2F", "font_weight": "normal", "text_align": "left", "margin": "0 0 1em 0", "line_height": "1.6"},
"table": {"border": "1px solid #ddd", "border_collapse": "collapse", "width": "100%", "margin": "1em 0"},
"table_header": {"background": "#4F4F4F", "color": "#FFFFFF", "font_weight": "bold", "text_align": "center", "padding": "12px"},
"table_cell": {"background": "#FFFFFF", "color": "#2F2F2F", "font_weight": "normal", "text_align": "left", "padding": "8px", "border": "1px solid #ddd"},
"bullet_list": {"font_size": "1em", "color": "#2F2F2F", "margin": "0 0 1em 0", "padding_left": "20px"},
"code_block": {"font_family": "Courier New, monospace", "font_size": "0.9em", "color": "#2F2F2F", "background": "#F5F5F5", "padding": "1em", "border": "1px solid #ddd", "border_radius": "4px", "margin": "1em 0"},
"image": {"max_width": "100%", "height": "auto", "margin": "1em 0", "border_radius": "4px"},
"body": {"font_family": "Arial, sans-serif", "background": "#FFFFFF", "color": "#2F2F2F", "margin": "0", "padding": "20px"}
}
def _generateCssStyles(self, styles: Dict[str, Any]) -> str:
"""Generate CSS from style definitions."""
css_parts = []
# Body styles
body_style = styles.get("body", {})
css_parts.append("body {")
for property_name, value in body_style.items():
css_property = property_name.replace("_", "-")
css_parts.append(f" {css_property}: {value};")
css_parts.append("}")
# Document title
title_style = styles.get("title", {})
css_parts.append(".document-title {")
for property_name, value in title_style.items():
css_property = property_name.replace("_", "-")
css_parts.append(f" {css_property}: {value};")
css_parts.append("}")
# Headings
for heading_level in ["heading1", "heading2"]:
heading_style = styles.get(heading_level, {})
css_class = f"h{heading_level[-1]}"
css_parts.append(f"{css_class} {{")
for property_name, value in heading_style.items():
css_property = property_name.replace("_", "-")
css_parts.append(f" {css_property}: {value};")
css_parts.append("}")
# Paragraphs
paragraph_style = styles.get("paragraph", {})
css_parts.append("p {")
for property_name, value in paragraph_style.items():
css_property = property_name.replace("_", "-")
css_parts.append(f" {css_property}: {value};")
css_parts.append("}")
# Tables
table_style = styles.get("table", {})
css_parts.append("table {")
for property_name, value in table_style.items():
css_property = property_name.replace("_", "-")
css_parts.append(f" {css_property}: {value};")
css_parts.append("}")
# Table headers
table_header_style = styles.get("table_header", {})
css_parts.append("th {")
for property_name, value in table_header_style.items():
css_property = property_name.replace("_", "-")
css_parts.append(f" {css_property}: {value};")
css_parts.append("}")
# Table cells
table_cell_style = styles.get("table_cell", {})
css_parts.append("td {")
for property_name, value in table_cell_style.items():
css_property = property_name.replace("_", "-")
css_parts.append(f" {css_property}: {value};")
css_parts.append("}")
# Lists
bullet_list_style = styles.get("bullet_list", {})
css_parts.append("ul {")
for property_name, value in bullet_list_style.items():
css_property = property_name.replace("_", "-")
css_parts.append(f" {css_property}: {value};")
css_parts.append("}")
# Code blocks
code_block_style = styles.get("code_block", {})
css_parts.append("pre {")
for property_name, value in code_block_style.items():
css_property = property_name.replace("_", "-")
css_parts.append(f" {css_property}: {value};")
css_parts.append("}")
# Images
image_style = styles.get("image", {})
css_parts.append("img {")
for property_name, value in image_style.items():
css_property = property_name.replace("_", "-")
css_parts.append(f" {css_property}: {value};")
css_parts.append("}")
# Generated info
css_parts.append(".generated-info {")
css_parts.append(" font-size: 0.9em;")
css_parts.append(" color: #666;")
css_parts.append(" text-align: center;")
css_parts.append(" margin-top: 2em;")
css_parts.append(" padding-top: 1em;")
css_parts.append(" border-top: 1px solid #ddd;")
css_parts.append("}")
return '\n'.join(css_parts)
def _renderJsonSection(self, section: Dict[str, Any], styles: Dict[str, Any]) -> str:
"""Render a single JSON section to HTML using AI-generated styles.
Supports three content formats: reference, object (base64), extracted_text.
WICHTIG: Respektiert sectionType (content_type) für korrekte Rendering-Logik.
"""
try:
sectionType = self._getSectionType(section)
sectionData = self._getSectionData(section)
# WICHTIG: Respektiere sectionType (content_type) ZUERST, dann process elements entsprechend
# Process elements according to section's content_type, not just element types
if sectionType == "table":
# Work directly with elements like other renderers
if isinstance(sectionData, list) and sectionData:
element = sectionData[0] if isinstance(sectionData[0], dict) else {}
return self._renderJsonTable(element, styles)
return ""
elif sectionType == "bullet_list":
# Work directly with elements like other renderers
if isinstance(sectionData, list) and sectionData:
element = sectionData[0] if isinstance(sectionData[0], dict) else {}
return self._renderJsonBulletList(element, styles)
return ""
elif sectionType == "heading":
# Work directly with elements like other renderers
if isinstance(sectionData, list) and sectionData:
element = sectionData[0] if isinstance(sectionData[0], dict) else {}
return self._renderJsonHeading(element, styles)
return ""
elif sectionType == "paragraph":
# Process paragraph elements, including extracted_text
if isinstance(sectionData, list):
htmlParts = []
for element in sectionData:
element_type = element.get("type", "") if isinstance(element, dict) else ""
if element_type == "reference":
doc_ref = element.get("documentReference", "")
label = element.get("label", "Reference")
htmlParts.append(f'<p class="reference"><em>[Reference: {label}]</em></p>')
elif element_type == "extracted_text":
content = element.get("content", "")
source = element.get("source", "")
if content:
source_text = f' <small><em>(Source: {source})</em></small>' if source else ''
htmlParts.append(f'<p>{content}{source_text}</p>')
elif isinstance(element, dict):
# Regular paragraph element - extract from nested content structure (standard JSON format)
content = element.get("content", {})
if isinstance(content, dict):
text = content.get("text", "")
elif isinstance(content, str):
text = content
else:
text = ""
if text:
htmlParts.append(f'<p>{text}</p>')
elif isinstance(element, str):
htmlParts.append(f'<p>{element}</p>')
if htmlParts:
return '\n'.join(htmlParts)
# If sectionData is not a list, treat it as a dict
if isinstance(sectionData, dict):
return self._renderJsonParagraph(sectionData, styles)
return ""
elif sectionType == "code_block":
# Work directly with elements like other renderers
if isinstance(sectionData, list) and sectionData:
element = sectionData[0] if isinstance(sectionData[0], dict) else {}
return self._renderJsonCodeBlock(element, styles)
return ""
elif sectionType == "image":
# Work directly with elements like other renderers
if isinstance(sectionData, list) and sectionData:
element = sectionData[0] if isinstance(sectionData[0], dict) else {}
return self._renderJsonImage(element, styles)
return ""
else:
# Fallback: Check for special element types first
if isinstance(sectionData, list):
htmlParts = []
for element in sectionData:
element_type = element.get("type", "") if isinstance(element, dict) else ""
if element_type == "reference":
doc_ref = element.get("documentReference", "")
label = element.get("label", "Reference")
htmlParts.append(f'<p class="reference"><em>[Reference: {label}]</em></p>')
elif element_type == "extracted_text":
content = element.get("content", "")
source = element.get("source", "")
if content:
source_text = f' <small><em>(Source: {source})</em></small>' if source else ''
htmlParts.append(f'<p>{content}{source_text}</p>')
if htmlParts:
return '\n'.join(htmlParts)
# Fallback to paragraph for unknown types
if isinstance(sectionData, dict):
return self._renderJsonParagraph(sectionData, styles)
return ""
except Exception as e:
self.logger.warning(f"Error rendering section {self._getSectionId(section)}: {str(e)}")
return f'<div class="error">[Error rendering section: {str(e)}]</div>'
def _renderJsonTable(self, tableData: Dict[str, Any], styles: Dict[str, Any]) -> str:
"""Render a JSON table to HTML using AI-generated styles."""
try:
# Extract from nested content structure: element.content.{headers, rows}
content = tableData.get("content", {})
if not isinstance(content, dict):
return ""
headers = content.get("headers", [])
rows = content.get("rows", [])
if not headers or not rows:
return ""
htmlParts = ['<table>']
# Table header
htmlParts.append('<thead><tr>')
for header in headers:
htmlParts.append(f'<th>{header}</th>')
htmlParts.append('</tr></thead>')
# Table body
htmlParts.append('<tbody>')
for row in rows:
htmlParts.append('<tr>')
for cellData in row:
htmlParts.append(f'<td>{cellData}</td>')
htmlParts.append('</tr>')
htmlParts.append('</tbody>')
htmlParts.append('</table>')
return '\n'.join(htmlParts)
except Exception as e:
self.logger.warning(f"Error rendering table: {str(e)}")
return ""
def _renderJsonBulletList(self, listData: Dict[str, Any], styles: Dict[str, Any]) -> str:
"""Render a JSON bullet list to HTML using AI-generated styles."""
try:
# Extract from nested content structure: element.content.{items}
content = listData.get("content", {})
if not isinstance(content, dict):
return ""
items = content.get("items", [])
if not items:
return ""
htmlParts = ['<ul>']
for item in items:
if isinstance(item, str):
htmlParts.append(f'<li>{item}</li>')
elif isinstance(item, dict) and "text" in item:
htmlParts.append(f'<li>{item["text"]}</li>')
htmlParts.append('</ul>')
return '\n'.join(htmlParts)
except Exception as e:
self.logger.warning(f"Error rendering bullet list: {str(e)}")
return ""
def _renderJsonHeading(self, headingData: Dict[str, Any], styles: Dict[str, Any]) -> str:
"""Render a JSON heading to HTML using AI-generated styles."""
try:
# Extract from nested content structure: element.content.{text, level}
content = headingData.get("content", {})
if not isinstance(content, dict):
return ""
text = content.get("text", "")
level = content.get("level", 1)
if text:
level = max(1, min(6, level))
return f'<h{level}>{text}</h{level}>'
return ""
except Exception as e:
self.logger.warning(f"Error rendering heading: {str(e)}")
return ""
def _renderJsonParagraph(self, paragraphData: Dict[str, Any], styles: Dict[str, Any]) -> str:
"""Render a JSON paragraph to HTML using AI-generated styles."""
try:
# Normalize inputs - paragraphData is typically a list of elements from _getSectionData
if isinstance(paragraphData, list):
# Extract text from all paragraph elements (expects nested content structure)
texts = []
for el in paragraphData:
if isinstance(el, dict):
content = el.get("content", {})
if isinstance(content, dict):
text = content.get("text", "")
elif isinstance(content, str):
text = content
else:
text = ""
if text:
texts.append(text)
elif isinstance(el, str):
texts.append(el)
if texts:
# Join multiple paragraphs with <p> tags
return '\n'.join(f'<p>{text}</p>' for text in texts)
return ""
elif isinstance(paragraphData, str):
return f'<p>{paragraphData}</p>'
elif isinstance(paragraphData, dict):
# Handle nested content structure: element.content vs element.text
# Extract from nested content structure
content = paragraphData.get("content", {})
if isinstance(content, dict):
text = content.get("text", "")
elif isinstance(content, str):
text = content
else:
text = ""
if text:
return f'<p>{text}</p>'
return ""
else:
return ""
except Exception as e:
self.logger.warning(f"Error rendering paragraph: {str(e)}")
return ""
def _renderJsonCodeBlock(self, codeData: Dict[str, Any], styles: Dict[str, Any]) -> str:
"""Render a JSON code block to HTML using AI-generated styles."""
try:
# Extract from nested content structure: element.content.{code, language}
content = codeData.get("content", {})
if not isinstance(content, dict):
return ""
code = content.get("code", "")
language = content.get("language", "")
if code:
if language:
return f'<pre><code class="language-{language}">{code}</code></pre>'
else:
return f'<pre><code>{code}</code></pre>'
return ""
except Exception as e:
self.logger.warning(f"Error rendering code block: {str(e)}")
return ""
def _renderJsonImage(self, imageData: Dict[str, Any], styles: Dict[str, Any]) -> str:
"""Render a JSON image to HTML with placeholder for later replacement. Expects nested content structure."""
try:
import html
# Extract from nested content structure (standard JSON format)
content = imageData.get("content", {})
if not isinstance(content, dict):
return ""
base64Data = content.get("base64Data", "")
altText = content.get("altText", "Image")
caption = content.get("caption", "")
# Escape HTML in altText and caption to prevent injection
altTextEscaped = html.escape(str(altText))
captionEscaped = html.escape(str(caption)) if caption else ""
if base64Data:
# Use data URI as placeholder - will be replaced with file path in _replaceImageDataUris
# Include a marker so we can find and replace it
imageMarker = f"<!--IMAGE_MARKER:{len(base64Data)}:{altTextEscaped[:50]}-->"
# Add max-width and max-height to ensure image fits within page dimensions
# Typical page width is ~800-1200px, height varies but we limit to 600px for readability
imgTag = f'<img src="data:image/png;base64,{base64Data}" alt="{altTextEscaped}" style="max-width: 100%; max-height: 600px; width: auto; height: auto;">'
if captionEscaped:
return f'{imageMarker}<figure>{imgTag}<figcaption>{captionEscaped}</figcaption></figure>'
else:
return f'{imageMarker}{imgTag}'
return ""
except Exception as e:
self.logger.error(f"Error embedding image in HTML: {str(e)}")
altText = imageData.get("altText", "Image")
errorMsg = html.escape(f"[Error: Could not embed image '{altText}'. {str(e)}]")
return f'<div class="error" style="color: red; padding: 10px; border: 1px solid red;">{errorMsg}</div>'
def _extractImages(self, jsonContent: Dict[str, Any]) -> List[Dict[str, Any]]:
"""
Extract all images from JSON structure.
Returns:
List of image data dictionaries with base64Data, altText, caption, sectionId
"""
images = []
try:
# Extract from standardized schema: {metadata: {...}, documents: [{sections: [...]}]}
documents = jsonContent.get("documents", [])
if not documents or not isinstance(documents, list):
return images
for doc in documents:
if not isinstance(doc, dict):
continue
sections = doc.get("sections", [])
for section in sections:
if section.get("content_type") == "image":
elements = section.get("elements", [])
for element in elements:
# Extract from nested content structure
content = element.get("content", {})
base64Data = ""
if isinstance(content, dict):
base64Data = content.get("base64Data", "")
elif isinstance(content, str):
# Content might be base64 string directly (shouldn't happen)
pass
# If base64Data not found in content, try direct element fields (fallback)
if not base64Data:
base64Data = element.get("base64Data", "")
# If base64Data still not found, try extracting from url data URI
if not base64Data:
url = element.get("url", "") or (content.get("url", "") if isinstance(content, dict) else "")
if url and isinstance(url, str) and url.startswith("data:image/"):
# Extract base64 from data URI: data:image/png;base64,<base64>
import re
match = re.match(r'data:image/[^;]+;base64,(.+)', url)
if match:
base64Data = match.group(1)
if base64Data:
sectionId = section.get("id", "unknown")
# Bestimme MIME-Type und Extension
mimeType = element.get("mimeType", "") or (content.get("mimeType", "") if isinstance(content, dict) else "")
if not mimeType or mimeType == "unknown":
# Versuche MIME-Type aus base64 zu erkennen
if base64Data.startswith("/9j/"):
mimeType = "image/jpeg"
elif base64Data.startswith("iVBORw0KGgo"):
mimeType = "image/png"
else:
mimeType = "image/png" # Default
# Bestimme Extension basierend auf MIME-Type
extension = "png"
if mimeType == "image/jpeg" or mimeType == "image/jpg":
extension = "jpg"
elif mimeType == "image/png":
extension = "png"
elif mimeType == "image/gif":
extension = "gif"
elif mimeType == "image/webp":
extension = "webp"
# Generate filename from section ID
filename = f"{sectionId}.{extension}"
# Clean filename (remove invalid characters)
filename = "".join(c if c.isalnum() or c in "._-" else "_" for c in filename)
images.append({
"base64Data": base64Data,
"altText": element.get("altText", "Image"),
"caption": element.get("caption"),
"sectionId": sectionId,
"filename": filename,
"mimeType": mimeType
})
self.logger.debug(f"Extracted image from section {sectionId}: {filename}")
self.logger.info(f"Extracted {len(images)} image(s) from JSON structure")
return images
except Exception as e:
self.logger.warning(f"Error extracting images: {str(e)}")
return []
def _replaceImageDataUris(self, htmlContent: str, images: List[Dict[str, Any]]) -> str:
"""
Replace base64 data URIs in HTML with relative file paths.
Args:
htmlContent: HTML content with data URIs
images: List of image data dictionaries
Returns:
HTML content with relative file paths
"""
try:
import base64
import re
# Find entire img tags with data URIs and replace them
# Pattern: <img src="data:image/[type];base64,<base64>" [other attributes]>
imgTagPattern = r'<img\s+src="data:image/[^"]+"[^>]*>'
def replaceImgTag(match):
imgTag = match.group(0)
# Extract base64 data from the img tag
base64Match = re.search(r'data:image/[^;]+;base64,([A-Za-z0-9+/=]+)', imgTag)
if not base64Match:
return imgTag # Return original if no base64 found
base64Data = base64Match.group(1)
# Find matching image in images list
matchingImage = None
for img in images:
imgBase64 = img.get("base64Data", "")
# Vergleiche base64-Daten (kann unterschiedliche Längen haben durch Padding)
if imgBase64 == base64Data or imgBase64.startswith(base64Data[:100]) or base64Data.startswith(imgBase64[:100]):
matchingImage = img
break
if matchingImage:
import html
# Use filename from image data (generated from section ID)
filename = matchingImage.get("filename", f"image_{images.index(matchingImage) + 1}.png")
# Extract existing alt text or use from matchingImage
altMatch = re.search(r'alt="([^"]*)"', imgTag)
existingAlt = altMatch.group(1) if altMatch else ""
altText = html.escape(str(matchingImage.get("altText", existingAlt or "Image")))
caption = html.escape(str(matchingImage.get("caption", ""))) if matchingImage.get("caption") else ""
# Create new img tag with filename
imgTag = f'<img src="{filename}" alt="{altText}">'
if caption:
return f'<figure>{imgTag}<figcaption>{caption}</figcaption></figure>'
else:
return imgTag
else:
# Keep original if no match found
return match.group(0)
# Replace all img tags with data URIs (auch IMAGE_MARKER Kommentare entfernen)
updatedHtml = re.sub(imgTagPattern, replaceImgTag, htmlContent)
# Entferne IMAGE_MARKER Kommentare die übrig geblieben sind
updatedHtml = re.sub(r'<!--IMAGE_MARKER:[^>]+-->', '', updatedHtml)
return updatedHtml
except Exception as e:
self.logger.warning(f"Error replacing image data URIs: {str(e)}")
return htmlContent # Return original if replacement fails
def getRenderedImages(self) -> List[Dict[str, Any]]:
"""
Get images that were extracted during rendering.
Returns list of image dicts with base64Data, altText, caption, and filename.
"""
if not hasattr(self, '_renderedImages'):
return []
return self._renderedImages