# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
"""
HTML renderer for report generation.
"""

import base64
import html
import re
from typing import Any, Dict, List, Optional

from .documentRendererBaseTemplate import BaseRenderer
from modules.datamodels.datamodelDocument import RenderedDocument


class RendererHtml(BaseRenderer):
    """Renders content to HTML format with format-specific extraction.

    All text taken from the JSON content is HTML-escaped before it is placed
    into the page. Images are emitted as data URIs first and then rewritten
    to relative file names by ``_replaceImageDataUris``.

    NOTE(review): the literal markup strings (tag names, class names) were
    reconstructed during review - confirm them against the expected output.
    """

    # Inline style applied to every <img> so large images stay within the page.
    _IMAGE_TAG_STYLE = "max-width: 100%; max-height: 600px; height: auto;"

    # MIME type -> file extension for image files written next to the HTML.
    _MIME_EXTENSIONS = {
        "image/jpeg": "jpg",
        "image/jpg": "jpg",
        "image/png": "png",
        "image/gif": "gif",
        "image/webp": "webp",
    }

    # Leading base64 characters of well-known image file signatures.
    _BASE64_SIGNATURES = (
        ("/9j/", "image/jpeg"),
        ("iVBORw0KGgo", "image/png"),
        ("R0lGOD", "image/gif"),
        ("UklGR", "image/webp"),
    )

    # CSS selector -> key in the style set, in output order.
    _CSS_SELECTORS = (
        ("body", "body"),
        (".document-title", "title"),
        ("h1", "heading1"),
        ("h2", "heading2"),
        ("p", "paragraph"),
        ("table", "table"),
        ("th", "table_header"),
        ("td", "table_cell"),
        ("ul", "bullet_list"),
        ("pre", "code_block"),
        ("img", "image"),
    )

    # Style key -> (background, color) used when both colors are identical.
    _CONTRAST_FIXES = (
        ("table_header", "#4F4F4F", "#FFFFFF"),
        ("table_cell", "#FFFFFF", "#2F2F2F"),
    )

    _IMG_DATA_URI_TAG = re.compile(r'<img\b[^>]*\bsrc="data:image/[^"]*"[^>]*>')
    _DATA_URI_PAYLOAD = re.compile(r'data:image/[^;]+;base64,([A-Za-z0-9+/=]+)')
    _DATA_URI_URL = re.compile(r'data:(image/[^;]+);base64,(.+)')
    _ALT_ATTRIBUTE = re.compile(r'alt="([^"]*)"')
    _IMAGE_MARKER = re.compile(r'<!--\s*IMAGE_MARKER:.*?-->', re.DOTALL)

    @classmethod
    def getSupportedFormats(cls) -> List[str]:
        """Return supported HTML formats."""
        return ['html', 'htm']

    @classmethod
    def getFormatAliases(cls) -> List[str]:
        """Return format aliases."""
        return ['web', 'webpage']

    @classmethod
    def getPriority(cls) -> int:
        """Return priority for HTML renderer."""
        return 100

    @classmethod
    def getOutputStyle(cls, formatName: Optional[str] = None) -> str:
        """Return output style classification: HTML web pages are rendered documents."""
        return 'document'

    @classmethod
    def getAcceptedSectionTypes(cls, formatName: Optional[str] = None) -> List[str]:
        """Return the section content types the HTML renderer accepts.

        The HTML renderer accepts every supported section type (HTML pages can
        contain all content types including images).
        """
        # Imported locally, as in the original code.
        from modules.datamodels.datamodelJson import supportedSectionTypes
        return list(supportedSectionTypes)

    async def render(
        self,
        extractedContent: Dict[str, Any],
        title: str,
        userPrompt: Optional[str] = None,
        aiService=None,
    ) -> List[RenderedDocument]:
        """Render the HTML document with images as separate files.

        Args:
            extractedContent: Structured JSON content
                ({metadata: {...}, documents: [{sections: [...]}]}).
            title: Document title; also used to derive the file name when the
                first document carries none.
            userPrompt: Optional user prompt forwarded to style selection.
            aiService: Optional AI service forwarded to style selection.

        Returns:
            List of documents: [HTML document, image1, image2, ...].

        Raises:
            Exception: If HTML generation fails (see ``_generateHtmlFromJson``).
        """
        images = self._extractImages(extractedContent)
        # Kept on the instance so getRenderedImages() can return them later.
        self._renderedImages = images

        htmlContent = await self._generateHtmlFromJson(extractedContent, title, userPrompt, aiService)
        if images:
            htmlContent = self._replaceImageDataUris(htmlContent, images)

        # Prefer the file name of the first document, else derive it from the title.
        documents = extractedContent.get("documents", [])
        htmlFilename = None
        if isinstance(documents, list) and documents and isinstance(documents[0], dict):
            htmlFilename = documents[0].get("filename")
        if not htmlFilename:
            htmlFilename = self._determineFilename(title, "text/html")

        metadata = extractedContent.get("metadata", {})
        if not isinstance(metadata, dict):
            metadata = None
        documentType = metadata.get("documentType") if metadata is not None else None

        resultDocuments = [
            RenderedDocument(
                documentData=htmlContent.encode('utf-8'),
                mimeType="text/html",
                filename=htmlFilename,
                documentType=documentType,
                metadata=metadata
            )
        ]

        for img in images:
            base64Data = img.get("base64Data", "")
            if not base64Data:
                continue
            filename = img.get("filename", f"image_{len(resultDocuments)}.png")
            mimeType = img.get("mimeType", "image/png")
            try:
                imageBytes = base64.b64decode(base64Data)
                resultDocuments.append(
                    RenderedDocument(
                        documentData=imageBytes,
                        mimeType=mimeType,
                        filename=filename
                    )
                )
                self.logger.debug(f"Added image file: {filename} ({len(imageBytes)} bytes)")
            except Exception as e:
                # Best effort: one broken image must not fail the whole render.
                self.logger.warning(f"Error creating image file {filename}: {str(e)}")

        return resultDocuments

    async def _generateHtmlFromJson(
        self,
        jsonContent: Dict[str, Any],
        title: str,
        userPrompt: Optional[str] = None,
        aiService=None,
    ) -> str:
        """Generate the HTML page from the structured JSON document.

        Args:
            jsonContent: Content in the standardized schema.
            title: Primary title; ``metadata.title`` is used only when empty.
            userPrompt: Optional user prompt forwarded to ``_getStyleSet``.
            aiService: Optional AI service forwarded to ``_getStyleSet``.

        Returns:
            The complete HTML document as a string.

        Raises:
            Exception: Wraps any failure; the original error is chained.
        """
        try:
            # Validate before resolving styles so an invalid document never
            # triggers an AI styling call.
            if not self._validateJsonStructure(jsonContent):
                raise ValueError("JSON content must follow standardized schema: {metadata: {...}, documents: [{sections: [...]}]}")

            styles = await self._getStyleSet(jsonContent, userPrompt, aiService)
            sections = self._extractSections(jsonContent)
            metadata = self._extractMetadata(jsonContent)

            fallbackTitle = "Generated Document"
            if isinstance(metadata, dict):
                fallbackTitle = metadata.get("title", fallbackTitle)
            documentTitle = title if title else fallbackTitle
            safeTitle = self._escapeText(documentTitle)

            htmlParts = [
                '<!DOCTYPE html>',
                '<html lang="en">',
                '<head>',
                '<meta charset="UTF-8">',
                '<meta name="viewport" content="width=device-width, initial-scale=1.0">',
                f'<title>{safeTitle}</title>',
                '<style>',
                self._generateCssStyles(styles),
                '</style>',
                '</head>',
                '<body>',
                f'<header><h1 class="document-title">{safeTitle}</h1></header>',
                '<main>',
            ]

            for section in sections:
                sectionHtml = self._renderJsonSection(section, styles)
                if sectionHtml:
                    htmlParts.append(sectionHtml)

            htmlParts.extend(['</main>', '</body>', '</html>'])
            return '\n'.join(htmlParts)

        except Exception as e:
            self.logger.error(f"Error generating HTML from JSON: {str(e)}")
            raise Exception(f"HTML generation failed: {str(e)}") from e

    async def _getStyleSet(
        self,
        extractedContent: Dict[str, Any] = None,
        userPrompt: str = None,
        aiService=None,
        templateName: str = None,
    ) -> Dict[str, Any]:
        """Return the style set to use for this document.

        Styles from the document generation metadata are preferred. Only when
        none are provided and both ``userPrompt`` and ``aiService`` are given
        are the default styles enhanced with AI; otherwise the defaults are
        returned unchanged.

        IMPORTANT: styling should come from document generation, not be
        generated separately by renderers; AI is only the fallback.

        Args:
            extractedContent: Document content with metadata (may contain styles).
            userPrompt: User's prompt, passed to the AI style enhancement.
            aiService: AI service (used only if styles are not in metadata).
            templateName: Name of template style set (currently unused here).

        Returns:
            Dict with style definitions.
        """
        defaultStyleSet = self._getDefaultStyleSet()

        if extractedContent:
            metadata = extractedContent.get("metadata", {})
            if isinstance(metadata, dict):
                styles = metadata.get("styles")
                if styles and isinstance(styles, dict):
                    self.logger.debug("Using styles from document generation metadata")
                    return self._validateStylesContrast(styles)

        if userPrompt and aiService:
            self.logger.info("Styles not in metadata, enhancing with AI based on user prompt...")
            enhancedStyleSet = await self._enhanceStylesWithAI(userPrompt, defaultStyleSet, aiService)
            return self._validateStylesContrast(enhancedStyleSet)

        return defaultStyleSet

    async def _enhanceStylesWithAI(self, userPrompt: str, defaultStyleSet: Dict[str, Any], aiService) -> Dict[str, Any]:
        """Enhance default styles with AI; return the defaults if that fails."""
        try:
            styleTemplate = self._createAiStyleTemplate("html", userPrompt, defaultStyleSet)
            return await self._getAiStyles(aiService, styleTemplate, defaultStyleSet)
        except Exception as e:
            self.logger.warning(f"AI style enhancement failed: {str(e)}, using default styles")
            return defaultStyleSet

    def _validateStylesContrast(self, styles: Dict[str, Any]) -> Dict[str, Any]:
        """Fix table styles whose text color equals the background color.

        The entries are modified in place. If the style set is malformed the
        default style set is returned instead.
        """
        try:
            for styleKey, safeBackground, safeColor in self._CONTRAST_FIXES:
                if styleKey not in styles:
                    continue
                entry = styles[styleKey]
                background = entry.get("background", "#FFFFFF").strip().upper()
                color = entry.get("color", "#000000").strip().upper()
                # Identical foreground and background make the text invisible.
                if background == color:
                    entry["background"] = safeBackground
                    entry["color"] = safeColor
            return styles
        except (AttributeError, TypeError, KeyError) as e:
            self.logger.warning(f"Style validation failed: {str(e)}")
            return self._getDefaultStyleSet()

    def _getDefaultStyleSet(self) -> Dict[str, Any]:
        """Default HTML style set - used when no style instructions are present."""
        return {
            "title": {"font_size": "2.5em", "color": "#1F4E79", "font_weight": "bold", "text_align": "center", "margin": "0 0 1em 0"},
            "heading1": {"font_size": "2em", "color": "#2F2F2F", "font_weight": "bold", "text_align": "left", "margin": "1.5em 0 0.5em 0"},
            "heading2": {"font_size": "1.5em", "color": "#4F4F4F", "font_weight": "bold", "text_align": "left", "margin": "1em 0 0.5em 0"},
            "paragraph": {"font_size": "1em", "color": "#2F2F2F", "font_weight": "normal", "text_align": "left", "margin": "0 0 1em 0", "line_height": "1.6"},
            "table": {"border": "1px solid #ddd", "border_collapse": "collapse", "width": "100%", "margin": "1em 0"},
            "table_header": {"background": "#4F4F4F", "color": "#FFFFFF", "font_weight": "bold", "text_align": "center", "padding": "12px"},
            "table_cell": {"background": "#FFFFFF", "color": "#2F2F2F", "font_weight": "normal", "text_align": "left", "padding": "8px", "border": "1px solid #ddd"},
            "bullet_list": {"font_size": "1em", "color": "#2F2F2F", "margin": "0 0 1em 0", "padding_left": "20px"},
            "code_block": {"font_family": "Courier New, monospace", "font_size": "0.9em", "color": "#2F2F2F", "background": "#F5F5F5", "padding": "1em", "border": "1px solid #ddd", "border_radius": "4px", "margin": "1em 0"},
            "image": {"max_width": "100%", "height": "auto", "margin": "1em 0", "border_radius": "4px"},
            "body": {"font_family": "Arial, sans-serif", "background": "#FFFFFF", "color": "#2F2F2F", "margin": "0", "padding": "20px"}
        }

    def _generateCssStyles(self, styles: Dict[str, Any]) -> str:
        """Generate CSS text from the style definitions.

        Each style key is mapped to one selector; property names are converted
        from snake_case to CSS hyphen form. A fixed ``.generated-info`` rule is
        appended at the end.
        """
        cssParts: List[str] = []
        for selector, styleKey in self._CSS_SELECTORS:
            rules = styles.get(styleKey, {})
            if not isinstance(rules, dict):
                rules = {}
            cssParts.append(f"{selector} {{")
            for propertyName, value in rules.items():
                cssProperty = propertyName.replace("_", "-")
                cssParts.append(f"    {cssProperty}: {value};")
            cssParts.append("}")

        cssParts.extend([
            ".generated-info {",
            "    font-size: 0.9em;",
            "    color: #666;",
            "    text-align: center;",
            "    margin-top: 2em;",
            "    padding-top: 1em;",
            "    border-top: 1px solid #ddd;",
            "}",
        ])
        return '\n'.join(cssParts)

    @staticmethod
    def _escapeText(value: Any) -> str:
        """Return ``value`` as HTML-escaped text (``None`` becomes empty)."""
        if value is None:
            return ""
        return html.escape(str(value))

    @staticmethod
    def _extractText(content: Any) -> str:
        """Return the raw text of an element's ``content`` (dict or string)."""
        if isinstance(content, dict):
            text = content.get("text", "")
        elif isinstance(content, str):
            text = content
        else:
            text = ""
        return str(text) if text else ""

    def _renderParagraphElements(self, elements: List[Any]) -> str:
        """Render a list of paragraph-like elements to ``<p>`` tags.

        Handles plain strings, ``reference`` and ``extracted_text`` elements
        and regular elements with nested ``content``. Returns an empty string
        when nothing renderable was found.
        """
        htmlParts: List[str] = []
        for element in elements:
            if isinstance(element, str):
                if element:
                    htmlParts.append(f'<p>{self._escapeText(element)}</p>')
                continue
            if not isinstance(element, dict):
                continue

            elementType = element.get("type", "")
            if elementType == "reference":
                label = element.get("label", "Reference")
                htmlParts.append(f'<p>[Reference: {self._escapeText(label)}]</p>')
            elif elementType == "extracted_text":
                text = self._extractText(element.get("content", ""))
                if text:
                    source = element.get("source", "")
                    sourceText = f' (Source: {self._escapeText(source)})' if source else ''
                    htmlParts.append(f'<p>{self._escapeText(text)}{sourceText}</p>')
            else:
                text = self._extractText(element.get("content", {}))
                if text:
                    htmlParts.append(f'<p>{self._escapeText(text)}</p>')
        return '\n'.join(htmlParts)

    def _renderJsonSection(self, section: Dict[str, Any], styles: Dict[str, Any]) -> str:
        """Render a single JSON section to HTML.

        The section's content type decides the renderer. Table, bullet list,
        heading, code block and image sections render their first element;
        paragraph sections and unknown types render every element as a
        paragraph.

        Returns:
            The section HTML, an empty string when there is nothing to render,
            or an error ``<div>`` when rendering raised.
        """
        try:
            sectionType = self._getSectionType(section)
            sectionData = self._getSectionData(section)

            singleElementRenderers = {
                "table": self._renderJsonTable,
                "bullet_list": self._renderJsonBulletList,
                "heading": self._renderJsonHeading,
                "code_block": self._renderJsonCodeBlock,
                "image": self._renderJsonImage,
            }
            renderer = singleElementRenderers.get(sectionType)
            if renderer is not None:
                if isinstance(sectionData, list) and sectionData:
                    element = sectionData[0] if isinstance(sectionData[0], dict) else {}
                    return renderer(element, styles)
                return ""

            # Paragraph sections and the fallback for unknown types.
            if isinstance(sectionData, list):
                return self._renderParagraphElements(sectionData)
            if isinstance(sectionData, dict):
                return self._renderJsonParagraph(sectionData, styles)
            return ""

        except Exception as e:
            self.logger.warning(f"Error rendering section {self._getSectionId(section)}: {str(e)}")
            return f'<div class="error">[Error rendering section: {self._escapeText(e)}]</div>'

    def _renderJsonTable(self, tableData: Dict[str, Any], styles: Dict[str, Any]) -> str:
        """Render a table element (``content.headers`` / ``content.rows``).

        Returns an empty string when headers or rows are missing.
        """
        try:
            content = tableData.get("content", {})
            if not isinstance(content, dict):
                return ""
            headers = content.get("headers", [])
            rows = content.get("rows", [])
            if not headers or not rows:
                return ""

            htmlParts = ['<table>', '<thead><tr>']
            htmlParts.extend(f'<th>{self._escapeText(header)}</th>' for header in headers)
            htmlParts.append('</tr></thead>')
            htmlParts.append('<tbody>')
            for row in rows:
                # A scalar row is treated as a single-cell row.
                cells = row if isinstance(row, (list, tuple)) else [row]
                htmlParts.append('<tr>')
                htmlParts.extend(f'<td>{self._escapeText(cellData)}</td>' for cellData in cells)
                htmlParts.append('</tr>')
            htmlParts.append('</tbody>')
            htmlParts.append('</table>')
            return '\n'.join(htmlParts)

        except Exception as e:
            self.logger.warning(f"Error rendering table: {str(e)}")
            return ""

    def _renderJsonBulletList(self, listData: Dict[str, Any], styles: Dict[str, Any]) -> str:
        """Render a bullet list element (``content.items``) as ``<ul>``.

        Items may be plain values or dicts carrying a ``text`` key.
        """
        try:
            content = listData.get("content", {})
            if not isinstance(content, dict):
                return ""
            items = content.get("items", [])
            if not items:
                return ""

            htmlParts = ['<ul>']
            for item in items:
                itemText = item.get("text", "") if isinstance(item, dict) else item
                htmlParts.append(f'<li>{self._escapeText(itemText)}</li>')
            htmlParts.append('</ul>')
            return '\n'.join(htmlParts)

        except Exception as e:
            self.logger.warning(f"Error rendering bullet list: {str(e)}")
            return ""

    def _renderJsonHeading(self, headingData: Dict[str, Any], styles: Dict[str, Any]) -> str:
        """Render a heading element (``content.text`` / ``content.level``).

        The level is clamped to 1..6; a non-numeric level falls back to 1.
        """
        try:
            content = headingData.get("content", {})
            if not isinstance(content, dict):
                return ""
            text = content.get("text", "")
            if not text:
                return ""
            try:
                level = int(content.get("level", 1))
            except (TypeError, ValueError):
                level = 1
            level = max(1, min(6, level))
            return f'<h{level}>{self._escapeText(text)}</h{level}>'

        except Exception as e:
            self.logger.warning(f"Error rendering heading: {str(e)}")
            return ""

    def _renderJsonParagraph(self, paragraphData: Dict[str, Any], styles: Dict[str, Any]) -> str:
        """Render paragraph data given as a list of elements, a dict or a string."""
        try:
            if isinstance(paragraphData, list):
                return self._renderParagraphElements(paragraphData)
            if isinstance(paragraphData, str):
                return f'<p>{self._escapeText(paragraphData)}</p>'
            if isinstance(paragraphData, dict):
                text = self._extractText(paragraphData.get("content", {}))
                if text:
                    return f'<p>{self._escapeText(text)}</p>'
            return ""

        except Exception as e:
            self.logger.warning(f"Error rendering paragraph: {str(e)}")
            return ""

    def _renderJsonCodeBlock(self, codeData: Dict[str, Any], styles: Dict[str, Any]) -> str:
        """Render a code block element (``content.code`` / ``content.language``)."""
        try:
            content = codeData.get("content", {})
            if not isinstance(content, dict):
                return ""
            code = content.get("code", "")
            if not code:
                return ""
            language = content.get("language", "")
            escapedCode = self._escapeText(code)
            if language:
                return f'<pre><code class="language-{self._escapeText(language)}">{escapedCode}</code></pre>'
            return f'<pre><code>{escapedCode}</code></pre>'

        except Exception as e:
            self.logger.warning(f"Error rendering code block: {str(e)}")
            return ""

    def _readImageElement(self, element: Any) -> Dict[str, Any]:
        """Collect image fields from an element.

        Looks in the nested ``content`` dict and directly on the element, and
        falls back to a ``data:image/...;base64,`` URL for the payload. A
        missing or "unknown" MIME type is guessed from the base64 signature
        (default ``image/png``).

        Returns:
            Dict with ``base64Data``, ``mimeType``, ``altText``, ``caption``;
            ``base64Data`` is empty when the element carries no image payload.
        """
        if not isinstance(element, dict):
            element = {}
        content = element.get("content", {})
        if not isinstance(content, dict):
            content = {}

        base64Data = content.get("base64Data", "") or element.get("base64Data", "")
        mimeType = element.get("mimeType", "") or content.get("mimeType", "")

        if not base64Data:
            url = element.get("url", "") or content.get("url", "")
            if isinstance(url, str) and url.startswith("data:image/"):
                urlMatch = self._DATA_URI_URL.match(url)
                if urlMatch:
                    base64Data = urlMatch.group(2)
                    if not mimeType or mimeType == "unknown":
                        mimeType = urlMatch.group(1)

        if not isinstance(base64Data, str):
            base64Data = ""

        if base64Data and (not mimeType or mimeType == "unknown"):
            mimeType = "image/png"
            for signature, detectedType in self._BASE64_SIGNATURES:
                if base64Data.startswith(signature):
                    mimeType = detectedType
                    break

        return {
            "base64Data": base64Data,
            "mimeType": mimeType,
            "altText": content.get("altText") or element.get("altText") or "Image",
            "caption": content.get("caption") or element.get("caption") or "",
        }

    def _renderJsonImage(self, imageData: Dict[str, Any], styles: Dict[str, Any]) -> str:
        """Render an image element as an ``<img>`` tag with a data URI.

        The data URI is later replaced with a relative file name by
        ``_replaceImageDataUris``. Alt text and caption are escaped. A caption
        wraps the image in ``<figure>`` / ``<figcaption>``.
        """
        altText = "Image"
        try:
            image = self._readImageElement(imageData)
            altText = image["altText"]
            base64Data = image["base64Data"]
            if not base64Data:
                return ""

            altTextEscaped = self._escapeText(altText)
            captionEscaped = self._escapeText(image["caption"]) if image["caption"] else ""

            # Marker comment; stripped again in _replaceImageDataUris.
            imageMarker = f"<!-- IMAGE_MARKER:{altTextEscaped} -->"
            dataUri = f'data:{html.escape(str(image["mimeType"]))};base64,{html.escape(base64Data)}'
            imgTag = f'<img src="{dataUri}" alt="{altTextEscaped}" style="{self._IMAGE_TAG_STYLE}">'

            if captionEscaped:
                return f'{imageMarker}<figure>{imgTag}<figcaption>{captionEscaped}</figcaption></figure>'
            return f'{imageMarker}{imgTag}'

        except Exception as e:
            self.logger.error(f"Error embedding image in HTML: {str(e)}")
            errorMsg = html.escape(f"[Error: Could not embed image '{altText}'. {str(e)}]")
            return f'<div class="error">{errorMsg}</div>'

    def _extractImages(self, jsonContent: Dict[str, Any]) -> List[Dict[str, Any]]:
        """Extract all images from sections whose ``content_type`` is "image".

        File names are derived from the section id; when one section yields
        several images, later ones get a numeric suffix so names stay unique.

        Returns:
            List of image dicts with base64Data, altText, caption, sectionId,
            filename and mimeType. Empty on malformed input.
        """
        images: List[Dict[str, Any]] = []
        usedFilenames = set()
        try:
            documents = jsonContent.get("documents", [])
            if not documents or not isinstance(documents, list):
                return images

            for doc in documents:
                if not isinstance(doc, dict):
                    continue
                for section in doc.get("sections", []) or []:
                    if not isinstance(section, dict) or section.get("content_type") != "image":
                        continue
                    sectionId = section.get("id", "unknown")
                    for element in section.get("elements", []) or []:
                        image = self._readImageElement(element)
                        if not image["base64Data"]:
                            continue

                        extension = self._MIME_EXTENSIONS.get(image["mimeType"], "png")
                        # Replace characters that are invalid in file names.
                        stem = "".join(c if c.isalnum() or c in "._-" else "_" for c in str(sectionId))
                        filename = f"{stem}.{extension}"
                        suffix = 2
                        while filename in usedFilenames:
                            filename = f"{stem}_{suffix}.{extension}"
                            suffix += 1
                        usedFilenames.add(filename)

                        images.append({
                            "base64Data": image["base64Data"],
                            "altText": image["altText"],
                            "caption": image["caption"] or None,
                            "sectionId": sectionId,
                            "filename": filename,
                            "mimeType": image["mimeType"]
                        })
                        self.logger.debug(f"Extracted image from section {sectionId}: {filename}")

            self.logger.info(f"Extracted {len(images)} image(s) from JSON structure")
            return images

        except Exception as e:
            self.logger.warning(f"Error extracting images: {str(e)}")
            return []

    @staticmethod
    def _findImageByPayload(base64Data: str, images: List[Dict[str, Any]]) -> Optional[Dict[str, Any]]:
        """Return the image whose payload matches ``base64Data``.

        An exact match anywhere in the list wins; only then is a match on the
        first 100 characters accepted (the payload captured from the HTML can
        be shorter than the stored one).
        """
        for img in images:
            if img.get("base64Data", "") == base64Data:
                return img
        prefix = base64Data[:100]
        for img in images:
            candidate = img.get("base64Data", "")
            if candidate and (candidate.startswith(prefix) or base64Data.startswith(candidate[:100])):
                return img
        return None

    def _replaceImageDataUris(self, htmlContent: str, images: List[Dict[str, Any]]) -> str:
        """Replace base64 data URIs in the HTML with relative file names.

        Only the ``<img>`` tag itself is rewritten, so a surrounding
        ``<figure>`` / ``<figcaption>`` stays as rendered. Leftover
        IMAGE_MARKER comments are removed.

        Args:
            htmlContent: HTML content with data URIs.
            images: Image dicts as returned by ``_extractImages``.

        Returns:
            HTML with relative file names; the input unchanged on failure.
        """
        try:
            def replaceImgTag(match):
                originalTag = match.group(0)
                payloadMatch = self._DATA_URI_PAYLOAD.search(originalTag)
                if not payloadMatch:
                    return originalTag

                matchingImage = self._findImageByPayload(payloadMatch.group(1), images)
                if matchingImage is None:
                    return originalTag

                filename = matchingImage.get("filename", f"image_{images.index(matchingImage) + 1}.png")
                # The alt text inside the tag is already escaped; reuse it as is.
                altMatch = self._ALT_ATTRIBUTE.search(originalTag)
                if altMatch and altMatch.group(1):
                    altText = altMatch.group(1)
                else:
                    altText = self._escapeText(matchingImage.get("altText") or "Image")
                return f'<img src="{html.escape(str(filename))}" alt="{altText}" style="{self._IMAGE_TAG_STYLE}">'

            updatedHtml = self._IMG_DATA_URI_TAG.sub(replaceImgTag, htmlContent)
            return self._IMAGE_MARKER.sub('', updatedHtml)

        except Exception as e:
            self.logger.warning(f"Error replacing image data URIs: {str(e)}")
            return htmlContent

    def getRenderedImages(self) -> List[Dict[str, Any]]:
        """Return the images extracted during the last ``render`` call.

        Returns an empty list when ``render`` has not run on this instance.
        """
        return getattr(self, '_renderedImages', [])