# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
"""
HTML renderer for report generation.
"""

import base64
import html
import re

from .rendererBaseTemplate import BaseRenderer
from modules.datamodels.datamodelDocument import RenderedDocument
from typing import Dict, Any, List, Optional

# Matches an inline image data URI; group 1 is the base64 payload.
_DATA_URI_PATTERN = re.compile(r'data:image/[^;]+;base64,([A-Za-z0-9+/=]+)')
# Matches a data URI and additionally captures the MIME type (group 1).
_DATA_URI_WITH_MIME_PATTERN = re.compile(r'data:(image/[^;]+);base64,(.+)')
# HTML comment emitted in front of every inline image and stripped again
# once data URIs have been replaced with file names.
_IMAGE_MARKER = "<!-- IMAGE_MARKER -->"
_IMAGE_MARKER_PATTERN = re.compile(r'<!--\s*IMAGE_MARKER[^>]*-->')
# Characters that would let a style value break out of its declaration
# or out of the surrounding <style> element.
_UNSAFE_CSS_CHARS = frozenset("<>{};")
# MIME type -> file extension for extracted image files.
_IMAGE_EXTENSIONS = {
    "image/jpeg": "jpg",
    "image/jpg": "jpg",
    "image/png": "png",
    "image/gif": "gif",
    "image/webp": "webp",
}


def _escape(value: Any) -> str:
    """Return *value* as text that is safe to place in HTML content or attributes."""
    return html.escape(str(value), quote=True)


class RendererHtml(BaseRenderer):
    """Renders content to HTML format with format-specific extraction."""

    # (style-set key, CSS selector) pairs, in the order the rules are emitted.
    _CSS_SELECTORS = (
        ("body", "body"),
        ("title", ".document-title"),
        ("heading1", "h1"),
        ("heading2", "h2"),
        ("paragraph", "p"),
        ("table", "table"),
        ("table_header", "th"),
        ("table_cell", "td"),
        ("bullet_list", "ul"),
        ("code_block", "pre"),
        ("image", "img"),
    )

    # Style entries checked for unreadable colors, with the
    # (background, text color) pair applied when they are unreadable.
    _CONTRAST_FALLBACKS = {
        "table_header": ("#4F4F4F", "#FFFFFF"),
        "table_cell": ("#FFFFFF", "#2F2F2F"),
    }

    @classmethod
    def getSupportedFormats(cls) -> List[str]:
        """Return supported HTML formats."""
        return ['html', 'htm']

    @classmethod
    def getFormatAliases(cls) -> List[str]:
        """Return format aliases."""
        return ['web', 'webpage']

    @classmethod
    def getPriority(cls) -> int:
        """Return priority for HTML renderer."""
        return 100

    async def render(self, extractedContent: Dict[str, Any], title: str, userPrompt: Optional[str] = None, aiService=None) -> List[RenderedDocument]:
        """
        Render HTML document with images as separate files.

        Args:
            extractedContent: Structured JSON content ({metadata, documents: [{sections}]}).
            title: Fallback document title, also used to derive the file name.
            userPrompt: Optional user prompt; forwarded for AI style enhancement.
            aiService: Optional AI service; forwarded for AI style enhancement.

        Returns:
            List of documents: [HTML document, image1, image2, ...]

        Raises:
            Exception: If HTML generation fails (see _generateHtmlFromJson).
        """
        # Extract images first and keep them for getRenderedImages()
        images = self._extractImages(extractedContent)
        self._renderedImages = images

        htmlContent = await self._generateHtmlFromJson(extractedContent, title, userPrompt, aiService)

        # Replace base64 data URIs with relative file paths if images exist
        if images:
            htmlContent = self._replaceImageDataUris(htmlContent, images)

        # Prefer the file name of the first document, otherwise derive it from the title
        htmlFilename = None
        documents = extractedContent.get("documents", [])
        if isinstance(documents, list) and documents and isinstance(documents[0], dict):
            htmlFilename = documents[0].get("filename")
        if not htmlFilename:
            htmlFilename = self._determineFilename(title, "text/html")

        resultDocuments = [
            RenderedDocument(
                documentData=htmlContent.encode('utf-8'),
                mimeType="text/html",
                filename=htmlFilename
            )
        ]

        # Add images as separate documents; a broken image must not fail the render
        for img in images:
            base64Data = img.get("base64Data", "")
            filename = img.get("filename", f"image_{len(resultDocuments)}.png")
            mimeType = img.get("mimeType", "image/png")
            if not base64Data:
                continue
            try:
                imageBytes = base64.b64decode(base64Data)
                resultDocuments.append(
                    RenderedDocument(
                        documentData=imageBytes,
                        mimeType=mimeType,
                        filename=filename
                    )
                )
                self.logger.debug(f"Added image file: {filename} ({len(imageBytes)} bytes)")
            except Exception as e:
                self.logger.warning(f"Error creating image file {filename}: {str(e)}")

        return resultDocuments

    async def _generateHtmlFromJson(self, jsonContent: Dict[str, Any], title: str, userPrompt: Optional[str] = None, aiService=None) -> str:
        """
        Generate a complete HTML document from the structured JSON content.

        Args:
            jsonContent: Content following the standardized schema.
            title: Title used when the JSON metadata has none.
            userPrompt: Optional user prompt for AI style enhancement.
            aiService: Optional AI service for AI style enhancement.

        Returns:
            The HTML document as a string.

        Raises:
            Exception: Wraps any failure (including schema validation errors).
        """
        try:
            # Get style set: default styles, enhanced with AI if userPrompt provided
            styles = await self._getStyleSet(userPrompt, aiService)

            if not self._validateJsonStructure(jsonContent):
                raise ValueError("JSON content must follow standardized schema: {metadata: {...}, documents: [{sections: [...]}]}")

            sections = self._extractSections(jsonContent)
            metadata = self._extractMetadata(jsonContent)

            # Use title from JSON metadata if available, otherwise use provided title
            documentTitle = metadata.get("title") or title
            safeTitle = _escape(documentTitle)

            htmlParts = [
                '<!DOCTYPE html>',
                '<html lang="en">',
                '<head>',
                '<meta charset="UTF-8">',
                '<meta name="viewport" content="width=device-width, initial-scale=1.0">',
                f'<title>{safeTitle}</title>',
                '<style>',
                self._generateCssStyles(styles),
                '</style>',
                '</head>',
                '<body>',
                f'<header><h1 class="document-title">{safeTitle}</h1></header>',
                '<main>',
            ]

            for section in sections:
                sectionHtml = self._renderJsonSection(section, styles)
                if sectionHtml:
                    htmlParts.append(sectionHtml)

            htmlParts.extend(['</main>', '</body>', '</html>'])
            return '\n'.join(htmlParts)

        except Exception as e:
            self.logger.error(f"Error generating HTML from JSON: {str(e)}")
            raise Exception(f"HTML generation failed: {str(e)}") from e

    async def _getStyleSet(self, userPrompt: Optional[str] = None, aiService=None, templateName: Optional[str] = None) -> Dict[str, Any]:
        """Get style set - default styles, enhanced with AI if userPrompt provided.

        Args:
            userPrompt: User's prompt (AI will detect style instructions in any language)
            aiService: AI service (used only if userPrompt provided)
            templateName: Name of template style set (None = default); currently unused here

        Returns:
            Dict with style definitions for all document styles
        """
        defaultStyleSet = self._getDefaultStyleSet()

        if not (userPrompt and aiService):
            # Use default styles only
            return defaultStyleSet

        self.logger.info("Enhancing styles with AI based on user prompt...")
        enhancedStyleSet = await self._enhanceStylesWithAI(userPrompt, defaultStyleSet, aiService)
        return self._validateStylesContrast(enhancedStyleSet)

    async def _enhanceStylesWithAI(self, userPrompt: str, defaultStyleSet: Dict[str, Any], aiService) -> Dict[str, Any]:
        """Enhance default styles with AI based on user prompt; fall back to the defaults on any error."""
        try:
            styleTemplate = self._createAiStyleTemplate("html", userPrompt, defaultStyleSet)
            return await self._getAiStyles(aiService, styleTemplate, defaultStyleSet)
        except Exception as e:
            self.logger.warning(f"AI style enhancement failed: {str(e)}, using default styles")
            return defaultStyleSet

    def _validateStylesContrast(self, styles: Dict[str, Any]) -> Dict[str, Any]:
        """
        Validate and fix contrast issues in AI-generated styles.

        For table headers and table cells, a background that equals the text
        color makes the text invisible; such pairs are replaced with a readable
        fallback. The style dicts are modified in place.

        Returns:
            The (possibly corrected) styles, or the default style set if
            validation itself fails.
        """
        try:
            for styleName, (safeBackground, safeColor) in self._CONTRAST_FALLBACKS.items():
                style = styles.get(styleName)
                if not isinstance(style, dict):
                    continue
                background = str(style.get("background", "#FFFFFF")).strip().upper()
                textColor = str(style.get("color", "#000000")).strip().upper()
                if background == textColor:
                    style["background"] = safeBackground
                    style["color"] = safeColor
            return styles
        except Exception as e:
            self.logger.warning(f"Style validation failed: {str(e)}")
            return self._getDefaultStyleSet()

    def _getDefaultStyleSet(self) -> Dict[str, Any]:
        """Default HTML style set - used when no style instructions present."""
        return {
            "title": {"font_size": "2.5em", "color": "#1F4E79", "font_weight": "bold", "text_align": "center", "margin": "0 0 1em 0"},
            "heading1": {"font_size": "2em", "color": "#2F2F2F", "font_weight": "bold", "text_align": "left", "margin": "1.5em 0 0.5em 0"},
            "heading2": {"font_size": "1.5em", "color": "#4F4F4F", "font_weight": "bold", "text_align": "left", "margin": "1em 0 0.5em 0"},
            "paragraph": {"font_size": "1em", "color": "#2F2F2F", "font_weight": "normal", "text_align": "left", "margin": "0 0 1em 0", "line_height": "1.6"},
            "table": {"border": "1px solid #ddd", "border_collapse": "collapse", "width": "100%", "margin": "1em 0"},
            "table_header": {"background": "#4F4F4F", "color": "#FFFFFF", "font_weight": "bold", "text_align": "center", "padding": "12px"},
            "table_cell": {"background": "#FFFFFF", "color": "#2F2F2F", "font_weight": "normal", "text_align": "left", "padding": "8px", "border": "1px solid #ddd"},
            "bullet_list": {"font_size": "1em", "color": "#2F2F2F", "margin": "0 0 1em 0", "padding_left": "20px"},
            "code_block": {"font_family": "Courier New, monospace", "font_size": "0.9em", "color": "#2F2F2F", "background": "#F5F5F5", "padding": "1em", "border": "1px solid #ddd", "border_radius": "4px", "margin": "1em 0"},
            "image": {"max_width": "100%", "height": "auto", "margin": "1em 0", "border_radius": "4px"},
            "body": {"font_family": "Arial, sans-serif", "background": "#FFFFFF", "color": "#2F2F2F", "margin": "0", "padding": "20px"}
        }

    def _generateCssStyles(self, styles: Dict[str, Any]) -> str:
        """
        Generate CSS from style definitions.

        Each style-set entry becomes one rule; property names are converted
        from snake_case to kebab-case (font_size -> font-size). Declarations
        containing characters that could terminate the rule or the <style>
        element are dropped, because style values may come from an AI service.
        """
        cssParts: List[str] = []
        for styleName, selector in self._CSS_SELECTORS:
            declarations = styles.get(styleName, {})
            if not isinstance(declarations, dict):
                declarations = {}
            cssParts.append(f"{selector} {{")
            for propertyName, value in declarations.items():
                cssProperty = str(propertyName).replace("_", "-")
                cssValue = str(value)
                if _UNSAFE_CSS_CHARS.intersection(cssProperty + cssValue):
                    continue
                cssParts.append(f"    {cssProperty}: {cssValue};")
            cssParts.append("}")

        # Generated info
        cssParts.extend([
            ".generated-info {",
            "    font-size: 0.9em;",
            "    color: #666;",
            "    text-align: center;",
            "    margin-top: 2em;",
            "    padding-top: 1em;",
            "    border-top: 1px solid #ddd;",
            "}",
        ])
        return '\n'.join(cssParts)

    def _renderSpecialElement(self, element: Any) -> Optional[str]:
        """
        Render a 'reference' or 'extracted_text' element.

        Returns:
            The HTML for the element, "" when the element is of a handled type
            but has nothing to show, or None when the element is of another type.
        """
        if not isinstance(element, dict):
            return None
        elementType = element.get("type", "")
        if elementType == "reference":
            label = element.get("label", "Reference")
            return f'<p class="reference">[Reference: {_escape(label)}]</p>'
        if elementType == "extracted_text":
            content = element.get("content", "")
            if not content:
                return ""
            source = element.get("source", "")
            sourceText = f' <em>(Source: {_escape(source)})</em>' if source else ''
            return f'<p>{_escape(content)}{sourceText}</p>'
        return None

    def _extractHeadingText(self, elements: List[Any]) -> Any:
        """
        Pick the heading text from a heading section's elements.

        The first element that is a heading, a non-empty extracted_text, or
        carries a "text" key decides the result; "" means nothing usable found.
        """
        for element in elements:
            if not isinstance(element, dict):
                continue
            elementType = element.get("type", "")
            if elementType == "heading":
                return element.get("content", element.get("text", ""))
            if elementType == "extracted_text":
                content = element.get("content", "")
                if content and isinstance(content, str):
                    # Use the first line as heading and drop markdown formatting
                    firstLine = content.split('\n')[0].strip()
                    return firstLine.replace('#', '').replace('**', '').strip()
                continue
            if "text" in element:
                return element.get("text", "")
        return ""

    def _renderJsonSection(self, section: Dict[str, Any], styles: Dict[str, Any]) -> str:
        """
        Render a single JSON section to HTML.

        Supports three content formats: reference, object (base64), extracted_text.
        IMPORTANT: the section type (content_type) decides the rendering logic
        first; the individual elements are then processed accordingly.

        Returns:
            The section's HTML, or an error placeholder if rendering raised.
        """
        try:
            sectionType = self._getSectionType(section)
            sectionData = self._getSectionData(section)

            if sectionType == "table":
                return self._renderJsonTable(self._processSectionByType(section), styles)

            if sectionType == "bullet_list":
                return self._renderJsonBulletList(self._processSectionByType(section), styles)

            if sectionType == "code_block":
                return self._renderJsonCodeBlock(self._processSectionByType(section), styles)

            if sectionType == "image":
                return self._renderJsonImage(self._processSectionByType(section), styles)

            if sectionType == "heading":
                if isinstance(sectionData, list):
                    headingText = self._extractHeadingText(sectionData)
                    if headingText:
                        return self._renderJsonHeading({"text": headingText, "level": 2}, styles)
                return self._renderJsonHeading(sectionData, styles)

            if sectionType == "paragraph":
                if isinstance(sectionData, list):
                    htmlParts = []
                    for element in sectionData:
                        if isinstance(element, str):
                            if element:
                                htmlParts.append(f'<p>{_escape(element)}</p>')
                            continue
                        special = self._renderSpecialElement(element)
                        if special is not None:
                            if special:
                                htmlParts.append(special)
                            continue
                        if isinstance(element, dict):
                            # Regular paragraph element
                            text = element.get("text", element.get("content", ""))
                            if text:
                                htmlParts.append(f'<p>{_escape(text)}</p>')
                    if htmlParts:
                        return '\n'.join(htmlParts)
                return self._renderJsonParagraph(sectionData, styles)

            # Unknown section type: render special elements if there are any,
            # otherwise fall back to paragraph rendering
            if isinstance(sectionData, list):
                htmlParts = [
                    rendered
                    for rendered in map(self._renderSpecialElement, sectionData)
                    if rendered
                ]
                if htmlParts:
                    return '\n'.join(htmlParts)
            return self._renderJsonParagraph(sectionData, styles)

        except Exception as e:
            self.logger.warning(f"Error rendering section {self._getSectionId(section)}: {str(e)}")
            return f'<div class="error">[Error rendering section: {_escape(e)}]</div>'

    def _renderJsonTable(self, tableData: Dict[str, Any], styles: Dict[str, Any]) -> str:
        """Render a JSON table ({"headers": [...], "rows": [[...], ...]}) to HTML; "" if headers or rows are missing."""
        try:
            headers = tableData.get("headers", [])
            rows = tableData.get("rows", [])
            if not headers or not rows:
                return ""

            htmlParts = ['<table>', '<thead><tr>']
            htmlParts.extend(f'<th>{_escape(header)}</th>' for header in headers)
            htmlParts.append('</tr></thead>')

            htmlParts.append('<tbody>')
            for row in rows:
                htmlParts.append('<tr>')
                htmlParts.extend(f'<td>{_escape(cellData)}</td>' for cellData in row)
                htmlParts.append('</tr>')
            htmlParts.append('</tbody>')
            htmlParts.append('</table>')
            return '\n'.join(htmlParts)

        except Exception as e:
            self.logger.warning(f"Error rendering table: {str(e)}")
            return ""

    def _renderJsonBulletList(self, listData: Dict[str, Any], styles: Dict[str, Any]) -> str:
        """Render a JSON bullet list ({"items": [...]}) to HTML; items are strings or dicts with a "text" key."""
        try:
            items = listData.get("items", [])
            if not items:
                return ""

            htmlParts = ['<ul>']
            for item in items:
                if isinstance(item, str):
                    htmlParts.append(f'<li>{_escape(item)}</li>')
                elif isinstance(item, dict) and "text" in item:
                    htmlParts.append(f'<li>{_escape(item["text"])}</li>')
            htmlParts.append('</ul>')
            return '\n'.join(htmlParts)

        except Exception as e:
            self.logger.warning(f"Error rendering bullet list: {str(e)}")
            return ""

    def _renderJsonHeading(self, headingData: Dict[str, Any], styles: Dict[str, Any]) -> str:
        """
        Render a JSON heading to HTML.

        Accepts a dict ({"text", "level"}), a list of elements (the first one
        is used) or a plain string (rendered as level 2). The level is clamped
        to 1..6; a non-numeric level falls back to 1.
        """
        try:
            if isinstance(headingData, list):
                if not headingData:
                    return ""
                headingData = headingData[0] if isinstance(headingData[0], dict) else {}
            elif isinstance(headingData, str):
                headingData = {"text": headingData, "level": 2}
            elif not isinstance(headingData, dict):
                return ""

            text = headingData.get("text", "")
            if not text:
                return ""

            try:
                level = int(headingData.get("level", 1))
            except (TypeError, ValueError):
                level = 1
            level = max(1, min(6, level))
            return f'<h{level}>{_escape(text)}</h{level}>'

        except Exception as e:
            self.logger.warning(f"Error rendering heading: {str(e)}")
            return ""

    def _renderJsonParagraph(self, paragraphData: Dict[str, Any], styles: Dict[str, Any]) -> str:
        """
        Render a JSON paragraph to HTML.

        Accepts a list of elements (dicts with "text" or plain strings, one
        paragraph each), a plain string, or a dict with a "text" key.
        """
        try:
            if isinstance(paragraphData, list):
                texts = []
                for el in paragraphData:
                    if isinstance(el, dict) and "text" in el:
                        texts.append(el["text"])
                    elif isinstance(el, str):
                        texts.append(el)
                return '\n'.join(f'<p>{_escape(text)}</p>' for text in texts)

            if isinstance(paragraphData, str):
                return f'<p>{_escape(paragraphData)}</p>'

            if isinstance(paragraphData, dict):
                text = paragraphData.get("text", "")
                return f'<p>{_escape(text)}</p>' if text else ""

            return ""

        except Exception as e:
            self.logger.warning(f"Error rendering paragraph: {str(e)}")
            return ""

    def _renderJsonCodeBlock(self, codeData: Dict[str, Any], styles: Dict[str, Any]) -> str:
        """Render a JSON code block ({"code", "language"}) to HTML; the code is escaped so markup in it is shown literally."""
        try:
            code = codeData.get("code", "")
            language = codeData.get("language", "")
            if not code:
                return ""
            if language:
                return f'<pre><code class="language-{_escape(language)}">{_escape(code)}</code></pre>'
            return f'<pre><code>{_escape(code)}</code></pre>'

        except Exception as e:
            self.logger.warning(f"Error rendering code block: {str(e)}")
            return ""

    def _renderJsonImage(self, imageData: Dict[str, Any], styles: Dict[str, Any]) -> str:
        """
        Render a JSON image to HTML with an inline data URI.

        The data URI acts as a placeholder: _replaceImageDataUris later swaps
        it for the image's file name and removes the marker comment.
        """
        try:
            base64Data = imageData.get("base64Data", "")
            if not base64Data:
                return ""

            # Escape HTML in altText and caption to prevent injection
            altTextEscaped = _escape(imageData.get("altText", "Image"))
            caption = imageData.get("caption", "")
            captionEscaped = _escape(caption) if caption else ""

            mimeType = imageData.get("mimeType")
            if not (isinstance(mimeType, str) and mimeType.startswith("image/")):
                mimeType = "image/png"

            imgTag = f'<img src="data:{_escape(mimeType)};base64,{base64Data}" alt="{altTextEscaped}">'
            if captionEscaped:
                return f'{_IMAGE_MARKER}<figure>{imgTag}<figcaption>{captionEscaped}</figcaption></figure>'
            return f'{_IMAGE_MARKER}{imgTag}'

        except Exception as e:
            self.logger.warning(f"Error rendering image: {str(e)}")
            altText = imageData.get("altText", "Image") if isinstance(imageData, dict) else "Image"
            return f'<div class="error">[Image: {_escape(altText)}]</div>'

    def _extractImages(self, jsonContent: Dict[str, Any]) -> List[Dict[str, Any]]:
        """
        Extract all images from JSON structure.

        Walks documents -> sections with content_type "image" -> elements and
        collects every element that carries base64 data, either directly
        ("base64Data") or inside a data URI ("url"). Malformed sections or
        elements are skipped.

        Returns:
            List of image data dictionaries with base64Data, altText, caption,
            sectionId, filename and mimeType. File names are derived from the
            section ID and are unique within the returned list.
        """
        images: List[Dict[str, Any]] = []
        usedFilenames = set()
        try:
            # Extract from standardized schema: {metadata: {...}, documents: [{sections: [...]}]}
            documents = jsonContent.get("documents", [])
            if not documents or not isinstance(documents, list):
                return images

            for doc in documents:
                if not isinstance(doc, dict):
                    continue
                for section in doc.get("sections", []) or []:
                    if not isinstance(section, dict) or section.get("content_type") != "image":
                        continue
                    sectionId = section.get("id", "unknown")

                    for element in section.get("elements", []) or []:
                        if not isinstance(element, dict):
                            continue

                        base64Data = element.get("base64Data", "")
                        uriMimeType = None
                        if not base64Data:
                            # Fall back to a data URI in "url": data:image/png;base64,<data>
                            url = element.get("url") or ""
                            uriMatch = _DATA_URI_WITH_MIME_PATTERN.match(url) if isinstance(url, str) else None
                            if uriMatch:
                                uriMimeType, base64Data = uriMatch.group(1), uriMatch.group(2)
                        if not base64Data:
                            continue

                        # Determine MIME type: explicit value, then data URI, then signature sniffing
                        mimeType = element.get("mimeType")
                        if not mimeType or mimeType == "unknown":
                            if uriMimeType:
                                mimeType = uriMimeType
                            elif base64Data.startswith("/9j/"):
                                mimeType = "image/jpeg"
                            else:
                                mimeType = "image/png"  # PNG signature or unknown: default

                        extension = _IMAGE_EXTENSIONS.get(mimeType, "png")

                        # Generate a clean file name from the section ID (replace invalid characters)
                        stem = "".join(c if c.isalnum() or c in "._-" else "_" for c in str(sectionId))
                        filename = f"{stem}.{extension}"
                        # Several images in one section must not overwrite each other
                        suffix = 2
                        while filename in usedFilenames:
                            filename = f"{stem}_{suffix}.{extension}"
                            suffix += 1
                        usedFilenames.add(filename)

                        images.append({
                            "base64Data": base64Data,
                            "altText": element.get("altText", "Image"),
                            "caption": element.get("caption"),
                            "sectionId": sectionId,
                            "filename": filename,
                            "mimeType": mimeType
                        })
                        self.logger.debug(f"Extracted image from section {sectionId}: {filename}")

            self.logger.info(f"Extracted {len(images)} image(s) from JSON structure")
            return images

        except Exception as e:
            self.logger.warning(f"Error extracting images: {str(e)}")
            return []

    @staticmethod
    def _findImageForData(base64Data: str, images: List[Dict[str, Any]]) -> Optional[int]:
        """
        Return the index of the image whose data corresponds to *base64Data*.

        An exact match wins; otherwise the first image sharing the leading 100
        characters is used (the payload found in the HTML may differ in length,
        e.g. through padding). Returns None when nothing matches.
        """
        for index, img in enumerate(images):
            if img.get("base64Data", "") == base64Data:
                return index
        for index, img in enumerate(images):
            imgBase64 = img.get("base64Data", "")
            if imgBase64.startswith(base64Data[:100]) or base64Data.startswith(imgBase64[:100]):
                return index
        return None

    def _replaceImageDataUris(self, htmlContent: str, images: List[Dict[str, Any]]) -> str:
        """
        Replace base64 data URIs in HTML with relative file paths.

        Only the URI itself is substituted (it sits inside an src attribute),
        so the surrounding tag, alt text and caption stay as rendered. Data
        URIs without a matching image are left untouched. Marker comments are
        removed afterwards.

        Args:
            htmlContent: HTML content with data URIs
            images: List of image data dictionaries

        Returns:
            HTML content with relative file paths (the original content if
            replacement fails)
        """
        try:
            def replaceDataUri(match):
                index = self._findImageForData(match.group(1), images)
                if index is None:
                    # Keep original if no match found
                    return match.group(0)
                # Relative path: file name only, no directory
                filename = images[index].get("filename") or f"image_{index + 1}.png"
                return _escape(filename)

            updatedHtml = _DATA_URI_PATTERN.sub(replaceDataUri, htmlContent)
            return _IMAGE_MARKER_PATTERN.sub('', updatedHtml)

        except Exception as e:
            self.logger.warning(f"Error replacing image data URIs: {str(e)}")
            return htmlContent  # Return original if replacement fails

    def getRenderedImages(self) -> List[Dict[str, Any]]:
        """
        Get images that were extracted during rendering.

        Returns list of image dicts with base64Data, altText, caption, and
        filename; empty if render() has not been called yet.
        """
        return getattr(self, '_renderedImages', [])