# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
"""
HTML renderer for report generation.
"""
from .documentRendererBaseTemplate import BaseRenderer
from modules.datamodels.datamodelDocument import RenderedDocument
from typing import Dict, Any, List, Optional
class RendererHtml(BaseRenderer):
"""Renders content to HTML format with format-specific extraction."""
@classmethod
def getSupportedFormats(cls) -> List[str]:
    """List the format identifiers this renderer produces."""
    htmlFormats = ['html', 'htm']
    return htmlFormats
@classmethod
def getFormatAliases(cls) -> List[str]:
    """List alternative names under which the HTML formats can be requested."""
    aliasNames = ['web', 'webpage']
    return aliasNames
@classmethod
def getPriority(cls) -> int:
    """Report the numeric priority assigned to the HTML renderer."""
    htmlPriority = 100
    return htmlPriority
@classmethod
def getOutputStyle(cls, formatName: Optional[str] = None) -> str:
    """Classify the renderer output; HTML pages always count as 'document'.

    Args:
        formatName: Accepted for interface compatibility; the result does not depend on it.
    """
    outputStyle = 'document'
    return outputStyle
@classmethod
def getAcceptedSectionTypes(cls, formatName: Optional[str] = None) -> List[str]:
    """
    Report which section content types the HTML renderer accepts.
    Every entry of supportedSectionTypes is accepted; a fresh list is returned on each call.

    Args:
        formatName: Accepted for interface compatibility; the result does not depend on it.
    """
    # Imported at call time, exactly as the original does.
    from modules.datamodels.datamodelJson import supportedSectionTypes
    acceptedTypes = [*supportedSectionTypes]
    return acceptedTypes
async def render(self, extractedContent: Dict[str, Any], title: str, userPrompt: str = None, aiService=None) -> List[RenderedDocument]:
    """
    Render the HTML page and emit every embedded image as its own file.

    Args:
        extractedContent: Structured JSON content ("metadata" and "documents" keys are read here)
        title: Document title, also used to derive the HTML filename when none is supplied
        userPrompt: Forwarded unchanged to the HTML generation step
        aiService: Forwarded unchanged to the HTML generation step

    Returns:
        List of documents: the HTML document first, followed by one document per image
        whose base64 payload could be decoded.
    """
    import base64

    # Collect the images up front and remember them for getRenderedImages().
    extractedImages = self._extractImages(extractedContent)
    self._renderedImages = extractedImages

    pageHtml = await self._generateHtmlFromJson(extractedContent, title, userPrompt, aiService)
    if extractedImages:
        # Point the page at the separate image files instead of inline data URIs.
        pageHtml = self._replaceImageDataUris(pageHtml, extractedImages)

    # Prefer the filename carried by the first document; otherwise derive one from the title.
    htmlFilename = None
    documentList = extractedContent.get("documents", [])
    if documentList and isinstance(documentList[0], dict):
        htmlFilename = documentList[0].get("filename")
    if not htmlFilename:
        htmlFilename = self._determineFilename(title, "text/html")

    # Metadata is only passed along when it really is a dict.
    metadata = extractedContent.get("metadata", {}) if extractedContent else {}
    if isinstance(metadata, dict):
        documentType = metadata.get("documentType")
        documentMetadata = metadata
    else:
        documentType = None
        documentMetadata = None

    htmlDocument = RenderedDocument(
        documentData=pageHtml.encode('utf-8'),
        mimeType="text/html",
        filename=htmlFilename,
        documentType=documentType,
        metadata=documentMetadata
    )
    resultDocuments = [htmlDocument]

    for imageInfo in extractedImages:
        encoded = imageInfo.get("base64Data", "")
        imageName = imageInfo.get("filename", f"image_{len(resultDocuments)}.png")
        imageMime = imageInfo.get("mimeType", "image/png")
        if not encoded:
            continue
        try:
            rawBytes = base64.b64decode(encoded)
            imageDocument = RenderedDocument(
                documentData=rawBytes,
                mimeType=imageMime,
                filename=imageName
            )
            resultDocuments.append(imageDocument)
            self.logger.debug(f"Added image file: {imageName} ({len(rawBytes)} bytes)")
        except Exception as e:
            # A single undecodable image must not abort the whole render.
            self.logger.warning(f"Error creating image file {imageName}: {str(e)}")

    return resultDocuments
async def _generateHtmlFromJson(self, jsonContent: Dict[str, Any], title: str, userPrompt: str = None, aiService=None) -> str:
"""Generate HTML content from structured JSON document using AI-generated styling.

The visible code resolves a style set via self._getStyleSet, raises ValueError
(inside the enclosing try) when self._validateJsonStructure rejects the input,
and prefers the title argument over metadata["title"], defaulting to
"Generated Document".
"""
try:
# Get style set: use styles from metadata if available, otherwise enhance with AI
styles = await self._getStyleSet(jsonContent, userPrompt, aiService)
# Validate JSON structure
if not self._validateJsonStructure(jsonContent):
raise ValueError("JSON content must follow standardized schema: {metadata: {...}, documents: [{sections: [...]}]}")
# Extract sections and metadata from standardized schema
sections = self._extractSections(jsonContent)
metadata = self._extractMetadata(jsonContent)
# Use provided title (which comes from documents[].title) as primary source
# Fallback to metadata.title only if title parameter is empty
documentTitle = title if title else metadata.get("title", "Generated Document")
# Build HTML document
htmlParts = []
# HTML document structure
# NOTE(review): the literals appended below are empty or unterminated, so the
# markup they once held appears to be missing; the rest of this method is not
# present in this view. Restore from version control before relying on it.
htmlParts.append('')
htmlParts.append('')
htmlParts.append('
')
if htmlParts:
return '\n'.join(htmlParts)
# NOTE(review): from here on the code reads sectionData, sectionType and section,
# none of which are defined in the visible part of this method. This looks like
# the tail of a separate section-rendering routine whose header is missing - confirm.
# If sectionData is not a list, treat it as a dict
if isinstance(sectionData, dict):
return self._renderJsonParagraph(sectionData, styles)
return ""
elif sectionType == "code_block":
# Work directly with elements like other renderers
# Only the first element of the list is rendered.
if isinstance(sectionData, list) and sectionData:
element = sectionData[0] if isinstance(sectionData[0], dict) else {}
return self._renderJsonCodeBlock(element, styles)
return ""
elif sectionType == "image":
# Work directly with elements like other renderers
# Only the first element of the list is rendered.
if isinstance(sectionData, list) and sectionData:
element = sectionData[0] if isinstance(sectionData[0], dict) else {}
return self._renderJsonImage(element, styles)
return ""
else:
# Fallback: Check for special element types first
if isinstance(sectionData, list):
htmlParts = []
for element in sectionData:
element_type = element.get("type", "") if isinstance(element, dict) else ""
if element_type == "reference":
# doc_ref is read but not used in the visible code.
doc_ref = element.get("documentReference", "")
label = element.get("label", "Reference")
htmlParts.append(f'
[Reference: {label}]
')
elif element_type == "extracted_text":
content = element.get("content", "")
source = element.get("source", "")
if content:
source_text = f' (Source: {source})' if source else ''
htmlParts.append(f'
{content}{source_text}
')
if htmlParts:
return '\n'.join(htmlParts)
# Fallback to paragraph for unknown types
if isinstance(sectionData, dict):
return self._renderJsonParagraph(sectionData, styles)
return ""
except Exception as e:
# Rendering problems are logged and reported inline instead of being raised.
self.logger.warning(f"Error rendering section {self._getSectionId(section)}: {str(e)}")
return f'
[Error rendering section: {str(e)}]
'
def _renderJsonTable(self, tableData: Dict[str, Any], styles: Dict[str, Any]) -> str:
"""Render a JSON table to HTML using AI-generated styles."""
try:
# Extract from nested content structure: element.content.{headers, rows}
content = tableData.get("content", {})
if not isinstance(content, dict):
return ""
headers = content.get("headers", [])
rows = content.get("rows", [])
if not headers or not rows:
return ""
htmlParts = ['
']
# Table header
htmlParts.append('
')
for header in headers:
htmlParts.append(f'
{header}
')
htmlParts.append('
')
# Table body
htmlParts.append('')
for row in rows:
htmlParts.append('
')
for cellData in row:
htmlParts.append(f'
{cellData}
')
htmlParts.append('
')
htmlParts.append('')
htmlParts.append('
')
return '\n'.join(htmlParts)
except Exception as e:
self.logger.warning(f"Error rendering table: {str(e)}")
return ""
def _renderJsonBulletList(self, listData: Dict[str, Any], styles: Dict[str, Any]) -> str:
"""Render a JSON bullet list to HTML using AI-generated styles."""
try:
# Extract from nested content structure: element.content.{items}
content = listData.get("content", {})
if not isinstance(content, dict):
return ""
items = content.get("items", [])
if not items:
return ""
htmlParts = ['
']
for item in items:
if isinstance(item, str):
htmlParts.append(f'
{item}
')
elif isinstance(item, dict) and "text" in item:
htmlParts.append(f'
{item["text"]}
')
htmlParts.append('
')
return '\n'.join(htmlParts)
except Exception as e:
self.logger.warning(f"Error rendering bullet list: {str(e)}")
return ""
def _renderJsonHeading(self, headingData: Dict[str, Any], styles: Dict[str, Any]) -> str:
"""Render a JSON heading to HTML using AI-generated styles."""
try:
# Extract from nested content structure: element.content.{text, level}
content = headingData.get("content", {})
if not isinstance(content, dict):
return ""
text = content.get("text", "")
level = content.get("level", 1)
if text:
level = max(1, min(6, level))
return f'{text}'
return ""
except Exception as e:
self.logger.warning(f"Error rendering heading: {str(e)}")
return ""
def _renderJsonParagraph(self, paragraphData: Dict[str, Any], styles: Dict[str, Any]) -> str:
"""Render a JSON paragraph to HTML using AI-generated styles."""
try:
# Normalize inputs - paragraphData is typically a list of elements from _getSectionData
if isinstance(paragraphData, list):
# Extract text from all paragraph elements (expects nested content structure)
texts = []
for el in paragraphData:
if isinstance(el, dict):
content = el.get("content", {})
if isinstance(content, dict):
text = content.get("text", "")
elif isinstance(content, str):
text = content
else:
text = ""
if text:
texts.append(text)
elif isinstance(el, str):
texts.append(el)
if texts:
# Join multiple paragraphs with
tags
return '\n'.join(f'
{text}
' for text in texts)
return ""
elif isinstance(paragraphData, str):
return f'
{paragraphData}
'
elif isinstance(paragraphData, dict):
# Handle nested content structure: element.content vs element.text
# Extract from nested content structure
content = paragraphData.get("content", {})
if isinstance(content, dict):
text = content.get("text", "")
elif isinstance(content, str):
text = content
else:
text = ""
if text:
return f'
{text}
'
return ""
else:
return ""
except Exception as e:
self.logger.warning(f"Error rendering paragraph: {str(e)}")
return ""
def _renderJsonCodeBlock(self, codeData: Dict[str, Any], styles: Dict[str, Any]) -> str:
"""Render a JSON code block to HTML using AI-generated styles."""
try:
# Extract from nested content structure: element.content.{code, language}
content = codeData.get("content", {})
if not isinstance(content, dict):
return ""
code = content.get("code", "")
language = content.get("language", "")
if code:
if language:
return f'
{code}
'
else:
return f'
{code}
'
return ""
except Exception as e:
self.logger.warning(f"Error rendering code block: {str(e)}")
return ""
def _renderJsonImage(self, imageData: Dict[str, Any], styles: Dict[str, Any]) -> str:
"""Render a JSON image to HTML with placeholder for later replacement. Expects nested content structure."""
try:
import html
# Extract from nested content structure (standard JSON format)
content = imageData.get("content", {})
if not isinstance(content, dict):
return ""
base64Data = content.get("base64Data", "")
altText = content.get("altText", "Image")
caption = content.get("caption", "")
# Escape HTML in altText and caption to prevent injection
altTextEscaped = html.escape(str(altText))
captionEscaped = html.escape(str(caption)) if caption else ""
if base64Data:
# Use data URI as placeholder - will be replaced with file path in _replaceImageDataUris
# Include a marker so we can find and replace it
imageMarker = f""
# Add max-width and max-height to ensure image fits within page dimensions
# Typical page width is ~800-1200px, height varies but we limit to 600px for readability
imgTag = f''
if captionEscaped:
return f'{imageMarker}{imgTag}{captionEscaped}'
else:
return f'{imageMarker}{imgTag}'
return ""
except Exception as e:
self.logger.error(f"Error embedding image in HTML: {str(e)}")
altText = imageData.get("altText", "Image")
errorMsg = html.escape(f"[Error: Could not embed image '{altText}'. {str(e)}]")
return f'
{errorMsg}
'
def _extractImages(self, jsonContent: Dict[str, Any]) -> List[Dict[str, Any]]:
"""
Extract all images from JSON structure.
Returns:
List of image data dictionaries with base64Data, altText, caption, sectionId
"""
images = []
try:
# Extract from standardized schema: {metadata: {...}, documents: [{sections: [...]}]}
documents = jsonContent.get("documents", [])
if not documents or not isinstance(documents, list):
return images
for doc in documents:
if not isinstance(doc, dict):
continue
sections = doc.get("sections", [])
for section in sections:
if section.get("content_type") == "image":
elements = section.get("elements", [])
for element in elements:
# Extract from nested content structure
content = element.get("content", {})
base64Data = ""
if isinstance(content, dict):
base64Data = content.get("base64Data", "")
elif isinstance(content, str):
# Content might be base64 string directly (shouldn't happen)
pass
# If base64Data not found in content, try direct element fields (fallback)
if not base64Data:
base64Data = element.get("base64Data", "")
# If base64Data still not found, try extracting from url data URI
if not base64Data:
url = element.get("url", "") or (content.get("url", "") if isinstance(content, dict) else "")
if url and isinstance(url, str) and url.startswith("data:image/"):
# Extract base64 from data URI: data:image/png;base64,
import re
match = re.match(r'data:image/[^;]+;base64,(.+)', url)
if match:
base64Data = match.group(1)
if base64Data:
sectionId = section.get("id", "unknown")
# Bestimme MIME-Type und Extension
mimeType = element.get("mimeType", "") or (content.get("mimeType", "") if isinstance(content, dict) else "")
if not mimeType or mimeType == "unknown":
# Versuche MIME-Type aus base64 zu erkennen
if base64Data.startswith("/9j/"):
mimeType = "image/jpeg"
elif base64Data.startswith("iVBORw0KGgo"):
mimeType = "image/png"
else:
mimeType = "image/png" # Default
# Bestimme Extension basierend auf MIME-Type
extension = "png"
if mimeType == "image/jpeg" or mimeType == "image/jpg":
extension = "jpg"
elif mimeType == "image/png":
extension = "png"
elif mimeType == "image/gif":
extension = "gif"
elif mimeType == "image/webp":
extension = "webp"
# Generate filename from section ID
filename = f"{sectionId}.{extension}"
# Clean filename (remove invalid characters)
filename = "".join(c if c.isalnum() or c in "._-" else "_" for c in filename)
images.append({
"base64Data": base64Data,
"altText": element.get("altText", "Image"),
"caption": element.get("caption"),
"sectionId": sectionId,
"filename": filename,
"mimeType": mimeType
})
self.logger.debug(f"Extracted image from section {sectionId}: {filename}")
self.logger.info(f"Extracted {len(images)} image(s) from JSON structure")
return images
except Exception as e:
self.logger.warning(f"Error extracting images: {str(e)}")
return []
def _replaceImageDataUris(self, htmlContent: str, images: List[Dict[str, Any]]) -> str:
"""
Replace base64 data URIs in HTML with relative file paths.
Args:
htmlContent: HTML content with data URIs
images: List of image data dictionaries
Returns:
HTML content with relative file paths
"""
try:
import base64
import re
# Find entire img tags with data URIs and replace them
# Pattern:
imgTagPattern = r']*>'
def replaceImgTag(match):
imgTag = match.group(0)
# Extract base64 data from the img tag
base64Match = re.search(r'data:image/[^;]+;base64,([A-Za-z0-9+/=]+)', imgTag)
if not base64Match:
return imgTag # Return original if no base64 found
base64Data = base64Match.group(1)
# Find matching image in images list
matchingImage = None
for img in images:
imgBase64 = img.get("base64Data", "")
# Vergleiche base64-Daten (kann unterschiedliche Längen haben durch Padding)
if imgBase64 == base64Data or imgBase64.startswith(base64Data[:100]) or base64Data.startswith(imgBase64[:100]):
matchingImage = img
break
if matchingImage:
import html
# Use filename from image data (generated from section ID)
filename = matchingImage.get("filename", f"image_{images.index(matchingImage) + 1}.png")
# Extract existing alt text or use from matchingImage
altMatch = re.search(r'alt="([^"]*)"', imgTag)
existingAlt = altMatch.group(1) if altMatch else ""
altText = html.escape(str(matchingImage.get("altText", existingAlt or "Image")))
caption = html.escape(str(matchingImage.get("caption", ""))) if matchingImage.get("caption") else ""
# Create new img tag with filename
imgTag = f''
if caption:
return f'{imgTag}{caption}'
else:
return imgTag
else:
# Keep original if no match found
return match.group(0)
# Replace all img tags with data URIs (auch IMAGE_MARKER Kommentare entfernen)
updatedHtml = re.sub(imgTagPattern, replaceImgTag, htmlContent)
# Entferne IMAGE_MARKER Kommentare die übrig geblieben sind
updatedHtml = re.sub(r'', '', updatedHtml)
return updatedHtml
except Exception as e:
self.logger.warning(f"Error replacing image data URIs: {str(e)}")
return htmlContent # Return original if replacement fails
def getRenderedImages(self) -> List[Dict[str, Any]]:
    """
    Return the images collected by the most recent render() call.
    Yields an empty list when nothing has been stored on this instance yet;
    otherwise the stored list itself (not a copy) is handed back.
    """
    return getattr(self, '_renderedImages', [])