681 lines
32 KiB
Python
681 lines
32 KiB
Python
# Copyright (c) 2025 Patrick Motsch
|
|
# All rights reserved.
|
|
"""
|
|
HTML renderer for report generation.
|
|
"""
|
|
|
|
import html
import re
from typing import Dict, Any, Tuple, List, Optional

from .rendererBaseTemplate import BaseRenderer
|
|
|
|
class RendererHtml(BaseRenderer):
|
|
"""Renders content to HTML format with format-specific extraction."""
|
|
|
|
@classmethod
|
|
def getSupportedFormats(cls) -> List[str]:
|
|
"""Return supported HTML formats."""
|
|
return ['html', 'htm']
|
|
|
|
@classmethod
|
|
def getFormatAliases(cls) -> List[str]:
|
|
"""Return format aliases."""
|
|
return ['web', 'webpage']
|
|
|
|
@classmethod
|
|
def getPriority(cls) -> int:
|
|
"""Return priority for HTML renderer."""
|
|
return 100
|
|
|
|
async def render(self, extractedContent: Dict[str, Any], title: str, userPrompt: Optional[str] = None, aiService=None) -> Tuple[str, str]:
    """Render extracted JSON content to an HTML document.

    Images are extracted first and stored on the instance (``_renderedImages``)
    so that ``getRenderedImages()`` can return them afterwards. If any images
    were found, their base64 data URIs in the generated HTML are replaced via
    ``_replaceImageDataUris``.

    Args:
        extractedContent: Structured JSON document to render.
        title: Fallback document title.
        userPrompt: Optional user prompt forwarded to style generation.
        aiService: Optional AI service forwarded to style generation.

    Returns:
        Tuple of (HTML string, MIME type ``"text/html"``). On any error a
        minimal fallback page containing the title and the error message is
        returned instead of raising.
    """
    try:
        images = self._extractImages(extractedContent)

        # Keep the images on the instance for later retrieval.
        self._renderedImages = images

        htmlContent = await self._generateHtmlFromJson(extractedContent, title, userPrompt, aiService)

        if images:
            htmlContent = self._replaceImageDataUris(htmlContent, images)

        return htmlContent, "text/html"

    except Exception as e:
        self.logger.error(f"Error rendering HTML: {str(e)}")
        self._renderedImages = []  # No images are available after a failed render
        # Title and exception text are arbitrary strings; escape them so the
        # fallback page stays well-formed and cannot inject markup.
        safeTitle = html.escape(str(title))
        safeError = html.escape(str(e))
        fallbackHtml = (
            f"<html><head><title>{safeTitle}</title></head>"
            f"<body><h1>{safeTitle}</h1><p>Error rendering report: {safeError}</p></body></html>"
        )
        return fallbackHtml, "text/html"
|
|
|
|
async def _generateHtmlFromJson(self, jsonContent: Dict[str, Any], title: str, userPrompt: Optional[str] = None, aiService=None) -> str:
    """Build the complete HTML document from a structured JSON document.

    The style set comes from ``_getStyleSet``; sections and metadata are read
    through ``_extractSections`` / ``_extractMetadata`` after the structure
    passed ``_validateJsonStructure``.

    Args:
        jsonContent: Document in the standardized schema
            ``{metadata: {...}, documents: [{sections: [...]}]}``.
        title: Title used when the metadata carries no usable title.
        userPrompt: Optional user prompt forwarded to ``_getStyleSet``.
        aiService: Optional AI service forwarded to ``_getStyleSet``.

    Returns:
        The HTML document as a single newline-joined string.

    Raises:
        Exception: Wraps any failure (including schema validation) with the
            message ``HTML generation failed: ...``; the original exception
            is attached as ``__cause__``.
    """
    try:
        styles = await self._getStyleSet(userPrompt, aiService)

        if not self._validateJsonStructure(jsonContent):
            raise ValueError("JSON content must follow standardized schema: {metadata: {...}, documents: [{sections: [...]}]}")

        sections = self._extractSections(jsonContent)
        metadata = self._extractMetadata(jsonContent)

        # Prefer the metadata title; an empty or None value counts as missing.
        documentTitle = metadata.get("title") or title
        # The title is interpolated into markup twice, so escape it once here.
        safeTitle = html.escape(str(documentTitle))

        htmlParts = [
            '<!DOCTYPE html>',
            '<html lang="en">',
            '<head>',
            '<meta charset="UTF-8">',
            '<meta name="viewport" content="width=device-width, initial-scale=1.0">',
            f'<title>{safeTitle}</title>',
            '<style>',
            self._generateCssStyles(styles),
            '</style>',
            '</head>',
            '<body>',
            f'<header><h1 class="document-title">{safeTitle}</h1></header>',
            '<main>',
        ]

        # Sections that render to an empty string are left out entirely.
        for section in sections:
            sectionHtml = self._renderJsonSection(section, styles)
            if sectionHtml:
                htmlParts.append(sectionHtml)

        htmlParts.extend([
            '</main>',
            '<footer>',
            f'<p class="generated-info">Generated: {self._formatTimestamp()}</p>',
            '</footer>',
            '</body>',
            '</html>',
        ])

        return '\n'.join(htmlParts)

    except Exception as e:
        self.logger.error(f"Error generating HTML from JSON: {str(e)}")
        # Chain the cause so the original traceback is not lost.
        raise Exception(f"HTML generation failed: {str(e)}") from e
|
|
|
|
async def _getStyleSet(self, userPrompt: str = None, aiService=None, templateName: str = None) -> Dict[str, Any]:
    """Resolve the style set used for rendering.

    Starts from the default style set and, only when both a user prompt and
    an AI service are given, lets the AI enhance it and then runs the
    contrast validation on the result.

    Args:
        userPrompt: User's prompt; style instructions in it are left to the AI.
        aiService: AI service, used only together with ``userPrompt``.
        templateName: Name of a template style set. Not read by this method.

    Returns:
        Dict with style definitions for all document styles.
    """
    baseStyles = self._getDefaultStyleSet()

    # Without both a prompt and a service there is nothing to enhance.
    if not (userPrompt and aiService):
        return baseStyles

    self.logger.info("Enhancing styles with AI based on user prompt...")
    aiStyles = await self._enhanceStylesWithAI(userPrompt, baseStyles, aiService)
    return self._validateStylesContrast(aiStyles)
|
|
|
|
async def _enhanceStylesWithAI(self, userPrompt: str, defaultStyleSet: Dict[str, Any], aiService) -> Dict[str, Any]:
    """Ask the AI service for an enhanced version of the default styles.

    Builds an "html" style template from the prompt and the defaults and
    passes it to ``_getAiStyles``. Any failure is logged as a warning and
    the unchanged ``defaultStyleSet`` is returned instead.
    """
    try:
        aiTemplate = self._createAiStyleTemplate("html", userPrompt, defaultStyleSet)
        return await self._getAiStyles(aiService, aiTemplate, defaultStyleSet)
    except Exception as err:
        self.logger.warning(f"AI style enhancement failed: {str(err)}, using default styles")
        return defaultStyleSet
|
|
|
|
def _validateStylesContrast(self, styles: Dict[str, Any]) -> Dict[str, Any]:
|
|
"""Validate and fix contrast issues in AI-generated styles."""
|
|
try:
|
|
# Fix table header contrast
|
|
if "table_header" in styles:
|
|
header = styles["table_header"]
|
|
bgColor = header.get("background", "#FFFFFF")
|
|
textColor = header.get("color", "#000000")
|
|
|
|
# If both are white or both are dark, fix it
|
|
if bgColor.upper() == "#FFFFFF" and textColor.upper() == "#FFFFFF":
|
|
header["background"] = "#4F4F4F"
|
|
header["color"] = "#FFFFFF"
|
|
elif bgColor.upper() == "#000000" and textColor.upper() == "#000000":
|
|
header["background"] = "#4F4F4F"
|
|
header["color"] = "#FFFFFF"
|
|
|
|
# Fix table cell contrast
|
|
if "table_cell" in styles:
|
|
cell = styles["table_cell"]
|
|
bgColor = cell.get("background", "#FFFFFF")
|
|
textColor = cell.get("color", "#000000")
|
|
|
|
# If both are white or both are dark, fix it
|
|
if bgColor.upper() == "#FFFFFF" and textColor.upper() == "#FFFFFF":
|
|
cell["background"] = "#FFFFFF"
|
|
cell["color"] = "#2F2F2F"
|
|
elif bgColor.upper() == "#000000" and textColor.upper() == "#000000":
|
|
cell["background"] = "#FFFFFF"
|
|
cell["color"] = "#2F2F2F"
|
|
|
|
return styles
|
|
|
|
except Exception as e:
|
|
self.logger.warning(f"Style validation failed: {str(e)}")
|
|
return self._getDefaultStyleSet()
|
|
|
|
def _getDefaultStyleSet(self) -> Dict[str, Any]:
|
|
"""Default HTML style set - used when no style instructions present."""
|
|
return {
|
|
"title": {"font_size": "2.5em", "color": "#1F4E79", "font_weight": "bold", "text_align": "center", "margin": "0 0 1em 0"},
|
|
"heading1": {"font_size": "2em", "color": "#2F2F2F", "font_weight": "bold", "text_align": "left", "margin": "1.5em 0 0.5em 0"},
|
|
"heading2": {"font_size": "1.5em", "color": "#4F4F4F", "font_weight": "bold", "text_align": "left", "margin": "1em 0 0.5em 0"},
|
|
"paragraph": {"font_size": "1em", "color": "#2F2F2F", "font_weight": "normal", "text_align": "left", "margin": "0 0 1em 0", "line_height": "1.6"},
|
|
"table": {"border": "1px solid #ddd", "border_collapse": "collapse", "width": "100%", "margin": "1em 0"},
|
|
"table_header": {"background": "#4F4F4F", "color": "#FFFFFF", "font_weight": "bold", "text_align": "center", "padding": "12px"},
|
|
"table_cell": {"background": "#FFFFFF", "color": "#2F2F2F", "font_weight": "normal", "text_align": "left", "padding": "8px", "border": "1px solid #ddd"},
|
|
"bullet_list": {"font_size": "1em", "color": "#2F2F2F", "margin": "0 0 1em 0", "padding_left": "20px"},
|
|
"code_block": {"font_family": "Courier New, monospace", "font_size": "0.9em", "color": "#2F2F2F", "background": "#F5F5F5", "padding": "1em", "border": "1px solid #ddd", "border_radius": "4px", "margin": "1em 0"},
|
|
"image": {"max_width": "100%", "height": "auto", "margin": "1em 0", "border_radius": "4px"},
|
|
"body": {"font_family": "Arial, sans-serif", "background": "#FFFFFF", "color": "#2F2F2F", "margin": "0", "padding": "20px"}
|
|
}
|
|
|
|
|
|
def _generateCssStyles(self, styles: Dict[str, Any]) -> str:
|
|
"""Generate CSS from style definitions."""
|
|
css_parts = []
|
|
|
|
# Body styles
|
|
body_style = styles.get("body", {})
|
|
css_parts.append("body {")
|
|
for property_name, value in body_style.items():
|
|
css_property = property_name.replace("_", "-")
|
|
css_parts.append(f" {css_property}: {value};")
|
|
css_parts.append("}")
|
|
|
|
# Document title
|
|
title_style = styles.get("title", {})
|
|
css_parts.append(".document-title {")
|
|
for property_name, value in title_style.items():
|
|
css_property = property_name.replace("_", "-")
|
|
css_parts.append(f" {css_property}: {value};")
|
|
css_parts.append("}")
|
|
|
|
# Headings
|
|
for heading_level in ["heading1", "heading2"]:
|
|
heading_style = styles.get(heading_level, {})
|
|
css_class = f"h{heading_level[-1]}"
|
|
css_parts.append(f"{css_class} {{")
|
|
for property_name, value in heading_style.items():
|
|
css_property = property_name.replace("_", "-")
|
|
css_parts.append(f" {css_property}: {value};")
|
|
css_parts.append("}")
|
|
|
|
# Paragraphs
|
|
paragraph_style = styles.get("paragraph", {})
|
|
css_parts.append("p {")
|
|
for property_name, value in paragraph_style.items():
|
|
css_property = property_name.replace("_", "-")
|
|
css_parts.append(f" {css_property}: {value};")
|
|
css_parts.append("}")
|
|
|
|
# Tables
|
|
table_style = styles.get("table", {})
|
|
css_parts.append("table {")
|
|
for property_name, value in table_style.items():
|
|
css_property = property_name.replace("_", "-")
|
|
css_parts.append(f" {css_property}: {value};")
|
|
css_parts.append("}")
|
|
|
|
# Table headers
|
|
table_header_style = styles.get("table_header", {})
|
|
css_parts.append("th {")
|
|
for property_name, value in table_header_style.items():
|
|
css_property = property_name.replace("_", "-")
|
|
css_parts.append(f" {css_property}: {value};")
|
|
css_parts.append("}")
|
|
|
|
# Table cells
|
|
table_cell_style = styles.get("table_cell", {})
|
|
css_parts.append("td {")
|
|
for property_name, value in table_cell_style.items():
|
|
css_property = property_name.replace("_", "-")
|
|
css_parts.append(f" {css_property}: {value};")
|
|
css_parts.append("}")
|
|
|
|
# Lists
|
|
bullet_list_style = styles.get("bullet_list", {})
|
|
css_parts.append("ul {")
|
|
for property_name, value in bullet_list_style.items():
|
|
css_property = property_name.replace("_", "-")
|
|
css_parts.append(f" {css_property}: {value};")
|
|
css_parts.append("}")
|
|
|
|
# Code blocks
|
|
code_block_style = styles.get("code_block", {})
|
|
css_parts.append("pre {")
|
|
for property_name, value in code_block_style.items():
|
|
css_property = property_name.replace("_", "-")
|
|
css_parts.append(f" {css_property}: {value};")
|
|
css_parts.append("}")
|
|
|
|
# Images
|
|
image_style = styles.get("image", {})
|
|
css_parts.append("img {")
|
|
for property_name, value in image_style.items():
|
|
css_property = property_name.replace("_", "-")
|
|
css_parts.append(f" {css_property}: {value};")
|
|
css_parts.append("}")
|
|
|
|
# Generated info
|
|
css_parts.append(".generated-info {")
|
|
css_parts.append(" font-size: 0.9em;")
|
|
css_parts.append(" color: #666;")
|
|
css_parts.append(" text-align: center;")
|
|
css_parts.append(" margin-top: 2em;")
|
|
css_parts.append(" padding-top: 1em;")
|
|
css_parts.append(" border-top: 1px solid #ddd;")
|
|
css_parts.append("}")
|
|
|
|
return '\n'.join(css_parts)
|
|
|
|
def _renderJsonSection(self, section: Dict[str, Any], styles: Dict[str, Any]) -> str:
    """Render a single JSON section to HTML.

    Supports three content formats: reference, object (base64), extracted_text.
    IMPORTANT: the section type (content_type) is honoured first; elements
    are then processed according to it rather than by element type alone.

    Dispatch:
        * table / bullet_list / code_block / image: the section is passed
          through ``_processSectionByType`` and handed to the matching
          ``_renderJson*`` method.
        * heading: a heading text is taken from the elements and rendered
          at level 2; otherwise the raw section data goes to
          ``_renderJsonHeading``.
        * paragraph and unknown types: ``reference`` and ``extracted_text``
          elements are rendered inline (paragraph sections also render
          plain text elements); if nothing was produced the data goes to
          ``_renderJsonParagraph``.

    All text taken from the section is HTML-escaped before it is placed in
    markup.

    Returns:
        The section's HTML, or an error ``<div>`` if rendering raised.
    """
    try:
        sectionType = self._getSectionType(section)
        sectionData = self._getSectionData(section)

        # Section types whose data is first processed by _processSectionByType.
        structuredRenderers = {
            "table": self._renderJsonTable,
            "bullet_list": self._renderJsonBulletList,
            "code_block": self._renderJsonCodeBlock,
            "image": self._renderJsonImage,
        }
        if isinstance(sectionType, str) and sectionType in structuredRenderers:
            processedData = self._processSectionByType(section)
            return structuredRenderers[sectionType](processedData, styles)

        if sectionType == "heading":
            if isinstance(sectionData, list):
                headingText = self._extractHeadingText(sectionData)
                if headingText:
                    return self._renderJsonHeading({"text": headingText, "level": 2}, styles)
            return self._renderJsonHeading(sectionData, styles)

        # Paragraph sections and unknown section types share the handling of
        # reference / extracted_text elements.
        if isinstance(sectionData, list):
            htmlParts = []
            for element in sectionData:
                elementType = element.get("type", "") if isinstance(element, dict) else ""

                if elementType in ("reference", "extracted_text"):
                    rendered = self._renderSpecialElement(element)
                    if rendered:
                        htmlParts.append(rendered)
                elif sectionType == "paragraph":
                    # Only paragraph sections render generic elements inline.
                    if isinstance(element, dict):
                        text = element.get("text", element.get("content", ""))
                        if text:
                            htmlParts.append(f'<p>{html.escape(str(text))}</p>')
                    elif isinstance(element, str):
                        htmlParts.append(f'<p>{html.escape(element)}</p>')

            if htmlParts:
                return '\n'.join(htmlParts)

        # Fallback to paragraph rendering when nothing was produced above.
        return self._renderJsonParagraph(sectionData, styles)

    except Exception as e:
        self.logger.warning(f"Error rendering section {self._getSectionId(section)}: {str(e)}")
        return f'<div class="error">[Error rendering section: {html.escape(str(e))}]</div>'

def _extractHeadingText(self, elements: List[Any]) -> str:
    """Return the heading text from the first dict element that provides one.

    A ``heading`` element yields its ``content`` (or ``text``); an
    ``extracted_text`` element with content yields its first line with
    ``#`` and ``**`` markdown markers removed; any other dict with a
    ``text`` key yields that text. Returns "" when nothing matches.
    """
    for element in elements:
        if not isinstance(element, dict):
            continue

        elementType = element.get("type", "")
        if elementType == "heading":
            return element.get("content", element.get("text", ""))
        if elementType == "extracted_text":
            content = element.get("content", "")
            if content:
                firstLine = content.split('\n')[0].strip()
                return firstLine.replace('#', '').replace('**', '').strip()
        elif "text" in element:
            return element.get("text", "")

    return ""

def _renderSpecialElement(self, element: Dict[str, Any]) -> str:
    """Render a ``reference`` or ``extracted_text`` element as an escaped paragraph.

    Returns "" for an ``extracted_text`` element without content.
    """
    if element.get("type") == "reference":
        label = html.escape(str(element.get("label", "Reference")))
        return f'<p class="reference"><em>[Reference: {label}]</em></p>'

    content = element.get("content", "")
    if not content:
        return ""

    source = element.get("source", "")
    sourceHtml = f' <small><em>(Source: {html.escape(str(source))})</em></small>' if source else ''
    return f'<p class="extracted-text">{html.escape(str(content))}{sourceHtml}</p>'
|
|
|
|
def _renderJsonTable(self, tableData: Dict[str, Any], styles: Dict[str, Any]) -> str:
|
|
"""Render a JSON table to HTML using AI-generated styles."""
|
|
try:
|
|
headers = tableData.get("headers", [])
|
|
rows = tableData.get("rows", [])
|
|
|
|
if not headers or not rows:
|
|
return ""
|
|
|
|
htmlParts = ['<table>']
|
|
|
|
# Table header
|
|
htmlParts.append('<thead><tr>')
|
|
for header in headers:
|
|
htmlParts.append(f'<th>{header}</th>')
|
|
htmlParts.append('</tr></thead>')
|
|
|
|
# Table body
|
|
htmlParts.append('<tbody>')
|
|
for row in rows:
|
|
htmlParts.append('<tr>')
|
|
for cellData in row:
|
|
htmlParts.append(f'<td>{cellData}</td>')
|
|
htmlParts.append('</tr>')
|
|
htmlParts.append('</tbody>')
|
|
|
|
htmlParts.append('</table>')
|
|
return '\n'.join(htmlParts)
|
|
|
|
except Exception as e:
|
|
self.logger.warning(f"Error rendering table: {str(e)}")
|
|
return ""
|
|
|
|
def _renderJsonBulletList(self, listData: Dict[str, Any], styles: Dict[str, Any]) -> str:
|
|
"""Render a JSON bullet list to HTML using AI-generated styles."""
|
|
try:
|
|
items = listData.get("items", [])
|
|
|
|
if not items:
|
|
return ""
|
|
|
|
htmlParts = ['<ul>']
|
|
for item in items:
|
|
if isinstance(item, str):
|
|
htmlParts.append(f'<li>{item}</li>')
|
|
elif isinstance(item, dict) and "text" in item:
|
|
htmlParts.append(f'<li>{item["text"]}</li>')
|
|
htmlParts.append('</ul>')
|
|
|
|
return '\n'.join(htmlParts)
|
|
|
|
except Exception as e:
|
|
self.logger.warning(f"Error rendering bullet list: {str(e)}")
|
|
return ""
|
|
|
|
def _renderJsonHeading(self, headingData: Dict[str, Any], styles: Dict[str, Any]) -> str:
|
|
"""Render a JSON heading to HTML using AI-generated styles."""
|
|
try:
|
|
# Normalize inputs - headingData is typically a list of elements from _getSectionData
|
|
if isinstance(headingData, list):
|
|
# Extract first element from elements array
|
|
if headingData and len(headingData) > 0:
|
|
headingData = headingData[0] if isinstance(headingData[0], dict) else {}
|
|
else:
|
|
return ""
|
|
elif isinstance(headingData, str):
|
|
headingData = {"text": headingData, "level": 2}
|
|
elif not isinstance(headingData, dict):
|
|
return ""
|
|
|
|
level = headingData.get("level", 1)
|
|
text = headingData.get("text", "")
|
|
|
|
if text:
|
|
level = max(1, min(6, level))
|
|
return f'<h{level}>{text}</h{level}>'
|
|
|
|
return ""
|
|
|
|
except Exception as e:
|
|
self.logger.warning(f"Error rendering heading: {str(e)}")
|
|
return ""
|
|
|
|
def _renderJsonParagraph(self, paragraphData: Dict[str, Any], styles: Dict[str, Any]) -> str:
|
|
"""Render a JSON paragraph to HTML using AI-generated styles."""
|
|
try:
|
|
# Normalize inputs - paragraphData is typically a list of elements from _getSectionData
|
|
if isinstance(paragraphData, list):
|
|
# Extract text from all paragraph elements
|
|
texts = []
|
|
for el in paragraphData:
|
|
if isinstance(el, dict) and "text" in el:
|
|
texts.append(el["text"])
|
|
elif isinstance(el, str):
|
|
texts.append(el)
|
|
if texts:
|
|
# Join multiple paragraphs with <p> tags
|
|
return '\n'.join(f'<p>{text}</p>' for text in texts)
|
|
return ""
|
|
elif isinstance(paragraphData, str):
|
|
return f'<p>{paragraphData}</p>'
|
|
elif isinstance(paragraphData, dict):
|
|
text = paragraphData.get("text", "")
|
|
if text:
|
|
return f'<p>{text}</p>'
|
|
return ""
|
|
else:
|
|
return ""
|
|
|
|
except Exception as e:
|
|
self.logger.warning(f"Error rendering paragraph: {str(e)}")
|
|
return ""
|
|
|
|
def _renderJsonCodeBlock(self, codeData: Dict[str, Any], styles: Dict[str, Any]) -> str:
|
|
"""Render a JSON code block to HTML using AI-generated styles."""
|
|
try:
|
|
code = codeData.get("code", "")
|
|
language = codeData.get("language", "")
|
|
|
|
if code:
|
|
if language:
|
|
return f'<pre><code class="language-{language}">{code}</code></pre>'
|
|
else:
|
|
return f'<pre><code>{code}</code></pre>'
|
|
|
|
return ""
|
|
|
|
except Exception as e:
|
|
self.logger.warning(f"Error rendering code block: {str(e)}")
|
|
return ""
|
|
|
|
def _renderJsonImage(self, imageData: Dict[str, Any], styles: Dict[str, Any]) -> str:
|
|
"""Render a JSON image to HTML with placeholder for later replacement."""
|
|
try:
|
|
base64Data = imageData.get("base64Data", "")
|
|
altText = imageData.get("altText", "Image")
|
|
caption = imageData.get("caption", "")
|
|
|
|
if base64Data:
|
|
# Use data URI as placeholder - will be replaced with file path in _replaceImageDataUris
|
|
# Include a marker so we can find and replace it
|
|
imageMarker = f"<!--IMAGE_MARKER:{len(base64Data)}:{altText[:50]}-->"
|
|
imgTag = f'<img src="data:image/png;base64,{base64Data}" alt="{altText}">'
|
|
|
|
if caption:
|
|
return f'{imageMarker}<figure>{imgTag}<figcaption>{caption}</figcaption></figure>'
|
|
else:
|
|
return f'{imageMarker}{imgTag}'
|
|
|
|
return ""
|
|
|
|
except Exception as e:
|
|
self.logger.warning(f"Error rendering image: {str(e)}")
|
|
return f'<div class="error">[Image: {imageData.get("altText", "Image")}]</div>'
|
|
|
|
def _extractImages(self, jsonContent: Dict[str, Any]) -> List[Dict[str, Any]]:
|
|
"""
|
|
Extract all images from JSON structure.
|
|
|
|
Returns:
|
|
List of image data dictionaries with base64Data, altText, caption, sectionId
|
|
"""
|
|
images = []
|
|
|
|
try:
|
|
# Extract from standardized schema: {metadata: {...}, documents: [{sections: [...]}]}
|
|
documents = jsonContent.get("documents", [])
|
|
if not documents or not isinstance(documents, list):
|
|
return images
|
|
|
|
for doc in documents:
|
|
if not isinstance(doc, dict):
|
|
continue
|
|
sections = doc.get("sections", [])
|
|
for section in sections:
|
|
if section.get("content_type") == "image":
|
|
elements = section.get("elements", [])
|
|
for element in elements:
|
|
base64Data = element.get("base64Data", "")
|
|
|
|
# If base64Data not found, try extracting from url data URI
|
|
if not base64Data:
|
|
url = element.get("url", "")
|
|
if url.startswith("data:image/"):
|
|
# Extract base64 from data URI: data:image/png;base64,<base64>
|
|
import re
|
|
match = re.match(r'data:image/[^;]+;base64,(.+)', url)
|
|
if match:
|
|
base64Data = match.group(1)
|
|
|
|
if base64Data:
|
|
sectionId = section.get("id", "unknown")
|
|
# Generate filename from section ID
|
|
filename = f"{sectionId}.png"
|
|
# Clean filename (remove invalid characters)
|
|
filename = "".join(c if c.isalnum() or c in "._-" else "_" for c in filename)
|
|
|
|
images.append({
|
|
"base64Data": base64Data,
|
|
"altText": element.get("altText", "Image"),
|
|
"caption": element.get("caption"),
|
|
"sectionId": sectionId,
|
|
"filename": filename
|
|
})
|
|
self.logger.debug(f"Extracted image from section {sectionId}: {filename}")
|
|
|
|
self.logger.info(f"Extracted {len(images)} image(s) from JSON structure")
|
|
return images
|
|
|
|
except Exception as e:
|
|
self.logger.warning(f"Error extracting images: {str(e)}")
|
|
return []
|
|
|
|
def _replaceImageDataUris(self, htmlContent: str, images: List[Dict[str, Any]]) -> str:
|
|
"""
|
|
Replace base64 data URIs in HTML with relative file paths.
|
|
|
|
Args:
|
|
htmlContent: HTML content with data URIs
|
|
images: List of image data dictionaries
|
|
|
|
Returns:
|
|
HTML content with relative file paths
|
|
"""
|
|
try:
|
|
import base64
|
|
import re
|
|
|
|
# Find all image data URIs in HTML
|
|
dataUriPattern = r'data:image/png;base64,([A-Za-z0-9+/=]+)'
|
|
|
|
def replaceDataUri(match):
|
|
base64Data = match.group(1)
|
|
|
|
# Find matching image in images list
|
|
matchingImage = None
|
|
for img in images:
|
|
if img["base64Data"] == base64Data or img["base64Data"].startswith(base64Data[:100]):
|
|
matchingImage = img
|
|
break
|
|
|
|
if matchingImage:
|
|
# Use filename from image data (generated from section ID)
|
|
filename = matchingImage.get("filename", f"image_{images.index(matchingImage) + 1}.png")
|
|
|
|
# Replace with relative path
|
|
altText = matchingImage.get("altText", "Image")
|
|
caption = matchingImage.get("caption", "")
|
|
|
|
if caption:
|
|
return f'<figure><img src="{filename}" alt="{altText}"><figcaption>{caption}</figcaption></figure>'
|
|
else:
|
|
return f'<img src="{filename}" alt="{altText}">'
|
|
else:
|
|
# Keep original if no match found
|
|
return match.group(0)
|
|
|
|
# Replace all data URIs
|
|
updatedHtml = re.sub(dataUriPattern, replaceDataUri, htmlContent)
|
|
|
|
return updatedHtml
|
|
|
|
except Exception as e:
|
|
self.logger.warning(f"Error replacing image data URIs: {str(e)}")
|
|
return htmlContent # Return original if replacement fails
|
|
|
|
def getRenderedImages(self) -> List[Dict[str, Any]]:
    """Return the images collected by the most recent ``render`` call.

    Returns:
        The stored list of image dicts (base64Data, altText, caption and
        filename), or an empty list if nothing has been stored on this
        instance yet.
    """
    return getattr(self, '_renderedImages', [])
|