')
if htmlParts:
return '\n'.join(htmlParts)
return self._renderJsonParagraph(sectionData, styles)
elif sectionType == "code_block":
# Process the section data to extract code block structure
processedData = self._processSectionByType(section)
return self._renderJsonCodeBlock(processedData, styles)
elif sectionType == "image":
# Process the section data to extract image structure
processedData = self._processSectionByType(section)
return self._renderJsonImage(processedData, styles)
else:
# Fallback: Check for special element types first
if isinstance(sectionData, list):
htmlParts = []
for element in sectionData:
element_type = element.get("type", "") if isinstance(element, dict) else ""
if element_type == "reference":
doc_ref = element.get("documentReference", "")
label = element.get("label", "Reference")
htmlParts.append(f'
[Reference: {label}]
')
elif element_type == "extracted_text":
content = element.get("content", "")
source = element.get("source", "")
if content:
source_text = f' (Source: {source})' if source else ''
htmlParts.append(f'
{content}{source_text}
')
if htmlParts:
return '\n'.join(htmlParts)
# Fallback to paragraph for unknown types
return self._renderJsonParagraph(sectionData, styles)
except Exception as e:
self.logger.warning(f"Error rendering section {self._getSectionId(section)}: {str(e)}")
return f'
[Error rendering section: {str(e)}]
'
def _renderJsonTable(self, tableData: Dict[str, Any], styles: Dict[str, Any]) -> str:
"""
Render a JSON table to HTML using AI-generated styles.

Reads the "headers" and "rows" entries of ``tableData`` and collects the
output as a list of string fragments that is joined with newlines.

Args:
    tableData: Mapping read via ``.get("headers")`` and ``.get("rows")``;
        each row is iterated cell by cell.
    styles: Style definitions. NOTE(review): not referenced in the body as
        seen here - confirm whether the markup literals are meant to use it.

Returns:
    The joined fragments, or "" when headers or rows are empty, or when any
    exception is raised while rendering (the error is logged as a warning).

NOTE(review): the markup string literals below contain no tags as written -
confirm they are intact in the file. Header and cell values are interpolated
without html.escape (unlike _renderJsonImage) - confirm inputs are trusted.
"""
try:
headers = tableData.get("headers", [])
rows = tableData.get("rows", [])
# A table needs both a header row and at least one data row; otherwise emit nothing
if not headers or not rows:
return ""
htmlParts = ['
']
# Table header
htmlParts.append('
')
# One fragment per header value
for header in headers:
htmlParts.append(f'
{header}
')
htmlParts.append('
')
# Table body
htmlParts.append('')
for row in rows:
htmlParts.append('
')
# One fragment per cell of the current row
for cellData in row:
htmlParts.append(f'
{cellData}
')
htmlParts.append('
')
htmlParts.append('')
htmlParts.append('
')
return '\n'.join(htmlParts)
except Exception as e:
# Best effort: a broken table is logged and rendered as nothing
self.logger.warning(f"Error rendering table: {str(e)}")
return ""
def _renderJsonBulletList(self, listData: Dict[str, Any], styles: Dict[str, Any]) -> str:
"""
Render a JSON bullet list to HTML using AI-generated styles.

Args:
    listData: Mapping whose "items" entry is iterated; each item is either
        a plain string or a dict carrying a "text" key.
    styles: Style definitions. NOTE(review): not referenced in the body as
        seen here - confirm.

Returns:
    The list fragments joined with newlines, or "" when there are no items
    or when any exception is raised (the error is logged as a warning).

NOTE(review): the markup string literals below contain no tags as written -
confirm they are intact in the file. Item text is interpolated without
html.escape - confirm inputs are trusted.
"""
try:
items = listData.get("items", [])
if not items:
return ""
htmlParts = ['
']
for item in items:
# Items that are neither a string nor a dict with "text" are skipped silently
if isinstance(item, str):
htmlParts.append(f'
{item}
')
elif isinstance(item, dict) and "text" in item:
htmlParts.append(f'
{item["text"]}
')
htmlParts.append('
')
return '\n'.join(htmlParts)
except Exception as e:
# Best effort: a broken list is logged and rendered as nothing
self.logger.warning(f"Error rendering bullet list: {str(e)}")
return ""
def _renderJsonHeading(self, headingData: Dict[str, Any], styles: Dict[str, Any]) -> str:
"""
Render a JSON heading to HTML using AI-generated styles.

Args:
    headingData: Annotated as a dict, but the body also accepts a list
        (only its first element is used, and only if that is a dict) or a
        plain string (treated as the heading text with level 2).
    styles: Style definitions. NOTE(review): not referenced in the body as
        seen here - confirm.

Returns:
    The rendered heading, or "" when the input has an unsupported type, is
    an empty list, has no text, or when any exception is raised (logged as
    a warning).

NOTE(review): ``level`` is clamped to 1..6 but does not appear in the
returned f-string as seen here - confirm the heading tag is intact in the
file. A non-numeric level (e.g. "2") would raise inside max/min and so
yield "" through the except branch.
"""
try:
# Normalize inputs - headingData is typically a list of elements from _getSectionData
if isinstance(headingData, list):
# Extract first element from elements array
if headingData and len(headingData) > 0:
# A non-dict first element becomes {}, which later yields "" (no text)
headingData = headingData[0] if isinstance(headingData[0], dict) else {}
else:
return ""
elif isinstance(headingData, str):
# Bare strings default to level 2, whereas dicts without "level" default to 1 below
headingData = {"text": headingData, "level": 2}
elif not isinstance(headingData, dict):
return ""
level = headingData.get("level", 1)
text = headingData.get("text", "")
if text:
# Clamp to the range 1..6
level = max(1, min(6, level))
return f'{text}'
return ""
except Exception as e:
# Best effort: a broken heading is logged and rendered as nothing
self.logger.warning(f"Error rendering heading: {str(e)}")
return ""
def _renderJsonParagraph(self, paragraphData: Dict[str, Any], styles: Dict[str, Any]) -> str:
"""
Render a JSON paragraph to HTML using AI-generated styles.

Args:
    paragraphData: Annotated as a dict, but the body accepts three shapes:
        a list (each dict with a "text" key or plain string contributes one
        paragraph), a plain string, or a dict with a "text" key.
    styles: Style definitions. NOTE(review): not referenced in the body as
        seen here - confirm.

Returns:
    The rendered paragraph(s); list input produces one fragment per text,
    joined with newlines. Returns "" when no text is found, for any other
    input type, or when any exception is raised (logged as a warning).

NOTE(review): the markup string literals below contain no tags as written -
confirm they are intact in the file. Text is interpolated without
html.escape - confirm inputs are trusted.
"""
try:
# Normalize inputs - paragraphData is typically a list of elements from _getSectionData
if isinstance(paragraphData, list):
# Extract text from all paragraph elements
texts = []
for el in paragraphData:
# Elements of any other shape are skipped silently
if isinstance(el, dict) and "text" in el:
texts.append(el["text"])
elif isinstance(el, str):
texts.append(el)
if texts:
# Join multiple paragraphs with
tags
return '\n'.join(f'
{text}
' for text in texts)
return ""
elif isinstance(paragraphData, str):
return f'
{paragraphData}
'
elif isinstance(paragraphData, dict):
text = paragraphData.get("text", "")
if text:
return f'
{text}
'
return ""
else:
# Unsupported input type
return ""
except Exception as e:
# Best effort: a broken paragraph is logged and rendered as nothing
self.logger.warning(f"Error rendering paragraph: {str(e)}")
return ""
def _renderJsonCodeBlock(self, codeData: Dict[str, Any], styles: Dict[str, Any]) -> str:
"""
Render a JSON code block to HTML using AI-generated styles.

Args:
    codeData: Mapping read via ``.get("code")`` and ``.get("language")``.
    styles: Style definitions. NOTE(review): not referenced in the body as
        seen here - confirm.

Returns:
    The rendered code block, or "" when "code" is empty or when any
    exception is raised (logged as a warning).

NOTE(review): the two branches on ``language`` return literals that contain
no markup as written - confirm they are intact in the file. The code text is
interpolated without html.escape - confirm inputs are trusted.
"""
try:
code = codeData.get("code", "")
language = codeData.get("language", "")
if code:
# Separate output is produced depending on whether a language is given
if language:
return f'
{code}
'
else:
return f'
{code}
'
return ""
except Exception as e:
# Best effort: a broken code block is logged and rendered as nothing
self.logger.warning(f"Error rendering code block: {str(e)}")
return ""
def _renderJsonImage(self, imageData: Dict[str, Any], styles: Dict[str, Any]) -> str:
"""
Render a JSON image to HTML with placeholder for later replacement.

Args:
    imageData: Mapping read via ``.get`` for "base64Data", "altText"
        (default "Image") and "caption" (default "").
    styles: Style definitions. NOTE(review): not referenced in the body as
        seen here - confirm.

Returns:
    The marker followed by the image tag, plus the escaped caption when one
    is present; "" when there is no base64 data. If an exception is raised,
    a warning is logged and a textual placeholder naming the alt text is
    returned instead.

NOTE(review): ``imageMarker`` and ``imgTag`` are empty f-strings as written
here - confirm the marker comment and img tag are intact in the file.
"""
try:
import html
base64Data = imageData.get("base64Data", "")
altText = imageData.get("altText", "Image")
caption = imageData.get("caption", "")
# Escape HTML in altText and caption to prevent injection
altTextEscaped = html.escape(str(altText))
captionEscaped = html.escape(str(caption)) if caption else ""
if base64Data:
# Use data URI as placeholder - will be replaced with file path in _replaceImageDataUris
# Include a marker so we can find and replace it
imageMarker = f""
imgTag = f''
if captionEscaped:
return f'{imageMarker}{imgTag}{captionEscaped}'
else:
return f'{imageMarker}{imgTag}'
return ""
except Exception as e:
self.logger.warning(f"Error rendering image: {str(e)}")
# NOTE(review): the alt text in this fallback is not passed through html.escape,
# and imageData.get is called again here, so a non-dict imageData would raise
# from inside the handler - confirm this is acceptable.
return f'
[Image: {imageData.get("altText", "Image")}]
'
def _extractImages(self, jsonContent: Dict[str, Any]) -> List[Dict[str, Any]]:
"""
Extract all images from JSON structure.
Returns:
List of image data dictionaries with base64Data, altText, caption, sectionId
"""
images = []
try:
# Extract from standardized schema: {metadata: {...}, documents: [{sections: [...]}]}
documents = jsonContent.get("documents", [])
if not documents or not isinstance(documents, list):
return images
for doc in documents:
if not isinstance(doc, dict):
continue
sections = doc.get("sections", [])
for section in sections:
if section.get("content_type") == "image":
elements = section.get("elements", [])
for element in elements:
base64Data = element.get("base64Data", "")
# If base64Data not found, try extracting from url data URI
if not base64Data:
url = element.get("url", "")
if url.startswith("data:image/"):
# Extract base64 from data URI: data:image/png;base64,
import re
match = re.match(r'data:image/[^;]+;base64,(.+)', url)
if match:
base64Data = match.group(1)
if base64Data:
sectionId = section.get("id", "unknown")
# Bestimme MIME-Type und Extension
mimeType = element.get("mimeType", "image/png")
if not mimeType or mimeType == "unknown":
# Versuche MIME-Type aus base64 zu erkennen
if base64Data.startswith("/9j/"):
mimeType = "image/jpeg"
elif base64Data.startswith("iVBORw0KGgo"):
mimeType = "image/png"
else:
mimeType = "image/png" # Default
# Bestimme Extension basierend auf MIME-Type
extension = "png"
if mimeType == "image/jpeg" or mimeType == "image/jpg":
extension = "jpg"
elif mimeType == "image/png":
extension = "png"
elif mimeType == "image/gif":
extension = "gif"
elif mimeType == "image/webp":
extension = "webp"
# Generate filename from section ID
filename = f"{sectionId}.{extension}"
# Clean filename (remove invalid characters)
filename = "".join(c if c.isalnum() or c in "._-" else "_" for c in filename)
images.append({
"base64Data": base64Data,
"altText": element.get("altText", "Image"),
"caption": element.get("caption"),
"sectionId": sectionId,
"filename": filename,
"mimeType": mimeType
})
self.logger.debug(f"Extracted image from section {sectionId}: {filename}")
self.logger.info(f"Extracted {len(images)} image(s) from JSON structure")
return images
except Exception as e:
self.logger.warning(f"Error extracting images: {str(e)}")
return []
def _replaceImageDataUris(self, htmlContent: str, images: List[Dict[str, Any]]) -> str:
"""
Replace base64 data URIs in HTML with relative file paths.

Every match of ``imgTagPattern`` is passed to the nested ``replaceImgTag``
callback, which extracts the base64 payload from the tag, looks up the
corresponding entry in ``images`` and, when one is found, builds a new tag
(plus the escaped caption, if any). Tags without a base64 payload or without
a matching image are returned unchanged.

Args:
    htmlContent: HTML content with data URIs
    images: List of image data dictionaries; the keys read here are
        "base64Data", "filename", "altText" and "caption".

Returns:
    HTML content with relative file paths, or the original ``htmlContent``
    unchanged when any exception is raised (logged as a warning).

NOTE(review): ``imgTagPattern``, the rebuilt ``imgTag`` literal and the
marker-removal pattern contain no markup as written here - confirm they are
intact in the file.
"""
try:
# NOTE(review): the base64 module is imported but not referenced in this method
import base64
import re
# Find entire img tags with data URIs and replace them
# Pattern:
imgTagPattern = r']*>'
def replaceImgTag(match):
imgTag = match.group(0)
# Extract base64 data from the img tag
base64Match = re.search(r'data:image/[^;]+;base64,([A-Za-z0-9+/=]+)', imgTag)
if not base64Match:
return imgTag # Return original if no base64 found
base64Data = base64Match.group(1)
# Find matching image in images list
matchingImage = None
for img in images:
imgBase64 = img.get("base64Data", "")
# Compare base64 data (lengths may differ due to padding)
# NOTE(review): besides exact equality, a shared 100-character prefix counts as a
# match, and an entry with empty base64Data matches any tag because
# str.startswith("") is True - confirm this looseness is intended.
if imgBase64 == base64Data or imgBase64.startswith(base64Data[:100]) or base64Data.startswith(imgBase64[:100]):
matchingImage = img
break
if matchingImage:
import html
# Use filename from image data (generated from section ID)
# Fallback name is built from the image's 1-based position in the list
filename = matchingImage.get("filename", f"image_{images.index(matchingImage) + 1}.png")
# Extract existing alt text or use from matchingImage
altMatch = re.search(r'alt="([^"]*)"', imgTag)
existingAlt = altMatch.group(1) if altMatch else ""
# The image's own altText wins; the tag's alt (or "Image") is only the default
altText = html.escape(str(matchingImage.get("altText", existingAlt or "Image")))
caption = html.escape(str(matchingImage.get("caption", ""))) if matchingImage.get("caption") else ""
# Create new img tag with filename
imgTag = f''
if caption:
return f'{imgTag}{caption}'
else:
return imgTag
else:
# Keep original if no match found
return match.group(0)
# Replace all img tags with data URIs (also remove IMAGE_MARKER comments)
updatedHtml = re.sub(imgTagPattern, replaceImgTag, htmlContent)
# Remove any IMAGE_MARKER comments that are left over
updatedHtml = re.sub(r'', '', updatedHtml)
return updatedHtml
except Exception as e:
self.logger.warning(f"Error replacing image data URIs: {str(e)}")
return htmlContent # Return original if replacement fails
def getRenderedImages(self) -> List[Dict[str, Any]]:
    """
    Return the images that were collected during rendering.

    Returns:
        The list stored in ``_renderedImages`` (the stored object itself, not
        a copy), or a fresh empty list when that attribute has not been set
        on this instance. Entries are presumably image dicts with base64Data,
        altText, caption and filename - verify against the code that assigns
        ``_renderedImages``.
    """
    return getattr(self, '_renderedImages', [])