gateway/modules/serviceCenter/services/serviceGeneration/renderers/rendererPdf.py
2026-03-22 11:09:48 +01:00

1131 lines
No EOL
54 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
"""
PDF renderer for report generation using reportlab.
"""
from __future__ import annotations
import unicodedata
from .documentRendererBaseTemplate import BaseRenderer
from modules.datamodels.datamodelDocument import RenderedDocument
from typing import Dict, Any, List, Optional
import io
import base64
try:
from reportlab.lib.pagesizes import A4
from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, Table, TableStyle, Preformatted
from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
from reportlab.lib.units import inch
from reportlab.lib import colors
from reportlab.lib.enums import TA_CENTER, TA_LEFT, TA_JUSTIFY
REPORTLAB_AVAILABLE = True
except ImportError:
REPORTLAB_AVAILABLE = False
import re as _re_pdf
# Page geometry, in PostScript points.
# A4 width in pt; margins must match SimpleDocTemplate(leftMargin/rightMargin)
_PDF_MARGIN_LR_PT = 72.0  # left/right margin; the document is built with leftMargin=72 and rightMargin=72
_PDF_A4_WIDTH_PT = 595.27  # width of an A4 page
# Printable width between the two side margins; tables and code blocks are sized against it.
_PDF_CONTENT_WIDTH_PT = _PDF_A4_WIDTH_PT - (2 * _PDF_MARGIN_LR_PT)
def _boxDrawingCharToAscii(ch: str) -> str:
"""Map one box-drawing character to ASCII (Courier has no glyphs for U+2500U+257F)."""
nm = unicodedata.name(ch, "")
v = "VERTICAL" in nm
h = "HORIZONTAL" in nm
and_ = "AND" in nm
if v and h:
return "+"
if v and not h and not and_:
return "|"
if h and not v and not and_:
return "-"
return "+"
def _normalizePdfMonospaceText(text: str) -> str:
"""Replace Unicode box/block drawing with ASCII so PDF core fonts render readable code/trees."""
if not text:
return ""
out: List[str] = []
for ch in text:
o = ord(ch)
if 0x2500 <= o <= 0x257F:
out.append(_boxDrawingCharToAscii(ch))
elif 0x2580 <= o <= 0x259F:
out.append("#")
else:
out.append(ch)
return "".join(out)
def _prepareCodeBlockPlainText(text: str) -> str:
"""Normalize newlines/tabs for preformatted code (no HTML/XML; spaces must stay significant)."""
if not text:
return ""
text = text.replace("\r\n", "\n").replace("\r", "\n")
return text.expandtabs(4)
class RendererPdf(BaseRenderer):
"""Renders content to PDF format using reportlab."""
@classmethod
def getSupportedFormats(cls) -> List[str]:
"""Return supported PDF formats."""
return ['pdf']
@classmethod
def getFormatAliases(cls) -> List[str]:
"""Return format aliases."""
return ['document', 'print']
@classmethod
def getPriority(cls) -> int:
"""Return priority for PDF renderer."""
return 120
@classmethod
def getOutputStyle(cls, formatName: Optional[str] = None) -> str:
"""Return output style classification: PDF documents are formatted documents."""
return 'document'
@classmethod
def getAcceptedSectionTypes(cls, formatName: Optional[str] = None) -> List[str]:
    """List the section content types the PDF renderer accepts.

    Every entry of ``supportedSectionTypes`` is accepted; a fresh list is
    returned on each call. ``formatName`` does not influence the result.
    """
    # Imported here, at call time, exactly as the original did.
    from modules.datamodels.datamodelJson import supportedSectionTypes
    return [*supportedSectionTypes]
async def render(self, extractedContent: Dict[str, Any], title: str, userPrompt: str = None, aiService=None) -> List[RenderedDocument]:
    """Render extracted JSON content into a single PDF document.

    When reportlab is not importable the call is delegated to the HTML
    renderer. Any exception is logged and converted into a one-element list
    holding a ``text/plain`` document that carries the error message.

    Args:
        extractedContent: Structured document content (metadata + documents).
        title: Used to derive a filename when the first document has none.
        userPrompt: Forwarded to style resolution.
        aiService: Forwarded to style resolution.

    Returns:
        A list with exactly one ``RenderedDocument``.
    """
    pdfMime = "application/pdf"
    try:
        if not REPORTLAB_AVAILABLE:
            # No reportlab: let the HTML renderer produce the output instead.
            from .rendererHtml import RendererHtml
            return await RendererHtml().render(extractedContent, title, userPrompt, aiService)

        pdf_content = await self._generatePdfFromJson(extractedContent, title, userPrompt, aiService)

        metadata = extractedContent.get("metadata", {}) if extractedContent else {}
        metadataIsDict = isinstance(metadata, dict)
        documentType = metadata.get("documentType") if metadataIsDict else None

        # Prefer the filename carried by the first document; otherwise derive it from the title.
        documents = extractedContent.get("documents", [])
        filename = None
        if documents and isinstance(documents[0], dict):
            filename = documents[0].get("filename")
        if not filename:
            filename = self._determineFilename(title, pdfMime)

        # The generator hands back base64 text; anything undecodable is kept as UTF-8 bytes.
        if isinstance(pdf_content, str):
            try:
                pdf_bytes = base64.b64decode(pdf_content)
            except Exception:
                pdf_bytes = pdf_content.encode('utf-8')
        else:
            pdf_bytes = pdf_content

        rendered = RenderedDocument(
            documentData=pdf_bytes,
            mimeType=pdfMime,
            filename=filename,
            documentType=documentType,
            metadata=metadata if metadataIsDict else None
        )
        return [rendered]
    except Exception as e:
        self.logger.error(f"Error rendering PDF: {str(e)}")
        fallbackContent = f"PDF Generation Error: {str(e)}"
        fallbackDocument = RenderedDocument(
            documentData=fallbackContent.encode('utf-8'),
            mimeType="text/plain",
            filename=self._determineFilename(title, "text/plain")
        )
        return [fallbackDocument]
async def _generatePdfFromJson(self, json_content: Dict[str, Any], title: str, userPrompt: str = None, aiService=None) -> str:
    """Build the PDF for a structured JSON document and return it base64-encoded.

    Steps: resolve the style set, validate the JSON schema, extract the
    sections, render each section into flowables, then build an A4 document
    (72pt left/right/top margins, 18pt bottom margin) into an in-memory buffer.
    There is no cover page; ``title`` is not rendered here.

    Returns:
        The PDF bytes as a base64 string.

    Raises:
        Exception: "PDF generation failed: ..." wrapping any underlying error
            (including an invalid JSON structure).
    """
    try:
        styles = await self._getStyleSet(json_content, userPrompt, aiService)

        if not self._validateJsonStructure(json_content):
            raise ValueError("JSON content must follow standardized schema: {metadata: {...}, documents: [{sections: [...]}]}")

        sections = self._extractSections(json_content)

        buffer = io.BytesIO()
        pageMargins = dict(rightMargin=72, leftMargin=72, topMargin=72, bottomMargin=18)
        doc = SimpleDocTemplate(buffer, pagesize=A4, **pageMargins)

        # Body starts on page 1; flowables of all sections are concatenated in order.
        story = []
        debugLog = self.services.utils.debugLogToFile
        debugLog(f"PDF SECTIONS TO PROCESS: {len(sections)} sections", "PDF_RENDERER")
        for i, section in enumerate(sections):
            debugLog(f"PDF SECTION {i}: content_type={section.get('content_type', 'unknown')}, id={section.get('id', 'unknown')}", "PDF_RENDERER")
            section_elements = self._renderJsonSection(section, styles)
            debugLog(f"PDF SECTION {i} ELEMENTS: {len(section_elements)} elements", "PDF_RENDERER")
            story.extend(section_elements)

        # Build with a guard that retries after replacing oversized flowables.
        self._buildPdfWithOverflowGuard(doc, story, buffer)

        buffer.seek(0)
        return base64.b64encode(buffer.getvalue()).decode('utf-8')
    except Exception as e:
        self.logger.error(f"Error generating PDF from JSON: {str(e)}")
        raise Exception(f"PDF generation failed: {str(e)}")
def _buildPdfWithOverflowGuard(self, doc, story: List[Any], buffer) -> None:
"""Try doc.build(); on 'too large on page' LayoutError, drop the offending
flowable, log a warning, and retry (up to 5 times)."""
maxRetries = 5
for attempt in range(maxRetries + 1):
try:
buffer.seek(0)
buffer.truncate()
doc.build(story)
return
except Exception as e:
msg = str(e)
if "too large on page" not in msg or attempt == maxRetries:
raise
# Identify the offending flowable from the error repr
self.logger.warning(f"PDF overflow (attempt {attempt + 1}): {msg} — removing oversized element and retrying")
removed = False
for idx, flowable in enumerate(story):
fRepr = repr(flowable)
if "Table" in fRepr and hasattr(flowable, '_cellvalues'):
try:
nRows = len(flowable._cellvalues)
nCols = len(flowable._cellvalues[0]) if flowable._cellvalues else 0
if nRows == 1 and nCols == 1:
errPara = Paragraph(
"[Code block omitted — content too large for PDF page]",
self._createNormalStyle({}),
)
story[idx] = errPara
removed = True
break
except Exception:
pass
if not removed:
raise
async def _getStyleSet(self, extractedContent: Dict[str, Any] = None, userPrompt: str = None, aiService=None, templateName: str = None) -> Dict[str, Any]:
"""Get style set - use styles from document generation metadata if available,
otherwise enhance default styles with AI if userPrompt provided.
WICHTIG: In a dynamic scalable AI system, styling should come from document generation,
not be generated separately by renderers. Only fall back to AI if styles not provided.
Args:
extractedContent: Document content with metadata (may contain styles)
userPrompt: User's prompt (AI will detect style instructions in any language)
aiService: AI service (used only if styles not in metadata and userPrompt provided)
templateName: Name of template style set (None = default)
Returns:
Dict with style definitions for all document styles
"""
# Get default style set
defaultStyleSet = self._getDefaultStyleSet()
# FIRST: Check if styles are provided in document generation metadata (preferred approach)
if extractedContent:
metadata = extractedContent.get("metadata", {})
if isinstance(metadata, dict):
styles = metadata.get("styles")
if styles and isinstance(styles, dict):
self.logger.debug("Using styles from document generation metadata")
enhancedStyleSet = self._convertColorsFormat(styles)
return self._validateStylesContrast(enhancedStyleSet)
# FALLBACK: Enhance with AI if userPrompt provided (only if styles not in metadata)
if userPrompt and aiService:
self.logger.info(f"Styles not in metadata, enhancing with AI based on user prompt...")
enhancedStyleSet = await self._enhanceStylesWithAI(userPrompt, defaultStyleSet, aiService)
# Convert colors to PDF format after getting styles
enhancedStyleSet = self._convertColorsFormat(enhancedStyleSet)
return self._validateStylesContrast(enhancedStyleSet)
else:
# Use default styles only
return defaultStyleSet
async def _enhanceStylesWithAI(self, userPrompt: str, defaultStyleSet: Dict[str, Any], aiService) -> Dict[str, Any]:
    """Ask the AI to adapt the default styles to the user's prompt.

    A "pdf" style template is built from the prompt and the defaults and
    passed to ``_getAiStyles``. On any failure a warning is logged and
    ``defaultStyleSet`` is returned unchanged.
    """
    try:
        template = self._createAiStyleTemplate("pdf", userPrompt, defaultStyleSet)
        return await self._getAiStyles(aiService, template, defaultStyleSet)
    except Exception as e:
        self.logger.warning(f"AI style enhancement failed: {str(e)}, using default styles")
        return defaultStyleSet
def _validateStylesContrast(self, styles: Dict[str, Any]) -> Dict[str, Any]:
"""Validate and fix contrast issues in AI-generated styles."""
try:
# Fix table header contrast
if "table_header" in styles:
header = styles["table_header"]
bg_color = header.get("background", "#FFFFFF")
text_color = header.get("text_color", "#000000")
# If both are white or both are dark, fix it
if bg_color.upper() == "#FFFFFF" and text_color.upper() == "#FFFFFF":
header["background"] = "#4F4F4F"
header["text_color"] = "#FFFFFF"
elif bg_color.upper() == "#000000" and text_color.upper() == "#000000":
header["background"] = "#4F4F4F"
header["text_color"] = "#FFFFFF"
# Fix table cell contrast
if "table_cell" in styles:
cell = styles["table_cell"]
bg_color = cell.get("background", "#FFFFFF")
text_color = cell.get("text_color", "#000000")
# If both are white or both are dark, fix it
if bg_color.upper() == "#FFFFFF" and text_color.upper() == "#FFFFFF":
cell["background"] = "#FFFFFF"
cell["text_color"] = "#2F2F2F"
elif bg_color.upper() == "#000000" and text_color.upper() == "#000000":
cell["background"] = "#FFFFFF"
cell["text_color"] = "#2F2F2F"
return styles
except Exception as e:
self.logger.warning(f"Style validation failed: {str(e)}")
return self._getDefaultStyleSet()
def _getDefaultStyleSet(self) -> Dict[str, Any]:
"""Default PDF style set - used when no style instructions present."""
return {
"title": {"font_size": 24, "color": "#1F4E79", "bold": True, "align": "center", "space_after": 30},
# Markdown #..###### — sizes must strictly decrease (H1 largest … H6 smallest).
"heading1": {"font_size": 18, "color": "#2F2F2F", "bold": True, "align": "left", "space_after": 12, "space_before": 12},
"heading2": {"font_size": 15, "color": "#2F2F2F", "bold": True, "align": "left", "space_after": 10, "space_before": 10},
"heading3": {"font_size": 13, "color": "#4F4F4F", "bold": True, "align": "left", "space_after": 8, "space_before": 8},
"heading4": {"font_size": 12, "color": "#4F4F4F", "bold": True, "align": "left", "space_after": 6, "space_before": 6},
"heading5": {"font_size": 11, "color": "#4F4F4F", "bold": True, "align": "left", "space_after": 6, "space_before": 6},
"heading6": {"font_size": 10, "color": "#4F4F4F", "bold": True, "align": "left", "space_after": 4, "space_before": 4},
"paragraph": {"font_size": 11, "color": "#2F2F2F", "bold": False, "align": "left", "space_after": 6, "line_height": 1.2},
"table_header": {"background": "#4F4F4F", "text_color": "#FFFFFF", "bold": True, "align": "left", "font_size": 12},
"table_cell": {"background": "#FFFFFF", "text_color": "#2F2F2F", "bold": False, "align": "left", "font_size": 10},
"bullet_list": {"font_size": 11, "color": "#2F2F2F", "space_after": 3},
"code_block": {"font": "Courier", "font_size": 9, "color": "#2F2F2F", "background": "#F5F5F5", "space_after": 6, "align": "left"}
}
async def _getAiStylesWithPdfColors(self, ai_service, style_template: str, default_styles: Dict[str, Any]) -> Dict[str, Any]:
"""Get AI styles with proper PDF color conversion."""
if not ai_service:
return default_styles
try:
from modules.datamodels.datamodelAi import AiCallRequest, AiCallOptions, OperationTypeEnum
request_options = AiCallOptions()
request_options.operationType = OperationTypeEnum.DATA_GENERATE
request = AiCallRequest(prompt=style_template, context="", options=request_options)
# Check if AI service is properly configured
if not hasattr(ai_service, 'aiObjects') or not ai_service.aiObjects:
self.logger.warning("AI service not properly configured, using defaults")
return default_styles
response = await ai_service.callAi(request)
# Check if response is valid
if not response:
self.logger.warning("AI service returned no response, using defaults")
return default_styles
import json
import re
# Clean and parse JSON
result = response.content.strip() if response and response.content else ""
# Check if result is empty
if not result:
self.logger.warning("AI styling returned empty response, using defaults")
return default_styles
# Log the raw response for debugging
self.logger.debug(f"AI styling raw response: {result[:200]}...")
# Extract JSON from various formats
json_match = re.search(r'```json\s*\n(.*?)\n```', result, re.DOTALL)
if json_match:
result = json_match.group(1).strip()
elif result.startswith('```json'):
result = re.sub(r'^```json\s*', '', result)
result = re.sub(r'\s*```$', '', result)
elif result.startswith('```'):
result = re.sub(r'^```\s*', '', result)
result = re.sub(r'\s*```$', '', result)
# Try to extract JSON from explanatory text
json_patterns = [
r'\{[^{}]*"title"[^{}]*\}', # Simple JSON object
r'\{.*?"title".*?\}', # JSON with title field
r'\{.*?"font_size".*?\}', # JSON with font_size field
]
for pattern in json_patterns:
json_match = re.search(pattern, result, re.DOTALL)
if json_match:
result = json_match.group(0)
break
# Additional cleanup - remove any leading/trailing whitespace and newlines
result = result.strip()
# Check if result is still empty after cleanup
if not result:
self.logger.warning("AI styling returned empty content after cleanup, using defaults")
return default_styles
# Try to parse JSON
try:
styles = json.loads(result)
self.logger.debug(f"Successfully parsed AI styles: {list(styles.keys())}")
except json.JSONDecodeError as json_error:
self.logger.warning(f"AI styling returned invalid JSON: {json_error}")
# Use print instead of logger to avoid truncation
self.services.utils.debugLogToFile(f"FULL AI RESPONSE THAT FAILED TO PARSE: {result}", "PDF_RENDERER")
self.services.utils.debugLogToFile(f"RESPONSE LENGTH: {len(result)} characters", "PDF_RENDERER")
self.logger.warning(f"Raw content that failed to parse: {result}")
# Try to fix incomplete JSON by adding missing closing braces
open_braces = result.count('{')
close_braces = result.count('}')
if open_braces > close_braces:
# JSON is incomplete, add missing closing braces
missing_braces = open_braces - close_braces
result = result + '}' * missing_braces
self.logger.info(f"Added {missing_braces} missing closing brace(s)")
# Try parsing the fixed JSON
try:
styles = json.loads(result)
self.logger.info("Successfully fixed incomplete JSON")
except json.JSONDecodeError as fix_error:
self.logger.warning(f"Fixed JSON still invalid: {fix_error}")
# Try to extract just the JSON part if it's embedded in text
json_start = result.find('{')
json_end = result.rfind('}')
if json_start != -1 and json_end != -1 and json_end > json_start:
json_part = result[json_start:json_end+1]
try:
styles = json.loads(json_part)
self.logger.info("Successfully extracted JSON from explanatory text")
except json.JSONDecodeError:
self.logger.warning("Could not extract valid JSON from response, using defaults")
return default_styles
else:
return default_styles
else:
# Try to extract just the JSON part if it's embedded in text
json_start = result.find('{')
json_end = result.rfind('}')
if json_start != -1 and json_end != -1 and json_end > json_start:
json_part = result[json_start:json_end+1]
try:
styles = json.loads(json_part)
self.logger.info("Successfully extracted JSON from explanatory text")
except json.JSONDecodeError:
self.logger.warning("Could not extract valid JSON from response, using defaults")
return default_styles
else:
return default_styles
# Convert colors to PDF format (keep as hex strings, PDF renderer will convert them)
styles = self._convertColorsFormat(styles)
return styles
except Exception as e:
self.logger.warning(f"AI styling failed: {str(e)}, using defaults")
return default_styles
def _convertColorsFormat(self, styles: Dict[str, Any]) -> Dict[str, Any]:
"""Convert colors to proper format for PDF compatibility."""
try:
for style_name, style_config in styles.items():
if isinstance(style_config, dict):
for prop, value in style_config.items():
if isinstance(value, str) and value.startswith('#') and len(value) == 7:
# Convert #RRGGBB to #AARRGGBB (add FF alpha channel) for consistency
styles[style_name][prop] = f"FF{value[1:]}"
elif isinstance(value, str) and value.startswith('#') and len(value) == 9:
# Already aRGB format, keep as is
pass
return styles
except Exception as e:
self.logger.warning(f"Color conversion failed: {str(e)}")
return styles
def _getSafeColor(self, color_value: str, default: str = "#000000") -> str:
"""Get a safe hex color value for PDF."""
if isinstance(color_value, str) and color_value.startswith('#'):
if len(color_value) == 7:
return f"FF{color_value[1:]}"
elif len(color_value) == 9:
return color_value
return default
def _defaultHeadingStyleDef(self, level: int) -> Dict[str, Any]:
"""When heading{N} is missing from styles, never fall back to heading1 (that made H3 > H2)."""
sizes = {1: 18, 2: 15, 3: 13, 4: 12, 5: 11, 6: 10}
fs = sizes.get(level, 10)
sb = max(4, 14 - level)
return {
"font_size": fs,
"color": "#2F2F2F" if level <= 2 else "#4F4F4F",
"bold": True,
"align": "left",
"space_after": sb,
"space_before": sb,
}
def _createHeadingStyle(self, styles: Dict[str, Any], level: int) -> ParagraphStyle:
    """Build the paragraph style for a heading of the given level.

    The definition under ``heading{level}`` is used when present and
    non-empty; otherwise the level-specific fallback applies. Leading is
    1.35 times the font size.
    """
    fallbackDef = self._defaultHeadingStyleDef(level)
    headingDef = styles.get(f"heading{level}") or fallbackDef
    fs = headingDef.get("font_size", fallbackDef["font_size"])
    if headingDef.get("bold", True):
        fontName = "Helvetica-Bold"
    else:
        fontName = "Helvetica"
    styleKwargs = dict(
        fontName=fontName,
        fontSize=fs,
        spaceAfter=headingDef.get("space_after", 12),
        spaceBefore=headingDef.get("space_before", 12),
        alignment=self._getAlignment(headingDef.get("align", "left")),
        textColor=self._hexToColor(headingDef.get("color", "#2F2F2F")),
        leading=fs * 1.35,
    )
    return ParagraphStyle(f'CustomHeading{level}', **styleKwargs)
def _createNormalStyle(self, styles: Dict[str, Any]) -> ParagraphStyle:
    """Build the body-text paragraph style from the ``paragraph`` definition.

    Missing values default to size 11, 6pt space after, left alignment,
    colour ``#2F2F2F`` and a line height factor of 1.2 (leading is that
    factor multiplied by the font size).
    """
    paragraphDef = styles.get("paragraph", {})
    fontSize = paragraphDef.get("font_size", 11)
    lineHeight = paragraphDef.get("line_height", 1.2)
    return ParagraphStyle(
        'CustomNormal',
        fontSize=fontSize,
        spaceAfter=paragraphDef.get("space_after", 6),
        alignment=self._getAlignment(paragraphDef.get("align", "left")),
        textColor=self._hexToColor(paragraphDef.get("color", "#2F2F2F")),
        leading=lineHeight * fontSize,
    )
def _getAlignment(self, align: str) -> int:
    """Convert an alignment name into the matching reportlab alignment constant.

    Recognised (case-insensitive, surrounding whitespace ignored):
    ``left``, ``center``, ``right``, ``justify`` and the numeric strings
    ``"0"``, ``"1"``, ``"2"``. Anything else, including non-string or empty
    input, yields ``TA_LEFT``.

    ``right`` now maps to ``TA_RIGHT``; it used to fall back to left
    alignment on the incorrect assumption that reportlab has no such constant.
    """
    # Local import: the module-level reportlab import block does not bring in TA_RIGHT.
    from reportlab.lib.enums import TA_RIGHT

    if not align or not isinstance(align, str):
        return TA_LEFT
    align_map = {
        "center": TA_CENTER,
        "left": TA_LEFT,
        "justify": TA_JUSTIFY,
        "right": TA_RIGHT,
        "0": TA_LEFT,  # numeric strings
        "1": TA_CENTER,
        # Existing mapping, kept for backward compatibility.
        # NOTE(review): reportlab's own enum uses 2 for right and 4 for justify — confirm what callers send.
        "2": TA_JUSTIFY,
    }
    return align_map.get(align.lower().strip(), TA_LEFT)
def _hexToColor(self, hex_color: str) -> colors.Color:
    """Convert a hex colour string into a reportlab ``Color``.

    Accepted forms, with or without a leading ``#``: ``RRGGBB`` and
    ``AARRGGBB`` (the alpha byte is ignored). Any other length, a non-string
    value or invalid hex digits yield black.
    """
    try:
        digits = hex_color.lstrip('#')
        if len(digits) == 8:
            # aRGB: skip the alpha channel (first two characters).
            digits = digits[2:]
        if len(digits) != 6:
            return colors.black
        r, g, b = (int(digits[i:i + 2], 16) / 255.0 for i in (0, 2, 4))
        return colors.Color(r, g, b)
    except (AttributeError, TypeError, ValueError):
        # Not a string, or not valid hex. A bare `except:` here would also
        # swallow KeyboardInterrupt/SystemExit.
        return colors.black
def _escapeReportlabXml(self, text: str) -> str:
"""Escape text for ReportLab Paragraph markup."""
if not text:
return ""
return (
text.replace("&", "&amp;")
.replace("<", "&lt;")
.replace(">", "&gt;")
)
def _applyInlineMarkdownToEscapedPlain(self, text: str) -> str:
"""Escape XML then apply bold/italic to a segment with no `code` spans (code is handled separately)."""
if not text:
return ""
s = self._escapeReportlabXml(text)
s = _re_pdf.sub(r"\*\*(.+?)\*\*", r"<b>\1</b>", s, flags=_re_pdf.DOTALL)
s = _re_pdf.sub(r"__(.+?)__", r"<b>\1</b>", s, flags=_re_pdf.DOTALL)
s = _re_pdf.sub(r"(?<!\*)\*([^*\n]+?)\*(?!\*)", r"<i>\1</i>", s)
s = _re_pdf.sub(r"(?<![\w/])_([^_\n]+?)_(?![\w/])", r"<i>\1</i>", s)
return s
def _markdownInlineToReportlabXml(self, text: str) -> str:
    """Convert inline markdown (bold, italic, backtick code) to Paragraph markup.

    Box/block drawing characters are replaced by ASCII first. Backtick spans
    are then cut out and emitted as escaped text in a Courier font tag, so
    their content is never touched by the bold/italic patterns. The text
    between code spans gets XML escaping plus bold/italic conversion.
    """
    if not text:
        return ""
    normalized = _normalizePdfMonospaceText(text)
    pieces = []
    cursor = 0
    for match in _re_pdf.finditer(r"`([^`]*)`", normalized):
        pieces.append(self._applyInlineMarkdownToEscapedPlain(normalized[cursor:match.start()]))
        pieces.append('<font name="Courier">' + self._escapeReportlabXml(match.group(1)) + '</font>')
        cursor = match.end()
    # Remainder after the last code span (or the whole text if there was none).
    pieces.append(self._applyInlineMarkdownToEscapedPlain(normalized[cursor:]))
    return "".join(pieces)
def _paragraphFromInlineMarkdown(self, text: str, style: ParagraphStyle) -> Paragraph:
    """Create a Paragraph from inline-markdown text using the given style."""
    markup = self._markdownInlineToReportlabXml(text)
    return Paragraph(markup, style)
def _createTableCellParagraphStyle(
    self, styles: Dict[str, Any], *, header: bool, tableStyleKey: str
) -> ParagraphStyle:
    """Build the paragraph style for table cells, so text wraps within the column.

    Args:
        styles: Style set; the definition under ``tableStyleKey`` is used.
        header: True for header cells (default size 12, white text, bold
            unless the definition sets ``bold`` to a falsy value); False for
            body cells (default size 10, ``#2F2F2F`` text, regular weight).
        tableStyleKey: Key of the table style definition inside ``styles``.
    """
    cellDef = styles.get(tableStyleKey, {})
    if header:
        suffix, defaultSize, defaultTextColor = 'H', 12, "#FFFFFF"
    else:
        suffix, defaultSize, defaultTextColor = 'B', 10, "#2F2F2F"
    fs = cellDef.get("font_size", defaultSize)
    useBold = header and cellDef.get("bold", True)
    return ParagraphStyle(
        f"TblCell{suffix}{tableStyleKey}",
        fontSize=fs,
        leading=fs * 1.25,
        alignment=TA_LEFT,
        textColor=self._hexToColor(cellDef.get("text_color", defaultTextColor)),
        fontName="Helvetica-Bold" if useBold else "Helvetica",
    )
def _renderJsonSection(self, section: Dict[str, Any], styles: Dict[str, Any]) -> List[Any]:
    """Render one JSON section into a list of PDF flowables.

    Each element of the section is handled by its own ``type``:

    * ``reference``      -> an italic, grey "[Reference: label]" line
    * ``extracted_text`` -> a paragraph, optionally followed by its source
    * ``table``, ``bullet_list``, ``heading``, ``paragraph``, ``code_block``,
      ``image`` -> the matching ``_renderJson*`` method

    An element without a recognised type falls back to the section's own
    type, and finally to paragraph rendering.

    Text taken from the data (reference label, error message) is XML-escaped
    before it is placed in Paragraph markup, so characters such as ``<`` or
    ``&`` cannot break the markup parser.

    Returns:
        The flowables of the section; on failure a single paragraph that
        carries the error message.
    """
    # Element/section type -> name of the rendering method. Resolved lazily
    # via getattr so only the renderer actually needed is looked up.
    rendererNames = {
        "table": "_renderJsonTable",
        "bullet_list": "_renderJsonBulletList",
        "heading": "_renderJsonHeading",
        "paragraph": "_renderJsonParagraph",
        "code_block": "_renderJsonCodeBlock",
        "image": "_renderJsonImage",
    }

    def _rendererNameFor(kind):
        return rendererNames.get(kind) if isinstance(kind, str) else None

    try:
        section_type = self._getSectionType(section)
        elements = self._getSectionData(section)

        all_elements = []
        for element in elements:
            element_type = element.get("type", "") if isinstance(element, dict) else ""

            if element_type == "reference":
                label = element.get("label", "Reference")
                ref_style = ParagraphStyle(
                    'Reference',
                    parent=self._createNormalStyle(styles),
                    # Italics come from the font face; 'fontStyle' is not a
                    # paragraph style property and had no effect.
                    fontName='Helvetica-Oblique',
                    textColor=colors.grey
                )
                safeLabel = self._escapeReportlabXml(str(label))
                all_elements.append(Paragraph(f"[Reference: {safeLabel}]", ref_style))
                all_elements.append(Spacer(1, 6))
                continue

            if element_type == "extracted_text":
                content = element.get("content", "")
                source = element.get("source", "")
                if content:
                    bodyXml = self._markdownInlineToReportlabXml(content)
                    if source:
                        bodyXml = f"{bodyXml} <i>(Source: {self._escapeReportlabXml(source)})</i>"
                    all_elements.append(Paragraph(bodyXml, self._createNormalStyle(styles)))
                    all_elements.append(Spacer(1, 6))
                continue

            # The element's own type wins; the section type is only a fallback
            # (elements can have different types than their section).
            rendererName = (
                _rendererNameFor(element_type)
                or _rendererNameFor(section_type)
                or "_renderJsonParagraph"
            )
            all_elements.extend(getattr(self, rendererName)(element, styles))
        return all_elements
    except Exception as e:
        self.logger.warning(f"Error rendering section {self._getSectionId(section)}: {str(e)}")
        safeError = self._escapeReportlabXml(str(e))
        return [Paragraph(f"[Error rendering section: {safeError}]", self._createNormalStyle(styles))]
def _renderJsonTable(self, table_data: Dict[str, Any], styles: Dict[str, Any]) -> List[Any]:
    """Render a JSON table as a left-aligned, full-width table with wrapping cells.

    The printable width is split evenly across the header columns. Rows are
    padded with empty cells or truncated to the header count. The header row
    repeats on every page.

    Returns:
        ``[table, spacer]``, or an empty list when content is not a dict,
        headers or rows are missing, or rendering fails.
    """
    try:
        content = table_data.get("content", {})
        if not isinstance(content, dict):
            return []
        headers = content.get("headers", [])
        rows = content.get("rows", [])
        if not (headers and rows):
            return []

        numCols = len(headers)
        colWidths = [_PDF_CONTENT_WIDTH_PT / max(numCols, 1)] * numCols
        hdrPs = self._createTableCellParagraphStyle(styles, header=True, tableStyleKey="table_header")
        cellPs = self._createTableCellParagraphStyle(styles, header=False, tableStyleKey="table_cell")

        def _toParagraphs(values, ps):
            return [self._paragraphFromInlineMarkdown("" if v is None else str(v), ps) for v in values]

        def _fitToColumns(row):
            # Pad short rows with empty cells, cut long rows to the column count.
            cells = list(row) + [""] * max(0, numCols - len(row))
            return cells[:numCols]

        table_matrix = [_toParagraphs(headers, hdrPs)]
        table_matrix.extend(_toParagraphs(_fitToColumns(row), cellPs) for row in rows)
        table = Table(table_matrix, colWidths=colWidths, repeatRows=1)

        headerBg = self._hexToColor(styles.get("table_header", {}).get("background", "#4F4F4F"))
        bodyBg = self._hexToColor(styles.get("table_cell", {}).get("background", "#FFFFFF"))
        # Cell ranges as (first cell, last cell).
        headerRange = ((0, 0), (-1, 0))
        bodyRange = ((0, 1), (-1, -1))
        wholeTable = ((0, 0), (-1, -1))
        table_style = [
            ("BACKGROUND", *headerRange, headerBg),
            ("BACKGROUND", *bodyRange, bodyBg),
            ("ALIGN", *wholeTable, "LEFT"),
            ("VALIGN", *wholeTable, "TOP"),
            ("LEFTPADDING", *wholeTable, 4),
            ("RIGHTPADDING", *wholeTable, 4),
            ("TOPPADDING", *headerRange, 6),
            ("BOTTOMPADDING", *headerRange, 8),
            ("TOPPADDING", *bodyRange, 4),
            ("BOTTOMPADDING", *bodyRange, 4),
            ("GRID", *wholeTable, 0.5, colors.black),
        ]
        table.setStyle(TableStyle(table_style))
        return [table, Spacer(1, 12)]
    except Exception as e:
        self.logger.warning(f"Error rendering table: {str(e)}")
        return []
def _renderJsonBulletList(self, list_data: Dict[str, Any], styles: Dict[str, Any]) -> List[Any]:
    """Render a JSON bullet list as one bulleted paragraph per item.

    Items may be plain strings or dicts with a ``text`` key; other items are
    skipped. Each item is prefixed with a bullet character so the list is
    recognisable as one (items were previously emitted as bare paragraphs).
    A spacer of ``bullet_list.space_after`` (default 3) follows a non-empty list.

    Returns:
        The paragraphs plus trailing spacer, or an empty list when content is
        not a dict, there are no usable items, or rendering fails.
    """
    try:
        content = list_data.get("content", {})
        if not isinstance(content, dict):
            return []
        items = content.get("items", [])
        bullet_style_def = styles.get("bullet_list", {})
        # The style does not depend on the item: build it once.
        itemStyle = self._createNormalStyle(styles)

        elements = []
        for item in items:
            if isinstance(item, str):
                itemText = item
            elif isinstance(item, dict) and "text" in item:
                itemText = item["text"]
            else:
                continue
            elements.append(
                Paragraph(f"\u2022 {self._markdownInlineToReportlabXml(itemText)}", itemStyle)
            )
        if elements:
            elements.append(Spacer(1, bullet_style_def.get("space_after", 3)))
        return elements
    except Exception as e:
        self.logger.warning(f"Error rendering bullet list: {str(e)}")
        return []
def _renderJsonHeading(self, heading_data: Dict[str, Any], styles: Dict[str, Any]) -> List[Any]:
    """Render a JSON heading as a single styled paragraph.

    ``content.level`` is coerced to an integer and clamped to 1-6. A value
    that cannot be converted (e.g. ``None`` or ``"two"``) is treated as
    level 1 instead of dropping the heading.

    Returns:
        A one-element list with the heading paragraph, or an empty list when
        content is not a dict, the text is empty, or rendering fails.
    """
    try:
        content = heading_data.get("content", {})
        if not isinstance(content, dict):
            return []
        text = content.get("text", "")
        if not text:
            return []
        try:
            level = int(content.get("level", 1))
        except (TypeError, ValueError):
            level = 1
        level = max(1, min(6, level))
        heading_style = self._createHeadingStyle(styles, level)
        return [self._paragraphFromInlineMarkdown(text, heading_style)]
    except Exception as e:
        self.logger.warning(f"Error rendering heading: {str(e)}")
        return []
def _renderJsonParagraph(self, paragraph_data: Dict[str, Any], styles: Dict[str, Any]) -> List[Any]:
    """Render a JSON paragraph as a single body-text paragraph.

    ``content`` may be a dict with a ``text`` key or a plain string; any
    other shape counts as empty text.

    Returns:
        A one-element list with the paragraph, or an empty list when there is
        no text or rendering fails.
    """
    try:
        content = paragraph_data.get("content", {})
        text = ""
        if isinstance(content, str):
            text = content
        elif isinstance(content, dict):
            text = content.get("text", "")
        if not text:
            return []
        bodyStyle = self._createNormalStyle(styles)
        return [self._paragraphFromInlineMarkdown(text, bodyStyle)]
    except Exception as e:
        self.logger.warning(f"Error rendering paragraph: {str(e)}")
        return []
def _renderJsonCodeBlock(self, code_data: Dict[str, Any], styles: Dict[str, Any]) -> List[Any]:
"""Render a JSON code block to PDF elements using AI-generated styles."""
try:
# Extract from nested content structure
content = code_data.get("content", {})
if not isinstance(content, dict):
return []
code = content.get("code", "")
language = content.get("language", "")
code_style_def = styles.get("code_block", {})
if code:
code = _prepareCodeBlockPlainText(code)
code = _normalizePdfMonospaceText(code)
elements = []
fs = code_style_def.get("font_size", 9)
mono = code_style_def.get("font", "Courier")
if language:
lang_style = ParagraphStyle(
"CodeLanguage",
fontSize=fs,
textColor=self._hexToColor(code_style_def.get("color", "#2F2F2F")),
fontName="Helvetica-Bold",
alignment=TA_LEFT,
)
elements.append(
Paragraph(
self._escapeReportlabXml(f"Code ({language}):"),
lang_style,
)
)
approxCharWPt = max(fs * 0.52, 4.5)
usableWidth = _PDF_CONTENT_WIDTH_PT - 16 # left+right padding
maxLineChars = max(48, int(usableWidth / approxCharWPt))
bg_col = self._hexToColor(code_style_def.get("background", "#F5F5F5"))
leading = fs * 1.2
spaceAfter = code_style_def.get("space_after", 6)
# Each source line may wrap to ceil(len/maxLineChars) visual lines.
# Frame height ~740pt minus padding → keep rendered height < 600pt.
maxVisualLinesPerChunk = max(8, int(600 / leading))
srcLines = code.split("\n")
chunks: List[List[str]] = []
curChunk: List[str] = []
curVisual = 0
for sl in srcLines:
wrapped = max(1, -(-len(sl) // maxLineChars)) if sl else 1
if curVisual + wrapped > maxVisualLinesPerChunk and curChunk:
chunks.append(curChunk)
curChunk = []
curVisual = 0
curChunk.append(sl)
curVisual += wrapped
if curChunk:
chunks.append(curChunk)
for ci, chunkLines in enumerate(chunks):
chunkText = "\n".join(chunkLines)
styleId = f"CodePre_{id(code_data) & 0xFFFFFFFF}_{ci}"
codePrStyle = ParagraphStyle(
styleId,
fontName=mono,
fontSize=fs,
leading=leading,
textColor=self._hexToColor(code_style_def.get("color", "#2F2F2F")),
alignment=TA_LEFT,
leftIndent=0,
rightIndent=0,
)
pf = Preformatted(chunkText, codePrStyle, dedent=0, maxLineLength=maxLineChars)
tbl = Table([[pf]], colWidths=[_PDF_CONTENT_WIDTH_PT])
tbl.setStyle(
TableStyle(
[
("BACKGROUND", (0, 0), (-1, -1), bg_col),
("VALIGN", (0, 0), (-1, -1), "TOP"),
("LEFTPADDING", (0, 0), (-1, -1), 8),
("RIGHTPADDING", (0, 0), (-1, -1), 8),
("TOPPADDING", (0, 0), (-1, -1), 6),
("BOTTOMPADDING", (0, 0), (-1, -1), 6),
]
)
)
tbl.spaceAfter = 0 if ci < len(chunks) - 1 else spaceAfter
elements.append(tbl)
return elements
return []
except Exception as e:
self.logger.warning(f"Error rendering code block: {str(e)}")
return []
def _renderJsonImage(self, image_data: Dict[str, Any], styles: Dict[str, Any]) -> List[Any]:
    """Render a JSON image element to PDF flowables using reportlab.

    The base64 payload is looked up in this order:
      1. ``image_data["content"]["base64Data"]``
      2. ``image_data["base64Data"]`` (alt text / caption are then also taken
         from the element itself when missing in ``content``)
      3. a ``data:image/...;base64,`` URI in ``image_data["url"]`` or
         ``image_data["content"]["url"]``

    The image is scaled down (never up) to fit a 430 x 730 pt box while
    keeping its aspect ratio. If the size cannot be determined a fixed
    4 x 3 inch size is used. A caption paragraph is appended, built from the
    caption or, failing that, from the alt text.

    All caption, alt and error text is passed through
    ``self._escapeReportlabXml`` before it is placed into ``Paragraph``
    markup, so characters such as ``<`` or ``&`` cannot break the markup.

    Args:
        image_data: Element dict describing the image.
        styles: Style definitions; ``styles["paragraph"]["color"]`` is used
            for the caption colour (default "#666666").

    Returns:
        ``[image]`` or ``[image, caption]`` on success; otherwise a
        one-element list with a placeholder or a red error paragraph.
    """
    try:
        content = image_data.get("content", {})
        base64_data = ""
        alt_text = "Image"
        caption = ""
        if isinstance(content, dict):
            base64_data = content.get("base64Data", "")
            alt_text = content.get("altText", "Image")
            caption = content.get("caption", "")
        elif isinstance(content, str):
            # Content might be the base64 string directly; not a supported shape.
            self.logger.warning("Image content is a string, not a dict. This should not happen.")
            return [Paragraph("[Image: Invalid format]", self._createNormalStyle(styles))]

        # Fallback: fields directly on the element.
        if not base64_data:
            base64_data = image_data.get("base64Data", "")
            if not alt_text or alt_text == "Image":
                alt_text = image_data.get("altText", "Image")
            if not caption:
                caption = image_data.get("caption", "")

        # Fallback: data URI of the form data:image/png;base64,<base64>.
        if not base64_data:
            url = image_data.get("url", "") or (content.get("url", "") if isinstance(content, dict) else "")
            if isinstance(url, str) and url.startswith("data:image/"):
                import re
                match = re.match(r'data:image/[^;]+;base64,(.+)', url)
                if match:
                    base64_data = match.group(1)

        if not base64_data:
            self.logger.warning(f"No base64 data found for image. Alt text: {alt_text}")
            placeholder = self._escapeReportlabXml(f"[Image: {alt_text}]")
            return [Paragraph(placeholder, self._createNormalStyle(styles))]

        # Type check first: len() below is only meaningful for a string.
        if not isinstance(base64_data, str):
            self.logger.warning(f"Base64 data is not a string: {type(base64_data)}")
            invalidMsg = self._escapeReportlabXml(f"[Image: {alt_text} - Invalid data type]")
            return [Paragraph(invalidMsg, self._createNormalStyle(styles))]

        # Very long string might be the entire element JSON rather than image data.
        if len(base64_data) > 10000:
            self.logger.warning(f"Base64 data seems too long ({len(base64_data)} chars), might be incorrectly extracted")

        try:
            from reportlab.platypus import Image as ReportLabImage
            from reportlab.lib.units import inch

            imageBytes = base64.b64decode(base64_data)
            imageStream = io.BytesIO(imageBytes)

            # Frame reported by SimpleDocTemplate is ~439.28 x 739.89 pt;
            # stay slightly below it for safety.
            availableWidth = 430.0
            availableHeight = 730.0

            try:
                from PIL import Image as PILImage

                pilImage = PILImage.open(imageStream)
                originalWidth, originalHeight = pilImage.size

                # Use the image's own DPI when present, else assume 96.
                dpi = pilImage.info.get('dpi', (96, 96))[0]
                if dpi <= 0:
                    dpi = 96

                # pixels * (72 / dpi) = points
                imgWidthPoints = originalWidth * (72.0 / dpi)
                imgHeightPoints = originalHeight * (72.0 / dpi)

                widthScale = availableWidth / imgWidthPoints if imgWidthPoints > 0 else 1.0
                heightScale = availableHeight / imgHeightPoints if imgHeightPoints > 0 else 1.0
                # Smaller factor wins so both dimensions fit; capped at 1.0
                # so images are only ever scaled down.
                scale = min(widthScale, heightScale, 1.0)
                # min() additionally guards against float rounding pushing a
                # dimension marginally over the limit.
                imgWidth = min(imgWidthPoints * scale, availableWidth)
                imgHeight = min(imgHeightPoints * scale, availableHeight)
            except Exception as sizeError:
                self.logger.warning(f"Error calculating image size: {str(sizeError)}, using safe default")
                # 4 x 3 inch (288 x 216 pt) fits well inside the available box.
                imgWidth = 4 * inch
                imgHeight = 3 * inch

            # Rewind: PIL may have consumed the stream.
            imageStream.seek(0)
            reportlabImage = ReportLabImage(imageStream, width=imgWidth, height=imgHeight)
            elements = [reportlabImage]

            # Caption: explicit caption wins, otherwise derive one from alt text.
            caption_text = ""
            if caption:
                caption_text = str(caption)
            elif alt_text and alt_text != "Image":
                altString = str(alt_text)
                usageHintMarker = "Render as visual element:"
                if usageHintMarker in altString:
                    # Alt text is a usage hint; keep only what follows the marker.
                    filename = altString.split(usageHintMarker)[1].strip()
                    caption_text = f"Figure: {filename}"
                else:
                    caption_text = f"Figure: {altString}"

            if caption_text:
                # NOTE(review): this mutates the style object returned by
                # _createNormalStyle - confirm that helper returns a fresh
                # instance per call.
                captionStyle = self._createNormalStyle(styles)
                captionStyle.fontSize = 10
                captionStyle.textColor = self._hexToColor(styles.get("paragraph", {}).get("color", "#666666"))
                elements.append(
                    Paragraph(f"<i>{self._escapeReportlabXml(caption_text)}</i>", captionStyle)
                )
            return elements
        except Exception as imgError:
            self.logger.error(f"Error embedding image in PDF: {str(imgError)}")
            errorStyle = self._createNormalStyle(styles)
            errorStyle.textColor = self._hexToColor("#FF0000")  # Red color for error
            errorMsg = f"[Error: Could not embed image '{alt_text}'. {str(imgError)}]"
            return [Paragraph(self._escapeReportlabXml(errorMsg), errorStyle)]
    except Exception as e:
        self.logger.error(f"Error rendering image: {str(e)}")
        errorStyle = self._createNormalStyle(styles)
        errorStyle.textColor = self._hexToColor("#FF0000")  # Red color for error
        # image_data itself may be the reason we got here (e.g. not a dict).
        fallbackAlt = image_data.get('altText', 'Image') if isinstance(image_data, dict) else 'Image'
        errorMsg = f"[Error: Could not render image '{fallbackAlt}'. {str(e)}]"
        return [Paragraph(self._escapeReportlabXml(errorMsg), errorStyle)]