# Copyright (c) 2025 Patrick Motsch # All rights reserved. """ PDF renderer for report generation using reportlab. """ from __future__ import annotations import unicodedata from .documentRendererBaseTemplate import BaseRenderer from modules.datamodels.datamodelDocument import RenderedDocument from typing import Dict, Any, List, Optional import io import base64 try: from reportlab.lib.pagesizes import A4 from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, Table, TableStyle, Preformatted from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle from reportlab.lib.units import inch from reportlab.lib import colors from reportlab.lib.enums import TA_CENTER, TA_LEFT, TA_JUSTIFY REPORTLAB_AVAILABLE = True except ImportError: REPORTLAB_AVAILABLE = False import re as _re_pdf # A4 width in pt; margins must match SimpleDocTemplate(leftMargin/rightMargin) _PDF_MARGIN_LR_PT = 72.0 _PDF_A4_WIDTH_PT = 595.27 _PDF_CONTENT_WIDTH_PT = _PDF_A4_WIDTH_PT - (2 * _PDF_MARGIN_LR_PT) def _boxDrawingCharToAscii(ch: str) -> str: """Map one box-drawing character to ASCII (Courier has no glyphs for U+2500–U+257F).""" nm = unicodedata.name(ch, "") v = "VERTICAL" in nm h = "HORIZONTAL" in nm and_ = "AND" in nm if v and h: return "+" if v and not h and not and_: return "|" if h and not v and not and_: return "-" return "+" def _normalizePdfMonospaceText(text: str) -> str: """Replace Unicode box/block drawing with ASCII so PDF core fonts render readable code/trees.""" if not text: return "" out: List[str] = [] for ch in text: o = ord(ch) if 0x2500 <= o <= 0x257F: out.append(_boxDrawingCharToAscii(ch)) elif 0x2580 <= o <= 0x259F: out.append("#") else: out.append(ch) return "".join(out) def _prepareCodeBlockPlainText(text: str) -> str: """Normalize newlines/tabs for preformatted code (no HTML/XML; spaces must stay significant).""" if not text: return "" text = text.replace("\r\n", "\n").replace("\r", "\n") return text.expandtabs(4) class RendererPdf(BaseRenderer): """Renders content to PDF format using reportlab.""" @classmethod def getSupportedFormats(cls) -> List[str]: """Return supported PDF formats.""" return ['pdf'] @classmethod def getFormatAliases(cls) -> List[str]: """Return format aliases.""" return ['document', 'print'] @classmethod def getPriority(cls) -> int: """Return priority for PDF renderer.""" return 120 @classmethod def getOutputStyle(cls, formatName: Optional[str] = None) -> str: """Return output style classification: PDF documents are formatted documents.""" return 'document' @classmethod def getAcceptedSectionTypes(cls, formatName: Optional[str] = None) -> List[str]: """ Return list of section content types that PDF renderer accepts. PDF renderer accepts all section types (PDF documents can contain all content types). """ from modules.datamodels.datamodelJson import supportedSectionTypes return list(supportedSectionTypes) async def render(self, extractedContent: Dict[str, Any], title: str, userPrompt: str = None, aiService=None) -> List[RenderedDocument]: """Render extracted JSON content to PDF format using AI-analyzed styling.""" try: if not REPORTLAB_AVAILABLE: # Fallback to HTML if reportlab not available from .rendererHtml import RendererHtml html_renderer = RendererHtml() return await html_renderer.render(extractedContent, title, userPrompt, aiService) # Generate PDF using AI-analyzed styling pdf_content = await self._generatePdfFromJson(extractedContent, title, userPrompt, aiService) # Extract metadata for document type and other info metadata = extractedContent.get("metadata", {}) if extractedContent else {} documentType = metadata.get("documentType") if isinstance(metadata, dict) else None # Determine filename from document or title documents = extractedContent.get("documents", []) if documents and isinstance(documents[0], dict): filename = documents[0].get("filename") if not filename: filename = self._determineFilename(title, "application/pdf") else: filename = self._determineFilename(title, "application/pdf") # Convert PDF content to bytes if it's a string (base64) if isinstance(pdf_content, str): # Try to decode as base64, otherwise encode as UTF-8 try: pdf_bytes = base64.b64decode(pdf_content) except Exception: pdf_bytes = pdf_content.encode('utf-8') else: pdf_bytes = pdf_content return [ RenderedDocument( documentData=pdf_bytes, mimeType="application/pdf", filename=filename, documentType=documentType, metadata=metadata if isinstance(metadata, dict) else None ) ] except Exception as e: self.logger.error(f"Error rendering PDF: {str(e)}") # Return minimal fallback fallbackContent = f"PDF Generation Error: {str(e)}" return [ RenderedDocument( documentData=fallbackContent.encode('utf-8'), mimeType="text/plain", filename=self._determineFilename(title, "text/plain") ) ] async def _generatePdfFromJson(self, json_content: Dict[str, Any], title: str, userPrompt: str = None, aiService=None) -> str: """Generate PDF content from structured JSON document using AI-generated styling.""" try: # Get style set: use styles from metadata if available, otherwise enhance with AI styles = await self._getStyleSet(json_content, userPrompt, aiService) # Validate JSON structure if not self._validateJsonStructure(json_content): raise ValueError("JSON content must follow standardized schema: {metadata: {...}, documents: [{sections: [...]}]}") # Extract sections and metadata from standardized schema sections = self._extractSections(json_content) # Create a buffer to hold the PDF buffer = io.BytesIO() # Create PDF document doc = SimpleDocTemplate( buffer, pagesize=A4, rightMargin=72, leftMargin=72, topMargin=72, bottomMargin=18 ) # Build PDF content (no cover page — body starts on page 1; filename still uses `title`) story = [] # Process each section (sections already extracted above) self.services.utils.debugLogToFile(f"PDF SECTIONS TO PROCESS: {len(sections)} sections", "PDF_RENDERER") for i, section in enumerate(sections): self.services.utils.debugLogToFile(f"PDF SECTION {i}: content_type={section.get('content_type', 'unknown')}, id={section.get('id', 'unknown')}", "PDF_RENDERER") section_elements = self._renderJsonSection(section, styles) self.services.utils.debugLogToFile(f"PDF SECTION {i} ELEMENTS: {len(section_elements)} elements", "PDF_RENDERER") story.extend(section_elements) # Build PDF — retry with oversized flowables removed on LayoutError self._buildPdfWithOverflowGuard(doc, story, buffer) buffer.seek(0) pdf_bytes = buffer.getvalue() pdf_base64 = base64.b64encode(pdf_bytes).decode('utf-8') return pdf_base64 except Exception as e: self.logger.error(f"Error generating PDF from JSON: {str(e)}") raise Exception(f"PDF generation failed: {str(e)}") def _buildPdfWithOverflowGuard(self, doc, story: List[Any], buffer) -> None: """Try doc.build(); on 'too large on page' LayoutError, drop the offending flowable, log a warning, and retry (up to 5 times).""" maxRetries = 5 for attempt in range(maxRetries + 1): try: buffer.seek(0) buffer.truncate() doc.build(story) return except Exception as e: msg = str(e) if "too large on page" not in msg or attempt == maxRetries: raise # Identify the offending flowable from the error repr self.logger.warning(f"PDF overflow (attempt {attempt + 1}): {msg} — removing oversized element and retrying") removed = False for idx, flowable in enumerate(story): fRepr = repr(flowable) if "Table" in fRepr and hasattr(flowable, '_cellvalues'): try: nRows = len(flowable._cellvalues) nCols = len(flowable._cellvalues[0]) if flowable._cellvalues else 0 if nRows == 1 and nCols == 1: errPara = Paragraph( "[Code block omitted — content too large for PDF page]", self._createNormalStyle({}), ) story[idx] = errPara removed = True break except Exception: pass if not removed: raise async def _getStyleSet(self, extractedContent: Dict[str, Any] = None, userPrompt: str = None, aiService=None, templateName: str = None) -> Dict[str, Any]: """Get style set - use styles from document generation metadata if available, otherwise enhance default styles with AI if userPrompt provided. WICHTIG: In a dynamic scalable AI system, styling should come from document generation, not be generated separately by renderers. Only fall back to AI if styles not provided. Args: extractedContent: Document content with metadata (may contain styles) userPrompt: User's prompt (AI will detect style instructions in any language) aiService: AI service (used only if styles not in metadata and userPrompt provided) templateName: Name of template style set (None = default) Returns: Dict with style definitions for all document styles """ # Get default style set defaultStyleSet = self._getDefaultStyleSet() # FIRST: Check if styles are provided in document generation metadata (preferred approach) if extractedContent: metadata = extractedContent.get("metadata", {}) if isinstance(metadata, dict): styles = metadata.get("styles") if styles and isinstance(styles, dict): self.logger.debug("Using styles from document generation metadata") enhancedStyleSet = self._convertColorsFormat(styles) return self._validateStylesContrast(enhancedStyleSet) # FALLBACK: Enhance with AI if userPrompt provided (only if styles not in metadata) if userPrompt and aiService: self.logger.info(f"Styles not in metadata, enhancing with AI based on user prompt...") enhancedStyleSet = await self._enhanceStylesWithAI(userPrompt, defaultStyleSet, aiService) # Convert colors to PDF format after getting styles enhancedStyleSet = self._convertColorsFormat(enhancedStyleSet) return self._validateStylesContrast(enhancedStyleSet) else: # Use default styles only return defaultStyleSet async def _enhanceStylesWithAI(self, userPrompt: str, defaultStyleSet: Dict[str, Any], aiService) -> Dict[str, Any]: """Enhance default styles with AI based on user prompt.""" try: style_template = self._createAiStyleTemplate("pdf", userPrompt, defaultStyleSet) enhanced_styles = await self._getAiStyles(aiService, style_template, defaultStyleSet) return enhanced_styles except Exception as e: self.logger.warning(f"AI style enhancement failed: {str(e)}, using default styles") return defaultStyleSet def _validateStylesContrast(self, styles: Dict[str, Any]) -> Dict[str, Any]: """Validate and fix contrast issues in AI-generated styles.""" try: # Fix table header contrast if "table_header" in styles: header = styles["table_header"] bg_color = header.get("background", "#FFFFFF") text_color = header.get("text_color", "#000000") # If both are white or both are dark, fix it if bg_color.upper() == "#FFFFFF" and text_color.upper() == "#FFFFFF": header["background"] = "#4F4F4F" header["text_color"] = "#FFFFFF" elif bg_color.upper() == "#000000" and text_color.upper() == "#000000": header["background"] = "#4F4F4F" header["text_color"] = "#FFFFFF" # Fix table cell contrast if "table_cell" in styles: cell = styles["table_cell"] bg_color = cell.get("background", "#FFFFFF") text_color = cell.get("text_color", "#000000") # If both are white or both are dark, fix it if bg_color.upper() == "#FFFFFF" and text_color.upper() == "#FFFFFF": cell["background"] = "#FFFFFF" cell["text_color"] = "#2F2F2F" elif bg_color.upper() == "#000000" and text_color.upper() == "#000000": cell["background"] = "#FFFFFF" cell["text_color"] = "#2F2F2F" return styles except Exception as e: self.logger.warning(f"Style validation failed: {str(e)}") return self._getDefaultStyleSet() def _getDefaultStyleSet(self) -> Dict[str, Any]: """Default PDF style set - used when no style instructions present.""" return { "title": {"font_size": 24, "color": "#1F4E79", "bold": True, "align": "center", "space_after": 30}, # Markdown #..###### — sizes must strictly decrease (H1 largest … H6 smallest). "heading1": {"font_size": 18, "color": "#2F2F2F", "bold": True, "align": "left", "space_after": 12, "space_before": 12}, "heading2": {"font_size": 15, "color": "#2F2F2F", "bold": True, "align": "left", "space_after": 10, "space_before": 10}, "heading3": {"font_size": 13, "color": "#4F4F4F", "bold": True, "align": "left", "space_after": 8, "space_before": 8}, "heading4": {"font_size": 12, "color": "#4F4F4F", "bold": True, "align": "left", "space_after": 6, "space_before": 6}, "heading5": {"font_size": 11, "color": "#4F4F4F", "bold": True, "align": "left", "space_after": 6, "space_before": 6}, "heading6": {"font_size": 10, "color": "#4F4F4F", "bold": True, "align": "left", "space_after": 4, "space_before": 4}, "paragraph": {"font_size": 11, "color": "#2F2F2F", "bold": False, "align": "left", "space_after": 6, "line_height": 1.2}, "table_header": {"background": "#4F4F4F", "text_color": "#FFFFFF", "bold": True, "align": "left", "font_size": 12}, "table_cell": {"background": "#FFFFFF", "text_color": "#2F2F2F", "bold": False, "align": "left", "font_size": 10}, "bullet_list": {"font_size": 11, "color": "#2F2F2F", "space_after": 3}, "code_block": {"font": "Courier", "font_size": 9, "color": "#2F2F2F", "background": "#F5F5F5", "space_after": 6, "align": "left"} } async def _getAiStylesWithPdfColors(self, ai_service, style_template: str, default_styles: Dict[str, Any]) -> Dict[str, Any]: """Get AI styles with proper PDF color conversion.""" if not ai_service: return default_styles try: from modules.datamodels.datamodelAi import AiCallRequest, AiCallOptions, OperationTypeEnum request_options = AiCallOptions() request_options.operationType = OperationTypeEnum.DATA_GENERATE request = AiCallRequest(prompt=style_template, context="", options=request_options) # Check if AI service is properly configured if not hasattr(ai_service, 'aiObjects') or not ai_service.aiObjects: self.logger.warning("AI service not properly configured, using defaults") return default_styles response = await ai_service.callAi(request) # Check if response is valid if not response: self.logger.warning("AI service returned no response, using defaults") return default_styles import json import re # Clean and parse JSON result = response.content.strip() if response and response.content else "" # Check if result is empty if not result: self.logger.warning("AI styling returned empty response, using defaults") return default_styles # Log the raw response for debugging self.logger.debug(f"AI styling raw response: {result[:200]}...") # Extract JSON from various formats json_match = re.search(r'```json\s*\n(.*?)\n```', result, re.DOTALL) if json_match: result = json_match.group(1).strip() elif result.startswith('```json'): result = re.sub(r'^```json\s*', '', result) result = re.sub(r'\s*```$', '', result) elif result.startswith('```'): result = re.sub(r'^```\s*', '', result) result = re.sub(r'\s*```$', '', result) # Try to extract JSON from explanatory text json_patterns = [ r'\{[^{}]*"title"[^{}]*\}', # Simple JSON object r'\{.*?"title".*?\}', # JSON with title field r'\{.*?"font_size".*?\}', # JSON with font_size field ] for pattern in json_patterns: json_match = re.search(pattern, result, re.DOTALL) if json_match: result = json_match.group(0) break # Additional cleanup - remove any leading/trailing whitespace and newlines result = result.strip() # Check if result is still empty after cleanup if not result: self.logger.warning("AI styling returned empty content after cleanup, using defaults") return default_styles # Try to parse JSON try: styles = json.loads(result) self.logger.debug(f"Successfully parsed AI styles: {list(styles.keys())}") except json.JSONDecodeError as json_error: self.logger.warning(f"AI styling returned invalid JSON: {json_error}") # Use print instead of logger to avoid truncation self.services.utils.debugLogToFile(f"FULL AI RESPONSE THAT FAILED TO PARSE: {result}", "PDF_RENDERER") self.services.utils.debugLogToFile(f"RESPONSE LENGTH: {len(result)} characters", "PDF_RENDERER") self.logger.warning(f"Raw content that failed to parse: {result}") # Try to fix incomplete JSON by adding missing closing braces open_braces = result.count('{') close_braces = result.count('}') if open_braces > close_braces: # JSON is incomplete, add missing closing braces missing_braces = open_braces - close_braces result = result + '}' * missing_braces self.logger.info(f"Added {missing_braces} missing closing brace(s)") # Try parsing the fixed JSON try: styles = json.loads(result) self.logger.info("Successfully fixed incomplete JSON") except json.JSONDecodeError as fix_error: self.logger.warning(f"Fixed JSON still invalid: {fix_error}") # Try to extract just the JSON part if it's embedded in text json_start = result.find('{') json_end = result.rfind('}') if json_start != -1 and json_end != -1 and json_end > json_start: json_part = result[json_start:json_end+1] try: styles = json.loads(json_part) self.logger.info("Successfully extracted JSON from explanatory text") except json.JSONDecodeError: self.logger.warning("Could not extract valid JSON from response, using defaults") return default_styles else: return default_styles else: # Try to extract just the JSON part if it's embedded in text json_start = result.find('{') json_end = result.rfind('}') if json_start != -1 and json_end != -1 and json_end > json_start: json_part = result[json_start:json_end+1] try: styles = json.loads(json_part) self.logger.info("Successfully extracted JSON from explanatory text") except json.JSONDecodeError: self.logger.warning("Could not extract valid JSON from response, using defaults") return default_styles else: return default_styles # Convert colors to PDF format (keep as hex strings, PDF renderer will convert them) styles = self._convertColorsFormat(styles) return styles except Exception as e: self.logger.warning(f"AI styling failed: {str(e)}, using defaults") return default_styles def _convertColorsFormat(self, styles: Dict[str, Any]) -> Dict[str, Any]: """Convert colors to proper format for PDF compatibility.""" try: for style_name, style_config in styles.items(): if isinstance(style_config, dict): for prop, value in style_config.items(): if isinstance(value, str) and value.startswith('#') and len(value) == 7: # Convert #RRGGBB to #AARRGGBB (add FF alpha channel) for consistency styles[style_name][prop] = f"FF{value[1:]}" elif isinstance(value, str) and value.startswith('#') and len(value) == 9: # Already aRGB format, keep as is pass return styles except Exception as e: self.logger.warning(f"Color conversion failed: {str(e)}") return styles def _getSafeColor(self, color_value: str, default: str = "#000000") -> str: """Get a safe hex color value for PDF.""" if isinstance(color_value, str) and color_value.startswith('#'): if len(color_value) == 7: return f"FF{color_value[1:]}" elif len(color_value) == 9: return color_value return default def _defaultHeadingStyleDef(self, level: int) -> Dict[str, Any]: """When heading{N} is missing from styles, never fall back to heading1 (that made H3 > H2).""" sizes = {1: 18, 2: 15, 3: 13, 4: 12, 5: 11, 6: 10} fs = sizes.get(level, 10) sb = max(4, 14 - level) return { "font_size": fs, "color": "#2F2F2F" if level <= 2 else "#4F4F4F", "bold": True, "align": "left", "space_after": sb, "space_before": sb, } def _createHeadingStyle(self, styles: Dict[str, Any], level: int) -> ParagraphStyle: """Create heading style from style definitions.""" heading_key = f"heading{level}" heading_style_def = styles.get(heading_key) or self._defaultHeadingStyleDef(level) fs = heading_style_def.get("font_size", self._defaultHeadingStyleDef(level)["font_size"]) bold = heading_style_def.get("bold", True) return ParagraphStyle( f'CustomHeading{level}', fontName="Helvetica-Bold" if bold else "Helvetica", fontSize=fs, spaceAfter=heading_style_def.get("space_after", 12), spaceBefore=heading_style_def.get("space_before", 12), alignment=self._getAlignment(heading_style_def.get("align", "left")), textColor=self._hexToColor(heading_style_def.get("color", "#2F2F2F")), leading=fs * 1.35, ) def _createNormalStyle(self, styles: Dict[str, Any]) -> ParagraphStyle: """Create normal paragraph style from style definitions.""" paragraph_style_def = styles.get("paragraph", {}) return ParagraphStyle( 'CustomNormal', fontSize=paragraph_style_def.get("font_size", 11), spaceAfter=paragraph_style_def.get("space_after", 6), alignment=self._getAlignment(paragraph_style_def.get("align", "left")), textColor=self._hexToColor(paragraph_style_def.get("color", "#2F2F2F")), leading=paragraph_style_def.get("line_height", 1.2) * paragraph_style_def.get("font_size", 11) ) def _getAlignment(self, align: str) -> int: """Convert alignment string to reportlab alignment constant.""" if not align or not isinstance(align, str): return TA_LEFT align_map = { "center": TA_CENTER, "left": TA_LEFT, "justify": TA_JUSTIFY, "right": TA_LEFT, # ReportLab doesn't have TA_RIGHT, use LEFT as fallback "0": TA_LEFT, # Handle numeric strings "1": TA_CENTER, "2": TA_JUSTIFY } return align_map.get(align.lower().strip(), TA_LEFT) def _hexToColor(self, hex_color: str) -> colors.Color: """Convert hex color to reportlab color.""" try: hex_color = hex_color.lstrip('#') # Handle aRGB format (8 characters: FF + RGB) if len(hex_color) == 8: # Skip the alpha channel (first 2 characters) hex_color = hex_color[2:] # Handle RGB format (6 characters) if len(hex_color) == 6: r = int(hex_color[0:2], 16) / 255.0 g = int(hex_color[2:4], 16) / 255.0 b = int(hex_color[4:6], 16) / 255.0 return colors.Color(r, g, b) # Fallback for other formats return colors.black except: return colors.black def _escapeReportlabXml(self, text: str) -> str: """Escape text for ReportLab Paragraph markup.""" if not text: return "" return ( text.replace("&", "&") .replace("<", "<") .replace(">", ">") ) def _applyInlineMarkdownToEscapedPlain(self, text: str) -> str: """Escape XML then apply bold/italic to a segment with no `code` spans (code is handled separately).""" if not text: return "" s = self._escapeReportlabXml(text) s = _re_pdf.sub(r"\*\*(.+?)\*\*", r"\1", s, flags=_re_pdf.DOTALL) s = _re_pdf.sub(r"__(.+?)__", r"\1", s, flags=_re_pdf.DOTALL) s = _re_pdf.sub(r"(?\1", s) s = _re_pdf.sub(r"(?\1", s) return s def _markdownInlineToReportlabXml(self, text: str) -> str: """Turn common markdown inline (**bold**, *italic*, `code`) into ReportLab XML. Backtick spans are extracted first so paths like `...//...` are not corrupted by markdown patterns and XML escaping stays well-formed inside . """ if not text: return "" text = _normalizePdfMonospaceText(text) out: List[str] = [] pos = 0 for m in _re_pdf.finditer(r"`([^`]*)`", text): before = text[pos:m.start()] out.append(self._applyInlineMarkdownToEscapedPlain(before)) code = m.group(1) out.append(f'{self._escapeReportlabXml(code)}') pos = m.end() out.append(self._applyInlineMarkdownToEscapedPlain(text[pos:])) return "".join(out) def _paragraphFromInlineMarkdown(self, text: str, style: ParagraphStyle) -> Paragraph: return Paragraph(self._markdownInlineToReportlabXml(text), style) def _createTableCellParagraphStyle( self, styles: Dict[str, Any], *, header: bool, tableStyleKey: str ) -> ParagraphStyle: """Paragraph style for table cells (word wrap within colWidth).""" tdef = styles.get(tableStyleKey, {}) fs = tdef.get("font_size", 12 if header else 10) defaultTc = "#FFFFFF" if header else "#2F2F2F" return ParagraphStyle( f"TblCell{'H' if header else 'B'}{tableStyleKey}", fontSize=fs, leading=fs * 1.25, alignment=TA_LEFT, textColor=self._hexToColor(tdef.get("text_color", defaultTc)), fontName="Helvetica-Bold" if header and tdef.get("bold", True) else "Helvetica", ) def _renderJsonSection(self, section: Dict[str, Any], styles: Dict[str, Any]) -> List[Any]: """Render a single JSON section to PDF elements using AI-generated styles. Supports three content formats: reference, object (base64), extracted_text. """ try: section_type = self._getSectionType(section) elements = self._getSectionData(section) # Process each element in the section all_elements = [] for element in elements: element_type = element.get("type", "") if isinstance(element, dict) else "" # Support three content formats from Phase 5D if element_type == "reference": # Document reference format doc_ref = element.get("documentReference", "") label = element.get("label", "Reference") ref_style = ParagraphStyle( 'Reference', parent=self._createNormalStyle(styles), fontStyle='italic', textColor=colors.grey ) all_elements.append(Paragraph(f"[Reference: {label}]", ref_style)) all_elements.append(Spacer(1, 6)) continue elif element_type == "extracted_text": # Extracted text format content = element.get("content", "") source = element.get("source", "") if content: bodyXml = self._markdownInlineToReportlabXml(content) if source: bodyXml = f"{bodyXml} (Source: {self._escapeReportlabXml(source)})" all_elements.append(Paragraph(bodyXml, self._createNormalStyle(styles))) all_elements.append(Spacer(1, 6)) continue # Check element type, not section type (elements can have different types than section) if element_type == "table": all_elements.extend(self._renderJsonTable(element, styles)) elif element_type == "bullet_list": all_elements.extend(self._renderJsonBulletList(element, styles)) elif element_type == "heading": all_elements.extend(self._renderJsonHeading(element, styles)) elif element_type == "paragraph": all_elements.extend(self._renderJsonParagraph(element, styles)) elif element_type == "code_block": all_elements.extend(self._renderJsonCodeBlock(element, styles)) elif element_type == "image": all_elements.extend(self._renderJsonImage(element, styles)) else: # Fallback: if element_type not set, use section_type as fallback if section_type == "table": all_elements.extend(self._renderJsonTable(element, styles)) elif section_type == "bullet_list": all_elements.extend(self._renderJsonBulletList(element, styles)) elif section_type == "heading": all_elements.extend(self._renderJsonHeading(element, styles)) elif section_type == "paragraph": all_elements.extend(self._renderJsonParagraph(element, styles)) elif section_type == "code_block": all_elements.extend(self._renderJsonCodeBlock(element, styles)) elif section_type == "image": all_elements.extend(self._renderJsonImage(element, styles)) else: # Final fallback to paragraph for unknown types all_elements.extend(self._renderJsonParagraph(element, styles)) return all_elements except Exception as e: self.logger.warning(f"Error rendering section {self._getSectionId(section)}: {str(e)}") return [Paragraph(f"[Error rendering section: {str(e)}]", self._createNormalStyle(styles))] def _renderJsonTable(self, table_data: Dict[str, Any], styles: Dict[str, Any]) -> List[Any]: """Render a JSON table: left-aligned, width capped to printable area, cells wrap.""" try: content = table_data.get("content", {}) if not isinstance(content, dict): return [] headers = content.get("headers", []) rows = content.get("rows", []) if not headers or not rows: return [] numCols = len(headers) colWidth = _PDF_CONTENT_WIDTH_PT / max(numCols, 1) colWidths = [colWidth] * numCols hdrPs = self._createTableCellParagraphStyle(styles, header=True, tableStyleKey="table_header") cellPs = self._createTableCellParagraphStyle(styles, header=False, tableStyleKey="table_cell") def _cellPara(val, ps): return self._paragraphFromInlineMarkdown(str(val) if val is not None else "", ps) headerRow = [_cellPara(h, hdrPs) for h in headers] bodyRows = [] for row in rows: padded = list(row) + [""] * max(0, numCols - len(row)) padded = padded[:numCols] bodyRows.append([_cellPara(c, cellPs) for c in padded]) table_matrix = [headerRow] + bodyRows table = Table(table_matrix, colWidths=colWidths, repeatRows=1) table_header_style = styles.get("table_header", {}) table_cell_style = styles.get("table_cell", {}) table_style = [ ("BACKGROUND", (0, 0), (-1, 0), self._hexToColor(table_header_style.get("background", "#4F4F4F"))), ("BACKGROUND", (0, 1), (-1, -1), self._hexToColor(table_cell_style.get("background", "#FFFFFF"))), ("ALIGN", (0, 0), (-1, -1), "LEFT"), ("VALIGN", (0, 0), (-1, -1), "TOP"), ("LEFTPADDING", (0, 0), (-1, -1), 4), ("RIGHTPADDING", (0, 0), (-1, -1), 4), ("TOPPADDING", (0, 0), (-1, 0), 6), ("BOTTOMPADDING", (0, 0), (-1, 0), 8), ("TOPPADDING", (0, 1), (-1, -1), 4), ("BOTTOMPADDING", (0, 1), (-1, -1), 4), ("GRID", (0, 0), (-1, -1), 0.5, colors.black), ] table.setStyle(TableStyle(table_style)) return [table, Spacer(1, 12)] except Exception as e: self.logger.warning(f"Error rendering table: {str(e)}") return [] def _renderJsonBulletList(self, list_data: Dict[str, Any], styles: Dict[str, Any]) -> List[Any]: """Render a JSON bullet list to PDF elements using AI-generated styles.""" try: # Extract from nested content structure content = list_data.get("content", {}) if not isinstance(content, dict): return [] items = content.get("items", []) bullet_style_def = styles.get("bullet_list", {}) elements = [] for item in items: if isinstance(item, str): elements.append( Paragraph(f"• {self._markdownInlineToReportlabXml(item)}", self._createNormalStyle(styles)) ) elif isinstance(item, dict) and "text" in item: elements.append( Paragraph( f"• {self._markdownInlineToReportlabXml(item['text'])}", self._createNormalStyle(styles), ) ) if elements: elements.append(Spacer(1, bullet_style_def.get("space_after", 3))) return elements except Exception as e: self.logger.warning(f"Error rendering bullet list: {str(e)}") return [] def _renderJsonHeading(self, heading_data: Dict[str, Any], styles: Dict[str, Any]) -> List[Any]: """Render a JSON heading to PDF elements using AI-generated styles.""" try: # Extract from nested content structure content = heading_data.get("content", {}) if not isinstance(content, dict): return [] text = content.get("text", "") level = content.get("level", 1) if text: level = max(1, min(6, level)) heading_style = self._createHeadingStyle(styles, level) return [self._paragraphFromInlineMarkdown(text, heading_style)] return [] except Exception as e: self.logger.warning(f"Error rendering heading: {str(e)}") return [] def _renderJsonParagraph(self, paragraph_data: Dict[str, Any], styles: Dict[str, Any]) -> List[Any]: """Render a JSON paragraph to PDF elements using AI-generated styles.""" try: # Extract from nested content structure content = paragraph_data.get("content", {}) if isinstance(content, dict): text = content.get("text", "") elif isinstance(content, str): text = content else: text = "" if text: return [self._paragraphFromInlineMarkdown(text, self._createNormalStyle(styles))] return [] except Exception as e: self.logger.warning(f"Error rendering paragraph: {str(e)}") return [] def _renderJsonCodeBlock(self, code_data: Dict[str, Any], styles: Dict[str, Any]) -> List[Any]: """Render a JSON code block to PDF elements using AI-generated styles.""" try: # Extract from nested content structure content = code_data.get("content", {}) if not isinstance(content, dict): return [] code = content.get("code", "") language = content.get("language", "") code_style_def = styles.get("code_block", {}) if code: code = _prepareCodeBlockPlainText(code) code = _normalizePdfMonospaceText(code) elements = [] fs = code_style_def.get("font_size", 9) mono = code_style_def.get("font", "Courier") if language: lang_style = ParagraphStyle( "CodeLanguage", fontSize=fs, textColor=self._hexToColor(code_style_def.get("color", "#2F2F2F")), fontName="Helvetica-Bold", alignment=TA_LEFT, ) elements.append( Paragraph( self._escapeReportlabXml(f"Code ({language}):"), lang_style, ) ) approxCharWPt = max(fs * 0.52, 4.5) usableWidth = _PDF_CONTENT_WIDTH_PT - 16 # left+right padding maxLineChars = max(48, int(usableWidth / approxCharWPt)) bg_col = self._hexToColor(code_style_def.get("background", "#F5F5F5")) leading = fs * 1.2 spaceAfter = code_style_def.get("space_after", 6) # Each source line may wrap to ceil(len/maxLineChars) visual lines. # Frame height ~740pt minus padding → keep rendered height < 600pt. maxVisualLinesPerChunk = max(8, int(600 / leading)) srcLines = code.split("\n") chunks: List[List[str]] = [] curChunk: List[str] = [] curVisual = 0 for sl in srcLines: wrapped = max(1, -(-len(sl) // maxLineChars)) if sl else 1 if curVisual + wrapped > maxVisualLinesPerChunk and curChunk: chunks.append(curChunk) curChunk = [] curVisual = 0 curChunk.append(sl) curVisual += wrapped if curChunk: chunks.append(curChunk) for ci, chunkLines in enumerate(chunks): chunkText = "\n".join(chunkLines) styleId = f"CodePre_{id(code_data) & 0xFFFFFFFF}_{ci}" codePrStyle = ParagraphStyle( styleId, fontName=mono, fontSize=fs, leading=leading, textColor=self._hexToColor(code_style_def.get("color", "#2F2F2F")), alignment=TA_LEFT, leftIndent=0, rightIndent=0, ) pf = Preformatted(chunkText, codePrStyle, dedent=0, maxLineLength=maxLineChars) tbl = Table([[pf]], colWidths=[_PDF_CONTENT_WIDTH_PT]) tbl.setStyle( TableStyle( [ ("BACKGROUND", (0, 0), (-1, -1), bg_col), ("VALIGN", (0, 0), (-1, -1), "TOP"), ("LEFTPADDING", (0, 0), (-1, -1), 8), ("RIGHTPADDING", (0, 0), (-1, -1), 8), ("TOPPADDING", (0, 0), (-1, -1), 6), ("BOTTOMPADDING", (0, 0), (-1, -1), 6), ] ) ) tbl.spaceAfter = 0 if ci < len(chunks) - 1 else spaceAfter elements.append(tbl) return elements return [] except Exception as e: self.logger.warning(f"Error rendering code block: {str(e)}") return [] def _renderJsonImage(self, image_data: Dict[str, Any], styles: Dict[str, Any]) -> List[Any]: """Render a JSON image to PDF elements using reportlab.""" try: # Extract from nested content structure content = image_data.get("content", {}) base64_data = "" alt_text = "Image" caption = "" if isinstance(content, dict): # Nested content structure base64_data = content.get("base64Data", "") alt_text = content.get("altText", "Image") caption = content.get("caption", "") elif isinstance(content, str): # Content might be base64 string directly (shouldn't happen, but handle it) self.logger.warning("Image content is a string, not a dict. This should not happen.") return [Paragraph(f"[Image: Invalid format]", self._createNormalStyle(styles))] # If base64Data not found in content, try direct element fields (fallback) if not base64_data: base64_data = image_data.get("base64Data", "") if not alt_text or alt_text == "Image": alt_text = image_data.get("altText", "Image") if not caption: caption = image_data.get("caption", "") # If base64Data still not found, try extracting from url data URI if not base64_data: url = image_data.get("url", "") or (content.get("url", "") if isinstance(content, dict) else "") if url and isinstance(url, str) and url.startswith("data:image/"): # Extract base64 from data URI: data:image/png;base64, import re match = re.match(r'data:image/[^;]+;base64,(.+)', url) if match: base64_data = match.group(1) if not base64_data: self.logger.warning(f"No base64 data found for image. Alt text: {alt_text}") return [Paragraph(f"[Image: {alt_text}]", self._createNormalStyle(styles))] # Validate that base64_data is actually base64 (not the entire element rendered as text) if len(base64_data) > 10000: # Very long string might be entire element JSON self.logger.warning(f"Base64 data seems too long ({len(base64_data)} chars), might be incorrectly extracted") # Ensure base64_data is a string, not bytes or other type if not isinstance(base64_data, str): self.logger.warning(f"Base64 data is not a string: {type(base64_data)}") return [Paragraph(f"[Image: {alt_text} - Invalid data type]", self._createNormalStyle(styles))] try: from reportlab.platypus import Image as ReportLabImage from reportlab.lib.units import inch import base64 import io # Decode base64 image data imageBytes = base64.b64decode(base64_data) imageStream = io.BytesIO(imageBytes) # Create reportlab Image element # Try to get image dimensions from PIL try: from PIL import Image as PILImage from reportlab.lib.pagesizes import A4 pilImage = PILImage.open(imageStream) originalWidth, originalHeight = pilImage.size # Calculate available page dimensions (A4 with margins: 72pt left/right, 72pt top, 18pt bottom) pageWidth = A4[0] # 595.27 points pageHeight = A4[1] # 841.89 points leftMargin = 72 rightMargin = 72 topMargin = 72 bottomMargin = 18 # Use actual frame dimensions from SimpleDocTemplate # Frame is smaller than page minus margins due to internal spacing # From error message: frame is 439.27559055118115 x 739.8897637795277 # Use conservative values with safety margin availableWidth = 430.0 # Slightly smaller than frame width for safety availableHeight = 730.0 # Slightly smaller than frame height for safety # Convert original image size from pixels to points # PIL provides size in pixels, need to convert to points # Standard conversion: 1 inch = 72 points, typical screen DPI = 96 pixels/inch # So: pixels * (72/96) = points, or pixels * 0.75 = points # But for images, we should use the image's actual DPI if available dpi = pilImage.info.get('dpi', (96, 96))[0] # Default to 96 DPI if not specified if dpi <= 0: dpi = 96 # Fallback to 96 DPI # Convert pixels to points: 1 point = 1/72 inch, so pixels * (72/dpi) = points imgWidthPoints = originalWidth * (72.0 / dpi) imgHeightPoints = originalHeight * (72.0 / dpi) # Scale to fit within available page dimensions while maintaining aspect ratio widthScale = availableWidth / imgWidthPoints if imgWidthPoints > 0 else 1.0 heightScale = availableHeight / imgHeightPoints if imgHeightPoints > 0 else 1.0 # Use the smaller scale to ensure image fits both width and height scale = min(widthScale, heightScale, 1.0) # Don't scale up, only down imgWidth = imgWidthPoints * scale imgHeight = imgHeightPoints * scale # Additional safety check: ensure dimensions don't exceed available space if imgWidth > availableWidth: scale = availableWidth / imgWidth imgWidth = availableWidth imgHeight = imgHeight * scale if imgHeight > availableHeight: scale = availableHeight / imgHeight imgHeight = availableHeight imgWidth = imgWidth * scale # Reset stream for reportlab imageStream.seek(0) except Exception as e: # Fallback: use default size that fits page self.logger.warning(f"Error calculating image size: {str(e)}, using safe default") # Use 80% of available width as safe default imgWidth = 4 * inch # ~288 points, safe for ~451pt available width imgHeight = 3 * inch # ~216 points, safe for ~751pt available height imageStream.seek(0) # Create reportlab Image reportlabImage = ReportLabImage(imageStream, width=imgWidth, height=imgHeight) elements = [reportlabImage] # Add caption if available if caption: captionStyle = self._createNormalStyle(styles) captionStyle.fontSize = 10 captionStyle.textColor = self._hexToColor(styles.get("paragraph", {}).get("color", "#666666")) elements.append(Paragraph(f"{caption}", captionStyle)) elif alt_text and alt_text != "Image": # Use alt text as caption if no caption provided, but avoid usageHint format if "Render as visual element:" in alt_text: # Extract filename from usageHint if possible parts = alt_text.split("Render as visual element:") if len(parts) > 1: filename = parts[1].strip() caption_text = f"Figure: {filename}" else: caption_text = alt_text else: caption_text = f"Figure: {alt_text}" captionStyle = self._createNormalStyle(styles) captionStyle.fontSize = 10 captionStyle.textColor = self._hexToColor(styles.get("paragraph", {}).get("color", "#666666")) elements.append(Paragraph(f"{caption_text}", captionStyle)) return elements except Exception as imgError: self.logger.error(f"Error embedding image in PDF: {str(imgError)}") # Return error message instead of placeholder errorStyle = self._createNormalStyle(styles) errorStyle.textColor = self._hexToColor("#FF0000") # Red color for error errorMsg = f"[Error: Could not embed image '{alt_text}'. {str(imgError)}]" return [Paragraph(errorMsg, errorStyle)] except Exception as e: self.logger.error(f"Error rendering image: {str(e)}") errorStyle = self._createNormalStyle(styles) errorStyle.textColor = self._hexToColor("#FF0000") # Red color for error errorMsg = f"[Error: Could not render image '{image_data.get('altText', 'Image')}'. {str(e)}]" return [Paragraph(errorMsg, errorStyle)]