From 9186c60ad210459529bcb81e635b34e79ed7e4e8 Mon Sep 17 00:00:00 2001 From: ValueOn AG Date: Sun, 22 Mar 2026 11:09:48 +0100 Subject: [PATCH] fixed rendering issues --- modules/datamodels/datamodelDataSource.py | 7 +- .../workspace/routeFeatureWorkspace.py | 33 +- modules/interfaces/interfaceAiObjects.py | 2 + .../services/serviceAgent/agentLoop.py | 40 +- .../services/serviceAgent/mainServiceAgent.py | 88 ++- .../services/serviceChat/mainServiceChat.py | 3 +- .../renderers/rendererDocx.py | 68 +- .../renderers/rendererPdf.py | 411 +++++++--- .../renderers/rendererPptx.py | 740 +++++++----------- .../unit/services/test_renderer_pdf_smoke.py | 253 ++++++ 10 files changed, 1011 insertions(+), 634 deletions(-) create mode 100644 tests/unit/services/test_renderer_pdf_smoke.py diff --git a/modules/datamodels/datamodelDataSource.py b/modules/datamodels/datamodelDataSource.py index 86e0c7ec..f8238fab 100644 --- a/modules/datamodels/datamodelDataSource.py +++ b/modules/datamodels/datamodelDataSource.py @@ -19,7 +19,11 @@ class DataSource(BaseModel): connectionId: str = Field(description="FK to UserConnection") sourceType: str = Field(description="sharepointFolder, googleDriveFolder, outlookFolder, ftpFolder") path: str = Field(description="External path (e.g. '/sites/MySite/Documents/Reports')") - label: str = Field(description="User-visible label") + label: str = Field(description="User-visible label (often the last path segment)") + displayPath: Optional[str] = Field( + default=None, + description="Human-readable full path for UI (connection-relative, slash-separated)", + ) featureInstanceId: Optional[str] = Field(default=None, description="Scoped to feature instance") mandateId: Optional[str] = Field(default=None, description="Mandate scope") userId: str = Field(default="", description="Owner user ID") @@ -37,6 +41,7 @@ registerModelLabels( "sourceType": {"en": "Source Type", "de": "Quellentyp", "fr": "Type de source"}, "path": {"en": "Path", "de": "Pfad", "fr": "Chemin"}, "label": {"en": "Label", "de": "Bezeichnung", "fr": "Libellé"}, + "displayPath": {"en": "Display path", "de": "Anzeigepfad", "fr": "Chemin affiché"}, "featureInstanceId": {"en": "Feature Instance", "de": "Feature-Instanz", "fr": "Instance de fonctionnalité"}, "mandateId": {"en": "Mandate ID", "de": "Mandanten-ID", "fr": "ID du mandat"}, "userId": {"en": "User ID", "de": "Benutzer-ID", "fr": "ID utilisateur"}, diff --git a/modules/features/workspace/routeFeatureWorkspace.py b/modules/features/workspace/routeFeatureWorkspace.py index cf8efc04..d0dd22da 100644 --- a/modules/features/workspace/routeFeatureWorkspace.py +++ b/modules/features/workspace/routeFeatureWorkspace.py @@ -1139,6 +1139,7 @@ class CreateDataSourceRequest(BaseModel): sourceType: str = Field(description="Source type") path: str = Field(description="Path") label: str = Field(description="Label") + displayPath: Optional[str] = Field(default=None, description="Full human-readable path for tooltips") @router.post("/{instanceId}/datasources") @@ -1165,6 +1166,7 @@ async def createWorkspaceDataSource( path=body.path, label=body.label, featureInstanceId=instanceId, + displayPath=body.displayPath, ) return JSONResponse(dataSource if isinstance(dataSource, dict) else dataSource.model_dump()) @@ -1214,7 +1216,7 @@ async def listFeatureConnections( userMandates = rootIf.getUserMandates(userId) if not userMandates: - return JSONResponse({"featureConnections": []}) + return JSONResponse({"featureConnectionsByMandate": []}) mandateLabels: dict = {} for um in userMandates: @@ -1226,7 +1228,7 @@ async def listFeatureConnections( except Exception: mandateLabels[um.mandateId] = um.mandateId - items = [] + byMandate: dict = {} seenIds: set = set() for um in userMandates: allInstances = rootIf.getFeatureInstancesByMandate(um.mandateId) @@ -1244,20 +1246,33 @@ async def listFeatureConnections( featureDef = catalog.getFeatureDefinition(inst.featureCode) or {} dataObjects = catalog.getDataObjects(inst.featureCode) - mLabel = mandateLabels.get(inst.mandateId, "") label = inst.label or inst.featureCode - if mLabel: - label = f"{label} ({mLabel})" - items.append({ + mid = inst.mandateId + connItem = { "featureInstanceId": inst.id, "featureCode": inst.featureCode, - "mandateId": inst.mandateId, + "mandateId": mid, "label": label, "icon": featureDef.get("icon", "mdi-database"), "tableCount": len(dataObjects), - }) + } + if mid not in byMandate: + byMandate[mid] = [] + byMandate[mid].append(connItem) - return JSONResponse({"featureConnections": items}) + def _sortKeyLabel(x: dict) -> str: + return (x.get("label") or "").lower() + + groups = [] + for mid in sorted(byMandate.keys(), key=lambda m: (mandateLabels.get(m, m) or "").lower()): + conns = sorted(byMandate[mid], key=_sortKeyLabel) + groups.append({ + "mandateId": mid, + "mandateLabel": mandateLabels.get(mid, mid), + "featureConnections": conns, + }) + + return JSONResponse({"featureConnectionsByMandate": groups}) @router.get("/{instanceId}/feature-connections/{fiId}/tables") diff --git a/modules/interfaces/interfaceAiObjects.py b/modules/interfaces/interfaceAiObjects.py index 981a6b46..f0aedc87 100644 --- a/modules/interfaces/interfaceAiObjects.py +++ b/modules/interfaces/interfaceAiObjects.py @@ -332,6 +332,7 @@ class AiObjects: errorCount=0, toolCalls=responseToolCalls ) + response._modelMaxTokens = model.maxTokens if self.billingCallback: try: @@ -470,6 +471,7 @@ class AiObjects: errorCount=0, toolCalls=responseToolCalls, ) + response._modelMaxTokens = model.maxTokens if self.billingCallback: try: diff --git a/modules/serviceCenter/services/serviceAgent/agentLoop.py b/modules/serviceCenter/services/serviceAgent/agentLoop.py index 69fe31b2..bee03424 100644 --- a/modules/serviceCenter/services/serviceAgent/agentLoop.py +++ b/modules/serviceCenter/services/serviceAgent/agentLoop.py @@ -276,6 +276,7 @@ async def runAgentLoop( "userId": userId, "featureInstanceId": featureInstanceId, "mandateId": mandateId, + "modelMaxOutputTokens": getattr(aiResponse, "_modelMaxTokens", None) or 0, }) state.totalToolCalls += len(results) @@ -439,6 +440,29 @@ def _repairTruncatedJson(raw: str) -> Optional[Dict[str, Any]]: return None +def _validateRepairedToolArgs(toolName: str, args: Dict[str, Any]) -> Optional[str]: + """After closeJsonStructures + json.loads, args can be syntactically valid but useless (truncation + cut off before required fields). Return a user-facing _parseError message, or None if OK. + + Without this, renderDocument runs with missing `content` and only returns \"content is required\", + hiding the real cause (output token limit). + """ + if toolName == "renderDocument": + content = args.get("content") + sourceFileId = args.get("sourceFileId") + hasInline = isinstance(content, str) and bool(content.strip()) + hasFile = isinstance(sourceFileId, str) and bool(sourceFileId.strip()) + if not hasInline and not hasFile: + return ( + "Your tool call JSON was repaired after truncation, but neither `content` nor `sourceFileId` is usable. " + "Large documents must not be inlined in the tool call (output limit).\n" + "Preferred: writeFile(mode='create') + writeFile(mode='append') to build a .md file, then " + "renderDocument(sourceFileId=, outputFormat='pdf', title='...') — the tool call stays small.\n" + "Alternatives: replaceInFile for edits; shorter outline first." + ) + return None + + def _parseToolCalls(aiResponse: AiCallResponse) -> List[ToolCallRequest]: """Parse tool calls from AI response. Supports native function calling and text-based fallback.""" toolCalls = [] @@ -457,14 +481,20 @@ def _parseToolCalls(aiResponse: AiCallResponse) -> List[ToolCallRequest]: logger.warning(f"Unrecoverable truncated JSON for '{tc['function']['name']}': {rawArgs[:200]}") parsedArgs = {"_parseError": ( "Your tool call arguments were truncated (output cut off by token limit). " - "The content is too large for a single tool call. Strategies:\n" - "1. For new files: use writeFile(mode='create') with the first part, " - "then writeFile(fileId=..., mode='append') for subsequent parts (~8000 chars each).\n" - "2. For editing existing files: use replaceInFile to change only the specific parts.\n" - "3. For documentation: split into multiple smaller files." + "Do not put the full document body in renderDocument JSON.\n" + "1. writeFile(create) + writeFile(append) to a .md file, then " + "renderDocument(sourceFileId=, outputFormat=..., title=...) — tiny tool call.\n" + "2. Or replaceInFile for targeted edits.\n" + "3. Or split into multiple smaller files." )} else: logger.info(f"Repaired truncated JSON for '{tc['function']['name']}'") + repairIssue = _validateRepairedToolArgs(tc["function"]["name"], parsedArgs) + if repairIssue: + logger.warning( + f"Repaired JSON for '{tc['function']['name']}' still invalid for execution: {repairIssue[:80]}..." + ) + parsedArgs = {"_parseError": repairIssue} else: parsedArgs = rawArgs if rawArgs else {} toolCalls.append(ToolCallRequest( diff --git a/modules/serviceCenter/services/serviceAgent/mainServiceAgent.py b/modules/serviceCenter/services/serviceAgent/mainServiceAgent.py index cec64813..03b8598e 100644 --- a/modules/serviceCenter/services/serviceAgent/mainServiceAgent.py +++ b/modules/serviceCenter/services/serviceAgent/mainServiceAgent.py @@ -259,7 +259,9 @@ class AgentService: "Use `readFile(fileId)` to read text content, `readContentObjects(fileId)` for structured access, " "or `describeImage(fileId)` for image analysis.\n" "For folders, use `listFiles(folderId)` to get the files inside, then `readFile(fileId)` for each.\n" - "When generating documents with `renderDocument`, embed images using `![alt text](file:fileId)` in the markdown content.\n\n" + "For large PDFs/DOCX, avoid huge `renderDocument` tool JSON: build markdown with " + "`writeFile` (create + append), then `renderDocument(sourceFileId=that file id, outputFormat=...)`.\n" + "For small docs you may pass `content` inline. Embed images with `![alt](file:fileId)` in markdown.\n\n" ) header += "\n\n".join(fileDescriptions) return f"{header}\n\n---\n\nUser request: {prompt}" @@ -2209,13 +2211,75 @@ def _registerCoreTools(registry: ToolRegistry, services): async def _renderDocument(args: Dict[str, Any], context: Dict[str, Any]): """Render agent-produced markdown content into any document format via the RendererRegistry.""" import re as _re + sourceFileId = (args.get("sourceFileId") or "").strip() content = args.get("content", "") + if not isinstance(content, str): + content = str(content) if content is not None else "" outputFormat = args.get("outputFormat", "pdf") title = args.get("title", "Document") language = args.get("language", "de") - if not content: - return ToolResult(toolCallId="", toolName="renderDocument", success=False, error="content is required") + if sourceFileId: + try: + dbMgmt = services.chat.interfaceDbComponent + fileRow = dbMgmt.getFile(sourceFileId) + if not fileRow: + return ToolResult( + toolCallId="", + toolName="renderDocument", + success=False, + error=f"sourceFileId not found: {sourceFileId}", + ) + rawBytes = dbMgmt.getFileData(sourceFileId) + if not rawBytes: + return ToolResult( + toolCallId="", + toolName="renderDocument", + success=False, + error=f"sourceFileId has no data: {sourceFileId}", + ) + try: + content = rawBytes.decode("utf-8") + except UnicodeDecodeError: + content = rawBytes.decode("latin-1", errors="replace") + except Exception as e: + return ToolResult( + toolCallId="", + toolName="renderDocument", + success=False, + error=f"Could not read sourceFileId: {e}", + ) + + if not (content or "").strip(): + return ToolResult( + toolCallId="", + toolName="renderDocument", + success=False, + error=( + "Provide non-empty `content` (markdown) or `sourceFileId` (id of a .md/.txt from writeFile). " + "For long documents use writeFile create+append, then renderDocument(sourceFileId=...)." + ), + ) + + modelMaxTokens = context.get("modelMaxOutputTokens", 0) + _inlineCharLimit = int(modelMaxTokens * 3 * 0.5) if modelMaxTokens > 0 else 6000 + _inlineCharLimit = max(_inlineCharLimit, 3000) + + if not sourceFileId and len(content) > _inlineCharLimit: + return ToolResult( + toolCallId="", + toolName="renderDocument", + success=False, + error=( + f"Inline `content` is {len(content)} chars — over the {_inlineCharLimit} char limit " + f"(derived from model output budget of {modelMaxTokens} tokens). " + "Large documents must use the file path:\n" + "1. writeFile(mode='create', name='draft.md', content=)\n" + "2. writeFile(mode='append', fileId=, content=) — repeat as needed\n" + "3. renderDocument(sourceFileId=, outputFormat='pdf', title='...')\n" + "This avoids output truncation entirely." + ), + ) try: structuredContent = _markdownToDocumentJson(content, title, language) @@ -2321,20 +2385,26 @@ def _registerCoreTools(registry: ToolRegistry, services): registry.register( "renderDocument", _renderDocument, description=( - "Render markdown content into a document file (PDF, DOCX, XLSX, PPTX, CSV, HTML, MD, JSON, TXT). " - "You write the full document content as markdown, then this tool converts and renders it. " - "To embed images from uploaded files, use markdown image syntax with the file ID: ![alt text](file:fileId). " - "The images will be resolved from the Knowledge Store and embedded in the output document." + "Render markdown into a document file (PDF, DOCX, XLSX, PPTX, CSV, HTML, MD, JSON, TXT). " + "For long documents: write markdown with writeFile (mode=create then append chunks), then call this tool with " + "`sourceFileId` only (tiny JSON — avoids model output truncation). For short docs you may pass `content` inline. " + "Images: ![alt text](file:fileId) in the markdown." ), parameters={ "type": "object", "properties": { - "content": {"type": "string", "description": "Full document content as markdown (headings, tables, lists, code blocks, paragraphs, images via ![alt](file:fileId))"}, + "content": { + "type": "string", + "description": "Full markdown inline. Prefer `sourceFileId` when the document is large (many KB).", + }, + "sourceFileId": { + "type": "string", + "description": "Chat file id of markdown saved via writeFile (create+append). Use this instead of `content` for long PDFs.", + }, "outputFormat": {"type": "string", "description": "Target format: pdf, docx, xlsx, pptx, csv, html, md, json, txt", "default": "pdf"}, "title": {"type": "string", "description": "Document title", "default": "Document"}, "language": {"type": "string", "description": "Document language (ISO 639-1)", "default": "de"}, }, - "required": ["content"], }, readOnly=False, ) diff --git a/modules/serviceCenter/services/serviceChat/mainServiceChat.py b/modules/serviceCenter/services/serviceChat/mainServiceChat.py index 3ec1c504..5cc1eb66 100644 --- a/modules/serviceCenter/services/serviceChat/mainServiceChat.py +++ b/modules/serviceCenter/services/serviceChat/mainServiceChat.py @@ -508,7 +508,7 @@ class ChatService: def createDataSource( self, connectionId: str, sourceType: str, path: str, label: str, - featureInstanceId: str = None + featureInstanceId: str = None, displayPath: str = None, ) -> Dict[str, Any]: """Create a new external data source reference.""" from modules.datamodels.datamodelDataSource import DataSource @@ -517,6 +517,7 @@ class ChatService: sourceType=sourceType, path=path, label=label, + displayPath=displayPath, featureInstanceId=featureInstanceId or self._context.feature_instance_id or "", mandateId=self._context.mandate_id or "", userId=self.user.id if self.user else "", diff --git a/modules/serviceCenter/services/serviceGeneration/renderers/rendererDocx.py b/modules/serviceCenter/services/serviceGeneration/renderers/rendererDocx.py index 850f6aa8..733b9ade 100644 --- a/modules/serviceCenter/services/serviceGeneration/renderers/rendererDocx.py +++ b/modules/serviceCenter/services/serviceGeneration/renderers/rendererDocx.py @@ -281,7 +281,7 @@ class RendererDocx(BaseRenderer): def _getDefaultStyleSet(self) -> Dict[str, Any]: """Default DOCX style set - used when no style instructions present.""" return { - "title": {"font_size": 24, "color": "#1F4E79", "bold": True, "align": "center"}, + "title": {"font_size": 24, "color": "#1F4E79", "bold": True, "align": "left"}, "heading1": {"font_size": 18, "color": "#2F2F2F", "bold": True, "align": "left"}, "heading2": {"font_size": 14, "color": "#4F4F4F", "bold": True, "align": "left"}, "paragraph": {"font_size": 11, "color": "#2F2F2F", "bold": False, "align": "left"}, @@ -349,11 +349,11 @@ class RendererDocx(BaseRenderer): para.runs[0].italic = True continue elif element_type == "extracted_text": - # Extracted text format - render as paragraph content = element.get("content", "") source = element.get("source", "") if content: - para = doc.add_paragraph(content) + para = doc.add_paragraph() + self._addMarkdownInlineRuns(para, content) if source: para.add_run(f" (Source: {source})").italic = True continue @@ -406,6 +406,37 @@ class RendererDocx(BaseRenderer): # Add error paragraph as fallback error_para = doc.add_paragraph(f"[Error rendering section: {str(e)}]") + # ── Markdown inline → python-docx runs ────────────────────────────── + _MD_INLINE_RE = re.compile( + r"(\*\*(.+?)\*\*)" # group 1,2: bold + r"|(__(.+?)__)" # group 3,4: bold (underscore) + r"|(? None: + """Parse markdown inline formatting and add corresponding Runs to a python-docx paragraph.""" + pos = 0 + for m in self._MD_INLINE_RE.finditer(text): + if m.start() > pos: + paragraph.add_run(text[pos:m.start()]) + if m.group(2): + paragraph.add_run(m.group(2)).bold = True + elif m.group(4): + paragraph.add_run(m.group(4)).bold = True + elif m.group(5): + paragraph.add_run(m.group(5)).italic = True + elif m.group(6): + paragraph.add_run(m.group(6)).italic = True + elif m.group(7): + run = paragraph.add_run(m.group(7)) + run.font.name = "Courier New" + run.font.size = Pt(9) + pos = m.end() + if pos < len(text): + paragraph.add_run(text[pos:]) + def _renderJsonTable(self, doc: Document, table_data: Dict[str, Any], styles: Dict[str, Any]) -> None: """ Render a JSON table to DOCX using AI-generated styles. @@ -480,9 +511,8 @@ class RendererDocx(BaseRenderer): tblW.set(qn('w:w'), '0') tblPr.append(tblW) - # Center alignment jc = OxmlElement('w:jc') - jc.set(qn('w:val'), 'center') + jc.set(qn('w:val'), 'left') tblPr.append(jc) # Apply table borders directly (works without template styles) @@ -821,10 +851,11 @@ class RendererDocx(BaseRenderer): text_color_rgb = RGBColor(int(color_hex[0:2], 16), int(color_hex[2:4], 16), int(color_hex[4:6], 16)) for item in items: - if isinstance(item, str): - para = doc.add_paragraph(item, style='List Bullet') - elif isinstance(item, dict) and "text" in item: - para = doc.add_paragraph(item["text"], style='List Bullet') + itemText = item if isinstance(item, str) else (item.get("text", "") if isinstance(item, dict) else "") + if not itemText: + continue + para = doc.add_paragraph(style='List Bullet') + self._addMarkdownInlineRuns(para, itemText) # Apply bullet list styling from style set - use cached objects if bullet_style and para.runs: @@ -849,7 +880,6 @@ class RendererDocx(BaseRenderer): def _renderJsonHeading(self, doc: Document, heading_data: Dict[str, Any], styles: Dict[str, Any]) -> None: """Render a JSON heading to DOCX using AI-generated styles.""" try: - # Extract from nested content structure content = heading_data.get("content", {}) if not isinstance(content, dict): return @@ -858,13 +888,13 @@ class RendererDocx(BaseRenderer): if text: level = max(1, min(6, level)) - # Use custom heading style if available, otherwise use built-in - style_name = f"Heading {level}" if level <= 2 else "Heading 1" + # python-docx supports Heading 1 – Heading 9 as built-in styles try: - para = doc.add_paragraph(text, style=style_name) - except KeyError: - # Fallback to built-in heading if custom style doesn't exist - doc.add_heading(text, level=level) + para = doc.add_heading("", level=level) + para.clear() + self._addMarkdownInlineRuns(para, text) + except (KeyError, ValueError): + para = doc.add_paragraph(text) except Exception as e: self.logger.warning(f"Error rendering heading: {str(e)}") @@ -893,8 +923,8 @@ class RendererDocx(BaseRenderer): return if text: - para = doc.add_paragraph(text) - # Apply paragraph styling from style set - OPTIMIZED: pre-calculate style objects + para = doc.add_paragraph() + self._addMarkdownInlineRuns(para, text) paragraph_style = styles.get("paragraph", {}) if paragraph_style: # Pre-calculate and cache style objects @@ -1345,7 +1375,7 @@ class RendererDocx(BaseRenderer): # Create table table = doc.add_table(rows=len(table_data), cols=len(table_data[0])) - table.alignment = WD_TABLE_ALIGNMENT.CENTER + table.alignment = WD_TABLE_ALIGNMENT.LEFT # Add data to table for row_idx, row_data in enumerate(table_data): diff --git a/modules/serviceCenter/services/serviceGeneration/renderers/rendererPdf.py b/modules/serviceCenter/services/serviceGeneration/renderers/rendererPdf.py index 6cbc8a9c..a5c9dc93 100644 --- a/modules/serviceCenter/services/serviceGeneration/renderers/rendererPdf.py +++ b/modules/serviceCenter/services/serviceGeneration/renderers/rendererPdf.py @@ -4,6 +4,10 @@ PDF renderer for report generation using reportlab. """ +from __future__ import annotations + +import unicodedata + from .documentRendererBaseTemplate import BaseRenderer from modules.datamodels.datamodelDocument import RenderedDocument from typing import Dict, Any, List, Optional @@ -11,8 +15,8 @@ import io import base64 try: - from reportlab.lib.pagesizes import letter, A4 - from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, Table, TableStyle, PageBreak + from reportlab.lib.pagesizes import A4 + from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, Table, TableStyle, Preformatted from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle from reportlab.lib.units import inch from reportlab.lib import colors @@ -21,6 +25,53 @@ try: except ImportError: REPORTLAB_AVAILABLE = False +import re as _re_pdf + +# A4 width in pt; margins must match SimpleDocTemplate(leftMargin/rightMargin) +_PDF_MARGIN_LR_PT = 72.0 +_PDF_A4_WIDTH_PT = 595.27 +_PDF_CONTENT_WIDTH_PT = _PDF_A4_WIDTH_PT - (2 * _PDF_MARGIN_LR_PT) + + +def _boxDrawingCharToAscii(ch: str) -> str: + """Map one box-drawing character to ASCII (Courier has no glyphs for U+2500–U+257F).""" + nm = unicodedata.name(ch, "") + v = "VERTICAL" in nm + h = "HORIZONTAL" in nm + and_ = "AND" in nm + if v and h: + return "+" + if v and not h and not and_: + return "|" + if h and not v and not and_: + return "-" + return "+" + + +def _normalizePdfMonospaceText(text: str) -> str: + """Replace Unicode box/block drawing with ASCII so PDF core fonts render readable code/trees.""" + if not text: + return "" + out: List[str] = [] + for ch in text: + o = ord(ch) + if 0x2500 <= o <= 0x257F: + out.append(_boxDrawingCharToAscii(ch)) + elif 0x2580 <= o <= 0x259F: + out.append("#") + else: + out.append(ch) + return "".join(out) + + +def _prepareCodeBlockPlainText(text: str) -> str: + """Normalize newlines/tabs for preformatted code (no HTML/XML; spaces must stay significant).""" + if not text: + return "" + text = text.replace("\r\n", "\n").replace("\r", "\n") + return text.expandtabs(4) + + class RendererPdf(BaseRenderer): """Renders content to PDF format using reportlab.""" @@ -122,15 +173,6 @@ class RendererPdf(BaseRenderer): # Extract sections and metadata from standardized schema sections = self._extractSections(json_content) - metadata = self._extractMetadata(json_content) - - # Use provided title (which comes from documents[].title) as primary source - # Fallback to metadata.title only if title parameter is empty - document_title = title if title else metadata.get("title", "Generated Document") - - # Make title shorter to prevent wrapping/overlapping - if len(document_title) > 40: - document_title = "PowerOn - Consent Agreement" # Create a buffer to hold the PDF buffer = io.BytesIO() @@ -145,17 +187,9 @@ class RendererPdf(BaseRenderer): bottomMargin=18 ) - # Build PDF content + # Build PDF content (no cover page — body starts on page 1; filename still uses `title`) story = [] - # Title page - title_style = self._createTitleStyle(styles) - story.append(Paragraph(document_title, title_style)) - story.append(Spacer(1, 50)) # Increased spacing to prevent overlap - story.append(Paragraph(f"Generated: {self._formatTimestamp()}", self._createNormalStyle(styles))) - story.append(Spacer(1, 30)) # Add spacing before page break - story.append(PageBreak()) - # Process each section (sections already extracted above) self.services.utils.debugLogToFile(f"PDF SECTIONS TO PROCESS: {len(sections)} sections", "PDF_RENDERER") for i, section in enumerate(sections): @@ -164,10 +198,9 @@ class RendererPdf(BaseRenderer): self.services.utils.debugLogToFile(f"PDF SECTION {i} ELEMENTS: {len(section_elements)} elements", "PDF_RENDERER") story.extend(section_elements) - # Build PDF - doc.build(story) + # Build PDF — retry with oversized flowables removed on LayoutError + self._buildPdfWithOverflowGuard(doc, story, buffer) - # Get PDF content as base64 buffer.seek(0) pdf_bytes = buffer.getvalue() pdf_base64 = base64.b64encode(pdf_bytes).decode('utf-8') @@ -177,6 +210,42 @@ class RendererPdf(BaseRenderer): except Exception as e: self.logger.error(f"Error generating PDF from JSON: {str(e)}") raise Exception(f"PDF generation failed: {str(e)}") + + def _buildPdfWithOverflowGuard(self, doc, story: List[Any], buffer) -> None: + """Try doc.build(); on 'too large on page' LayoutError, drop the offending + flowable, log a warning, and retry (up to 5 times).""" + maxRetries = 5 + for attempt in range(maxRetries + 1): + try: + buffer.seek(0) + buffer.truncate() + doc.build(story) + return + except Exception as e: + msg = str(e) + if "too large on page" not in msg or attempt == maxRetries: + raise + # Identify the offending flowable from the error repr + self.logger.warning(f"PDF overflow (attempt {attempt + 1}): {msg} — removing oversized element and retrying") + removed = False + for idx, flowable in enumerate(story): + fRepr = repr(flowable) + if "Table" in fRepr and hasattr(flowable, '_cellvalues'): + try: + nRows = len(flowable._cellvalues) + nCols = len(flowable._cellvalues[0]) if flowable._cellvalues else 0 + if nRows == 1 and nCols == 1: + errPara = Paragraph( + "[Code block omitted — content too large for PDF page]", + self._createNormalStyle({}), + ) + story[idx] = errPara + removed = True + break + except Exception: + pass + if not removed: + raise async def _getStyleSet(self, extractedContent: Dict[str, Any] = None, userPrompt: str = None, aiService=None, templateName: str = None) -> Dict[str, Any]: """Get style set - use styles from document generation metadata if available, @@ -269,13 +338,18 @@ class RendererPdf(BaseRenderer): """Default PDF style set - used when no style instructions present.""" return { "title": {"font_size": 24, "color": "#1F4E79", "bold": True, "align": "center", "space_after": 30}, + # Markdown #..###### — sizes must strictly decrease (H1 largest … H6 smallest). "heading1": {"font_size": 18, "color": "#2F2F2F", "bold": True, "align": "left", "space_after": 12, "space_before": 12}, - "heading2": {"font_size": 14, "color": "#4F4F4F", "bold": True, "align": "left", "space_after": 8, "space_before": 8}, + "heading2": {"font_size": 15, "color": "#2F2F2F", "bold": True, "align": "left", "space_after": 10, "space_before": 10}, + "heading3": {"font_size": 13, "color": "#4F4F4F", "bold": True, "align": "left", "space_after": 8, "space_before": 8}, + "heading4": {"font_size": 12, "color": "#4F4F4F", "bold": True, "align": "left", "space_after": 6, "space_before": 6}, + "heading5": {"font_size": 11, "color": "#4F4F4F", "bold": True, "align": "left", "space_after": 6, "space_before": 6}, + "heading6": {"font_size": 10, "color": "#4F4F4F", "bold": True, "align": "left", "space_after": 4, "space_before": 4}, "paragraph": {"font_size": 11, "color": "#2F2F2F", "bold": False, "align": "left", "space_after": 6, "line_height": 1.2}, - "table_header": {"background": "#4F4F4F", "text_color": "#FFFFFF", "bold": True, "align": "center", "font_size": 12}, + "table_header": {"background": "#4F4F4F", "text_color": "#FFFFFF", "bold": True, "align": "left", "font_size": 12}, "table_cell": {"background": "#FFFFFF", "text_color": "#2F2F2F", "bold": False, "align": "left", "font_size": 10}, "bullet_list": {"font_size": 11, "color": "#2F2F2F", "space_after": 3}, - "code_block": {"font": "Courier", "font_size": 9, "color": "#2F2F2F", "background": "#F5F5F5", "space_after": 6} + "code_block": {"font": "Courier", "font_size": 9, "color": "#2F2F2F", "background": "#F5F5F5", "space_after": 6, "align": "left"} } async def _getAiStylesWithPdfColors(self, ai_service, style_template: str, default_styles: Dict[str, Any]) -> Dict[str, Any]: @@ -441,39 +515,35 @@ class RendererPdf(BaseRenderer): return color_value return default - - def _createTitleStyle(self, styles: Dict[str, Any]) -> ParagraphStyle: - """Create title style from style definitions.""" - title_style_def = styles.get("title", {}) - - # DEBUG: Show what color and spacing is being used for title - title_color = title_style_def.get("color", "#1F4E79") - title_space_after = title_style_def.get("space_after", 30) - self.services.utils.debugLogToFile(f"PDF TITLE COLOR: {title_color} -> {self._hexToColor(title_color)}", "PDF_RENDERER") - self.services.utils.debugLogToFile(f"PDF TITLE SPACE_AFTER: {title_space_after}", "PDF_RENDERER") - - return ParagraphStyle( - 'CustomTitle', - fontSize=title_style_def.get("font_size", 20), # Reduced from 24 to 20 - spaceAfter=title_style_def.get("space_after", 30), - alignment=self._getAlignment(title_style_def.get("align", "center")), - textColor=self._hexToColor(title_color), - leading=title_style_def.get("font_size", 20) * 1.4, # Add line spacing for multi-line titles - spaceBefore=0 # Ensure no space before title - ) - + def _defaultHeadingStyleDef(self, level: int) -> Dict[str, Any]: + """When heading{N} is missing from styles, never fall back to heading1 (that made H3 > H2).""" + sizes = {1: 18, 2: 15, 3: 13, 4: 12, 5: 11, 6: 10} + fs = sizes.get(level, 10) + sb = max(4, 14 - level) + return { + "font_size": fs, + "color": "#2F2F2F" if level <= 2 else "#4F4F4F", + "bold": True, + "align": "left", + "space_after": sb, + "space_before": sb, + } + def _createHeadingStyle(self, styles: Dict[str, Any], level: int) -> ParagraphStyle: """Create heading style from style definitions.""" heading_key = f"heading{level}" - heading_style_def = styles.get(heading_key, styles.get("heading1", {})) - + heading_style_def = styles.get(heading_key) or self._defaultHeadingStyleDef(level) + fs = heading_style_def.get("font_size", self._defaultHeadingStyleDef(level)["font_size"]) + bold = heading_style_def.get("bold", True) return ParagraphStyle( f'CustomHeading{level}', - fontSize=heading_style_def.get("font_size", 18 - level * 2), + fontName="Helvetica-Bold" if bold else "Helvetica", + fontSize=fs, spaceAfter=heading_style_def.get("space_after", 12), spaceBefore=heading_style_def.get("space_before", 12), alignment=self._getAlignment(heading_style_def.get("align", "left")), - textColor=self._hexToColor(heading_style_def.get("color", "#2F2F2F")) + textColor=self._hexToColor(heading_style_def.get("color", "#2F2F2F")), + leading=fs * 1.35, ) def _createNormalStyle(self, styles: Dict[str, Any]) -> ParagraphStyle: @@ -505,22 +575,6 @@ class RendererPdf(BaseRenderer): } return align_map.get(align.lower().strip(), TA_LEFT) - def _getTableAlignment(self, align: str) -> str: - """Convert alignment string to ReportLab table alignment string.""" - if not align or not isinstance(align, str): - return 'LEFT' - - align_map = { - "center": 'CENTER', - "left": 'LEFT', - "justify": 'LEFT', # Tables don't support justify, use LEFT - "right": 'RIGHT', - "0": 'LEFT', # Handle numeric strings - "1": 'CENTER', - "2": 'LEFT' # Tables don't support justify, use LEFT - } - return align_map.get(align.lower().strip(), 'LEFT') - def _hexToColor(self, hex_color: str) -> colors.Color: """Convert hex color to reportlab color.""" try: @@ -542,7 +596,66 @@ class RendererPdf(BaseRenderer): return colors.black except: return colors.black - + + def _escapeReportlabXml(self, text: str) -> str: + """Escape text for ReportLab Paragraph markup.""" + if not text: + return "" + return ( + text.replace("&", "&") + .replace("<", "<") + .replace(">", ">") + ) + + def _applyInlineMarkdownToEscapedPlain(self, text: str) -> str: + """Escape XML then apply bold/italic to a segment with no `code` spans (code is handled separately).""" + if not text: + return "" + s = self._escapeReportlabXml(text) + s = _re_pdf.sub(r"\*\*(.+?)\*\*", r"\1", s, flags=_re_pdf.DOTALL) + s = _re_pdf.sub(r"__(.+?)__", r"\1", s, flags=_re_pdf.DOTALL) + s = _re_pdf.sub(r"(?\1", s) + s = _re_pdf.sub(r"(?\1", s) + return s + + def _markdownInlineToReportlabXml(self, text: str) -> str: + """Turn common markdown inline (**bold**, *italic*, `code`) into ReportLab XML. + Backtick spans are extracted first so paths like `...//...` are not corrupted by + markdown patterns and XML escaping stays well-formed inside . + """ + if not text: + return "" + text = _normalizePdfMonospaceText(text) + out: List[str] = [] + pos = 0 + for m in _re_pdf.finditer(r"`([^`]*)`", text): + before = text[pos:m.start()] + out.append(self._applyInlineMarkdownToEscapedPlain(before)) + code = m.group(1) + out.append(f'{self._escapeReportlabXml(code)}') + pos = m.end() + out.append(self._applyInlineMarkdownToEscapedPlain(text[pos:])) + return "".join(out) + + def _paragraphFromInlineMarkdown(self, text: str, style: ParagraphStyle) -> Paragraph: + return Paragraph(self._markdownInlineToReportlabXml(text), style) + + def _createTableCellParagraphStyle( + self, styles: Dict[str, Any], *, header: bool, tableStyleKey: str + ) -> ParagraphStyle: + """Paragraph style for table cells (word wrap within colWidth).""" + tdef = styles.get(tableStyleKey, {}) + fs = tdef.get("font_size", 12 if header else 10) + defaultTc = "#FFFFFF" if header else "#2F2F2F" + return ParagraphStyle( + f"TblCell{'H' if header else 'B'}{tableStyleKey}", + fontSize=fs, + leading=fs * 1.25, + alignment=TA_LEFT, + textColor=self._hexToColor(tdef.get("text_color", defaultTc)), + fontName="Helvetica-Bold" if header and tdef.get("bold", True) else "Helvetica", + ) + def _renderJsonSection(self, section: Dict[str, Any], styles: Dict[str, Any]) -> List[Any]: """Render a single JSON section to PDF elements using AI-generated styles. Supports three content formats: reference, object (base64), extracted_text. @@ -575,8 +688,10 @@ class RendererPdf(BaseRenderer): content = element.get("content", "") source = element.get("source", "") if content: - source_text = f" (Source: {source})" if source else "" - all_elements.append(Paragraph(f"{content}{source_text}", self._createNormalStyle(styles))) + bodyXml = self._markdownInlineToReportlabXml(content) + if source: + bodyXml = f"{bodyXml} (Source: {self._escapeReportlabXml(source)})" + all_elements.append(Paragraph(bodyXml, self._createNormalStyle(styles))) all_elements.append(Spacer(1, 6)) continue @@ -618,10 +733,8 @@ class RendererPdf(BaseRenderer): return [Paragraph(f"[Error rendering section: {str(e)}]", self._createNormalStyle(styles))] def _renderJsonTable(self, table_data: Dict[str, Any], styles: Dict[str, Any]) -> List[Any]: - """Render a JSON table to PDF elements using AI-generated styles.""" + """Render a JSON table: left-aligned, width capped to printable area, cells wrap.""" try: - # Handle nested content structure: element.content.headers vs element.headers - # Extract from nested content structure content = table_data.get("content", {}) if not isinstance(content, dict): return [] @@ -631,30 +744,43 @@ class RendererPdf(BaseRenderer): if not headers or not rows: return [] - # Prepare table data - table_data_list = [headers] + rows - - # Create table - table = Table(table_data_list) - - # Apply styling + numCols = len(headers) + colWidth = _PDF_CONTENT_WIDTH_PT / max(numCols, 1) + colWidths = [colWidth] * numCols + + hdrPs = self._createTableCellParagraphStyle(styles, header=True, tableStyleKey="table_header") + cellPs = self._createTableCellParagraphStyle(styles, header=False, tableStyleKey="table_cell") + + def _cellPara(val, ps): + return self._paragraphFromInlineMarkdown(str(val) if val is not None else "", ps) + + headerRow = [_cellPara(h, hdrPs) for h in headers] + bodyRows = [] + for row in rows: + padded = list(row) + [""] * max(0, numCols - len(row)) + padded = padded[:numCols] + bodyRows.append([_cellPara(c, cellPs) for c in padded]) + + table_matrix = [headerRow] + bodyRows + table = Table(table_matrix, colWidths=colWidths, repeatRows=1) + table_header_style = styles.get("table_header", {}) table_cell_style = styles.get("table_cell", {}) - + table_style = [ - ('BACKGROUND', (0, 0), (-1, 0), self._hexToColor(table_header_style.get("background", "#4F4F4F"))), - ('TEXTCOLOR', (0, 0), (-1, 0), self._hexToColor(table_header_style.get("text_color", "#FFFFFF"))), - ('ALIGN', (0, 0), (-1, -1), self._getTableAlignment(table_cell_style.get("align", "left"))), - ('FONTNAME', (0, 0), (-1, 0), 'Helvetica-Bold' if table_header_style.get("bold", True) else 'Helvetica'), - ('FONTSIZE', (0, 0), (-1, 0), table_header_style.get("font_size", 12)), - ('BOTTOMPADDING', (0, 0), (-1, 0), 12), - ('BACKGROUND', (0, 1), (-1, -1), self._hexToColor(table_cell_style.get("background", "#FFFFFF"))), - ('FONTSIZE', (0, 1), (-1, -1), table_cell_style.get("font_size", 10)), - ('GRID', (0, 0), (-1, -1), 1, colors.black) + ("BACKGROUND", (0, 0), (-1, 0), self._hexToColor(table_header_style.get("background", "#4F4F4F"))), + ("BACKGROUND", (0, 1), (-1, -1), self._hexToColor(table_cell_style.get("background", "#FFFFFF"))), + ("ALIGN", (0, 0), (-1, -1), "LEFT"), + ("VALIGN", (0, 0), (-1, -1), "TOP"), + ("LEFTPADDING", (0, 0), (-1, -1), 4), + ("RIGHTPADDING", (0, 0), (-1, -1), 4), + ("TOPPADDING", (0, 0), (-1, 0), 6), + ("BOTTOMPADDING", (0, 0), (-1, 0), 8), + ("TOPPADDING", (0, 1), (-1, -1), 4), + ("BOTTOMPADDING", (0, 1), (-1, -1), 4), + ("GRID", (0, 0), (-1, -1), 0.5, colors.black), ] - table.setStyle(TableStyle(table_style)) - return [table, Spacer(1, 12)] except Exception as e: @@ -674,9 +800,16 @@ class RendererPdf(BaseRenderer): elements = [] for item in items: if isinstance(item, str): - elements.append(Paragraph(f"• {item}", self._createNormalStyle(styles))) + elements.append( + Paragraph(f"• {self._markdownInlineToReportlabXml(item)}", self._createNormalStyle(styles)) + ) elif isinstance(item, dict) and "text" in item: - elements.append(Paragraph(f"• {item['text']}", self._createNormalStyle(styles))) + elements.append( + Paragraph( + f"• {self._markdownInlineToReportlabXml(item['text'])}", + self._createNormalStyle(styles), + ) + ) if elements: elements.append(Spacer(1, bullet_style_def.get("space_after", 3))) @@ -700,7 +833,7 @@ class RendererPdf(BaseRenderer): if text: level = max(1, min(6, level)) heading_style = self._createHeadingStyle(styles, level) - return [Paragraph(text, heading_style)] + return [self._paragraphFromInlineMarkdown(text, heading_style)] return [] @@ -721,7 +854,7 @@ class RendererPdf(BaseRenderer): text = "" if text: - return [Paragraph(text, self._createNormalStyle(styles))] + return [self._paragraphFromInlineMarkdown(text, self._createNormalStyle(styles))] return [] @@ -741,27 +874,81 @@ class RendererPdf(BaseRenderer): code_style_def = styles.get("code_block", {}) if code: + code = _prepareCodeBlockPlainText(code) + code = _normalizePdfMonospaceText(code) elements = [] - + fs = code_style_def.get("font_size", 9) + mono = code_style_def.get("font", "Courier") + if language: lang_style = ParagraphStyle( - 'CodeLanguage', - fontSize=code_style_def.get("font_size", 9), + "CodeLanguage", + fontSize=fs, textColor=self._hexToColor(code_style_def.get("color", "#2F2F2F")), - fontName='Helvetica-Bold' + fontName="Helvetica-Bold", + alignment=TA_LEFT, ) - elements.append(Paragraph(f"Code ({language}):", lang_style)) - - code_style = ParagraphStyle( - 'CodeBlock', - fontSize=code_style_def.get("font_size", 9), - textColor=self._hexToColor(code_style_def.get("color", "#2F2F2F")), - fontName=code_style_def.get("font", "Courier"), - backColor=self._hexToColor(code_style_def.get("background", "#F5F5F5")), - spaceAfter=code_style_def.get("space_after", 6) - ) - elements.append(Paragraph(code, code_style)) - + elements.append( + Paragraph( + self._escapeReportlabXml(f"Code ({language}):"), + lang_style, + ) + ) + + approxCharWPt = max(fs * 0.52, 4.5) + usableWidth = _PDF_CONTENT_WIDTH_PT - 16 # left+right padding + maxLineChars = max(48, int(usableWidth / approxCharWPt)) + bg_col = self._hexToColor(code_style_def.get("background", "#F5F5F5")) + leading = fs * 1.2 + spaceAfter = code_style_def.get("space_after", 6) + + # Each source line may wrap to ceil(len/maxLineChars) visual lines. + # Frame height ~740pt minus padding → keep rendered height < 600pt. + maxVisualLinesPerChunk = max(8, int(600 / leading)) + srcLines = code.split("\n") + chunks: List[List[str]] = [] + curChunk: List[str] = [] + curVisual = 0 + for sl in srcLines: + wrapped = max(1, -(-len(sl) // maxLineChars)) if sl else 1 + if curVisual + wrapped > maxVisualLinesPerChunk and curChunk: + chunks.append(curChunk) + curChunk = [] + curVisual = 0 + curChunk.append(sl) + curVisual += wrapped + if curChunk: + chunks.append(curChunk) + + for ci, chunkLines in enumerate(chunks): + chunkText = "\n".join(chunkLines) + styleId = f"CodePre_{id(code_data) & 0xFFFFFFFF}_{ci}" + codePrStyle = ParagraphStyle( + styleId, + fontName=mono, + fontSize=fs, + leading=leading, + textColor=self._hexToColor(code_style_def.get("color", "#2F2F2F")), + alignment=TA_LEFT, + leftIndent=0, + rightIndent=0, + ) + pf = Preformatted(chunkText, codePrStyle, dedent=0, maxLineLength=maxLineChars) + tbl = Table([[pf]], colWidths=[_PDF_CONTENT_WIDTH_PT]) + tbl.setStyle( + TableStyle( + [ + ("BACKGROUND", (0, 0), (-1, -1), bg_col), + ("VALIGN", (0, 0), (-1, -1), "TOP"), + ("LEFTPADDING", (0, 0), (-1, -1), 8), + ("RIGHTPADDING", (0, 0), (-1, -1), 8), + ("TOPPADDING", (0, 0), (-1, -1), 6), + ("BOTTOMPADDING", (0, 0), (-1, -1), 6), + ] + ) + ) + tbl.spaceAfter = 0 if ci < len(chunks) - 1 else spaceAfter + elements.append(tbl) return elements return [] diff --git a/modules/serviceCenter/services/serviceGeneration/renderers/rendererPptx.py b/modules/serviceCenter/services/serviceGeneration/renderers/rendererPptx.py index 800b21ba..3bdff7f1 100644 --- a/modules/serviceCenter/services/serviceGeneration/renderers/rendererPptx.py +++ b/modules/serviceCenter/services/serviceGeneration/renderers/rendererPptx.py @@ -13,6 +13,15 @@ from modules.datamodels.datamodelDocument import RenderedDocument logger = logging.getLogger(__name__) +_PPTX_MD_INLINE_RE = re.compile( + r"(\*\*(.+?)\*\*)" + r"|(__(.+?)__)" + r"|(? 0 hasImages = len(slide_images) > 0 + isTitleSlide = slide_data.get("_isTitleSlide", False) - logger.info(f"Slide {i+1}: '{slide_data.get('title', 'No title')}' - sections: {len(slide_sections)}, images: {len(slide_images)}, content: {len(slide_content)} chars") + logger.info(f"Slide {i+1}: '{slide_data.get('title', 'No title')}' - sections: {len(slide_sections)}, images: {len(slide_images)}, content: {len(slide_content)} chars, titleSlide={isTitleSlide}") - # Use blank layout for all slides to avoid placeholder interference - # Find blank layout (typically index 6, fallback to 5) + # Title slide uses the built-in Title Slide layout (index 0) + if isTitleSlide: + titleLayout = prs.slide_layouts[0] + slide = prs.slides.add_slide(titleLayout) + try: + titleShape = slide.shapes.title + titleShape.text = slide_data.get("title", "") + titleStyle = styles.get("title", {}) + tf = titleShape.text_frame + if tf.paragraphs: + p = tf.paragraphs[0] + p.font.size = Pt(titleStyle.get("font_size", 36)) + p.font.bold = titleStyle.get("bold", True) + tColor = self._getSafeColor(titleStyle.get("color", (31, 78, 121))) + p.font.color.rgb = RGBColor(*tColor) + except Exception as titleErr: + logger.warning(f"Could not style title slide: {titleErr}") + # Clear subtitle placeholder + try: + sub = slide.placeholders[1] + sub.text = "" + except (KeyError, IndexError): + pass + continue + + # Content slides: use blank layout slideLayoutIndex = None for idx in [6, 5]: if idx < len(prs.slide_layouts): try: layout = prs.slide_layouts[idx] - # Check if it's a blank layout (no placeholders) if len(layout.placeholders) == 0: slideLayoutIndex = idx break except (AttributeError, IndexError): continue - # If no blank layout found, use layout with fewest placeholders if slideLayoutIndex is None: - min_placeholders = float('inf') + minPh = float('inf') for idx in range(len(prs.slide_layouts)): try: layout = prs.slide_layouts[idx] - placeholder_count = len(layout.placeholders) if hasattr(layout, 'placeholders') else 0 - if placeholder_count < min_placeholders: - min_placeholders = placeholder_count + phCount = len(layout.placeholders) if hasattr(layout, 'placeholders') else 0 + if phCount < minPh: + minPh = phCount slideLayoutIndex = idx except: continue - # Fallback to first layout if still None if slideLayoutIndex is None: slideLayoutIndex = 0 slide_layout = prs.slide_layouts[slideLayoutIndex] slide = prs.slides.add_slide(slide_layout) - # Clear placeholder text instead of removing placeholders (safer approach) - # This avoids corrupting the PPTX file structure try: for shape in slide.shapes: if hasattr(shape, 'is_placeholder') and shape.is_placeholder: try: if hasattr(shape, 'text_frame'): shape.text_frame.clear() - # Set text to empty string to remove "Click to add text" if len(shape.text_frame.paragraphs) > 0: shape.text_frame.paragraphs[0].text = "" except: @@ -156,7 +184,7 @@ class RendererPptx(BaseRenderer): except Exception as placeholder_error: logger.warning(f"Could not clear placeholders: {str(placeholder_error)}") - # Add title as textbox (smaller size for slides) + # Add title as textbox from pptx.util import Inches titleBox = slide.shapes.add_textbox(Inches(0.5), Inches(0.2), prs.slide_width - Inches(1), Inches(0.6)) titleFrame = titleBox.text_frame @@ -232,15 +260,14 @@ class RendererPptx(BaseRenderer): else: p.alignment = PP_ALIGN.LEFT - # If no slides were created, create a default slide + # If no slides were created, create a single slide with the document title if not slidesData: - slide_layout = prs.slide_layouts[0] # Title slide layout + slide_layout = prs.slide_layouts[0] slide = prs.slides.add_slide(slide_layout) title_shape = slide.shapes.title title_shape.text = title - # Apply title styling to default slide title_style = styles.get("title", {}) if title_shape.text_frame.paragraphs[0].font: title_shape.text_frame.paragraphs[0].font.size = Pt(title_style.get("font_size", 48)) @@ -248,16 +275,12 @@ class RendererPptx(BaseRenderer): title_color = self._getSafeColor(title_style.get("color", (31, 78, 121))) title_shape.text_frame.paragraphs[0].font.color.rgb = RGBColor(*title_color) - subtitle_shape = slide.placeholders[1] - subtitle_shape.text = "Generated by PowerOn AI System" - - # Apply subtitle styling - paragraph_style = styles.get("paragraph", {}) - if subtitle_shape.text_frame.paragraphs[0].font: - subtitle_shape.text_frame.paragraphs[0].font.size = Pt(paragraph_style.get("font_size", 20)) - subtitle_shape.text_frame.paragraphs[0].font.bold = paragraph_style.get("bold", False) - paragraph_color = self._get_safe_color(paragraph_style.get("color", (47, 47, 47))) - subtitle_shape.text_frame.paragraphs[0].font.color.rgb = RGBColor(*paragraph_color) + # Clear subtitle placeholder instead of adding filler text + try: + subtitle_shape = slide.placeholders[1] + subtitle_shape.text = "" + except (KeyError, IndexError): + pass # Save to buffer buffer = io.BytesIO() @@ -625,24 +648,23 @@ JSON ONLY. NO OTHER TEXT.""" sections = self._extractSections(json_content) metadata = self._extractMetadata(json_content) - # Use provided title (which comes from documents[].title) as primary source - # Fallback to metadata.title only if title parameter is empty document_title = title if title else metadata.get("title", "Generated Document") - # Create title slide + # Title slide (clean — just the document title, no filler text) slides.append({ "title": document_title, - "content": "Generated by PowerOn AI System\n\n" + self._formatTimestamp() + "content": "", + "_isTitleSlide": True, }) - # Process sections into slides based on content and user intent - slides.extend(self._createSlidesFromSections(sections, styles)) - - # If no content slides were created, create a default content slide - if len(slides) == 1: # Only title slide + # Content slides split by chapter headings + contentSlides = self._createSlidesFromSections(sections, styles) + if contentSlides: + slides.extend(contentSlides) + else: slides.append({ "title": "Content Overview", - "content": "No structured content found in the source documents.\n\nPlease check the source documents and try again." + "content": "" }) return slides @@ -941,9 +963,8 @@ JSON ONLY. NO OTHER TEXT.""" content = slide_data.get("content", "") title = slide_data.get("title", "") - # Check if it's a title slide (first slide) - if not content or "Generated by PowerOn AI System" in content: - return 0 # Title slide layout + if not content: + return 0 # Professional layout selection based on content if "|" in content and "-" in content: @@ -970,67 +991,71 @@ JSON ONLY. NO OTHER TEXT.""" return 1 # Default to title and content layout def _createSlidesFromSections(self, sections: List[Dict[str, Any]], styles: Dict[str, Any]) -> List[Dict[str, Any]]: - """Create slides from sections: each heading level 1 (chapter) creates a new slide, content accumulates until next level 1 heading.""" + """Create slides from sections: each top-level heading creates a new slide. + + The split level is determined dynamically: if there is exactly one H1 (the + document title), chapters are H2; otherwise chapters are H1. + """ try: + # First pass: discover heading levels to choose the split level + headingLevels: List[int] = [] + for section in sections: + if section.get("content_type") == "heading": + for el in section.get("elements", []): + if isinstance(el, dict): + c = el.get("content", {}) + if isinstance(c, dict): + headingLevels.append(c.get("level", 1)) + + h1Count = headingLevels.count(1) + h2Count = headingLevels.count(2) + # If there's at most one H1 but multiple H2s, split on H2 + splitLevel = 2 if h1Count <= 1 and h2Count > 1 else 1 + slides = [] - current_slide_sections = [] # Store sections (not formatted text) for proper rendering - current_slide_title = "Content Overview" + currentSlideSections = [] + currentSlideTitle = "Content Overview" for section in sections: - section_type = section.get("content_type", "paragraph") + sectionType = section.get("content_type", "paragraph") elements = section.get("elements", []) - # Skip sections with no elements (unless they're headings that should create new slides) - if not elements and section_type != "heading": + if not elements and sectionType != "heading": continue - if section_type == "heading": - # Extract heading level - level = 1 # Default - heading_text = "" + if sectionType == "heading": + level = 1 + headingText = "" for element in elements: if isinstance(element, dict): - # Extract from nested content structure content = element.get("content", {}) if isinstance(content, dict): - heading_text = content.get("text", "") + headingText = content.get("text", "") level = content.get("level", 1) elif isinstance(content, str): - heading_text = content + headingText = content level = 1 - # Only level 1 headings (chapters) create new slides - if level == 1: - # If we have accumulated content, create a slide - if current_slide_sections: + if level <= splitLevel: + if currentSlideSections: slides.append({ - "title": current_slide_title, - "sections": current_slide_sections.copy(), # Store sections for proper rendering + "title": currentSlideTitle, + "sections": currentSlideSections.copy(), "images": [] }) - current_slide_sections = [] - - # Start new slide with heading as title - if heading_text: - current_slide_title = heading_text - else: - # If no heading text found but this is a heading section, use section ID or default - current_slide_title = section.get("id", "Untitled Section") + currentSlideSections = [] + currentSlideTitle = headingText or section.get("id", "Untitled Section") else: - # Level 2+ headings are added as sections to current slide - current_slide_sections.append(section) - elif section_type == "image": - # Images are added to current slide (will be organized in frames) - current_slide_sections.append(section) + currentSlideSections.append(section) + elif sectionType == "image": + currentSlideSections.append(section) else: - # Add section to current slide (will be rendered properly) - current_slide_sections.append(section) + currentSlideSections.append(section) - # Add final slide if there's content - if current_slide_sections: + if currentSlideSections: slides.append({ - "title": current_slide_title, - "sections": current_slide_sections.copy(), + "title": currentSlideTitle, + "sections": currentSlideSections.copy(), "images": [] }) @@ -1225,14 +1250,66 @@ JSON ONLY. NO OTHER TEXT.""" import traceback logger.error(f"Traceback: {traceback.format_exc()}") - def _addTableToSlide(self, slide, element: Dict[str, Any], styles: Dict[str, Any], top: float, max_width: float = None) -> None: + def _addMarkdownInlineRuns(self, paragraph, text: str, fontSize=None, fontColor=None, fontBold=None) -> None: + """Parse markdown inline formatting and add Runs to a pptx paragraph. + + Every piece of text is added as an explicit Run with font properties set, + so the paragraph never falls back to the slide-master default font. + """ + from pptx.util import Pt + + paragraph.text = "" + + def _applyBase(run, bold=None): + if fontSize: + run.font.size = fontSize + if fontColor: + run.font.color.rgb = fontColor + if bold is not None: + run.font.bold = bold + elif fontBold is not None: + run.font.bold = fontBold + + pos = 0 + for m in _PPTX_MD_INLINE_RE.finditer(text): + if m.start() > pos: + r = paragraph.add_run() + r.text = text[pos:m.start()] + _applyBase(r) + if m.group(2) or m.group(4): + r = paragraph.add_run() + r.text = m.group(2) or m.group(4) + _applyBase(r, bold=True) + elif m.group(5) or m.group(6): + r = paragraph.add_run() + r.text = m.group(5) or m.group(6) + r.font.italic = True + _applyBase(r) + elif m.group(7): + r = paragraph.add_run() + r.text = m.group(7) + r.font.name = "Courier New" + if fontSize and hasattr(fontSize, 'pt'): + r.font.size = Pt(max(8, int(fontSize.pt * 0.85))) + elif fontSize: + r.font.size = fontSize + if fontColor: + r.font.color.rgb = fontColor + pos = m.end() + + # Remaining tail (or entire string if no matches) + if pos < len(text): + r = paragraph.add_run() + r.text = text[pos:] + _applyBase(r) + + def _addTableToSlide(self, slide, element: Dict[str, Any], styles: Dict[str, Any], top: float = None, max_width: float = None) -> None: """Add a PowerPoint table to slide.""" try: from pptx.util import Inches, Pt from pptx.enum.text import PP_ALIGN from pptx.dml.color import RGBColor - # Extract from nested content structure content = element.get("content", {}) if not isinstance(content, dict): return @@ -1243,11 +1320,9 @@ JSON ONLY. NO OTHER TEXT.""" if not headers: return - # Calculate table dimensions - num_cols = int(len(headers)) # Ensure integer - num_rows = int(len(rows) + 1) # +1 for header row, ensure integer + num_cols = int(len(headers)) + num_rows = int(len(rows) + 1) left = Inches(0.5) - # Get presentation from stored reference or slide if hasattr(self, '_currentPresentation'): prs = self._currentPresentation else: @@ -1255,7 +1330,15 @@ JSON ONLY. NO OTHER TEXT.""" width = max_width if max_width is not None else (prs.slide_width - Inches(1)) row_height = Inches(0.4) - # Create table - ensure all parameters are proper types + # Auto-calculate top from existing shapes when not specified + if top is None: + maxBottom = Inches(1.5) + for shape in slide.shapes: + shapeBottom = shape.top + shape.height + if shapeBottom > maxBottom: + maxBottom = shapeBottom + top = maxBottom + Inches(0.15) + table_height = row_height * num_rows table_shape = slide.shapes.add_table(num_rows, num_cols, left, top, width, table_height) table = table_shape.table @@ -1361,109 +1444,49 @@ JSON ONLY. NO OTHER TEXT.""" logger.warning(f"Error adding table to slide: {str(e)}") def _addBulletListToSlide(self, slide, element: Dict[str, Any], styles: Dict[str, Any], text_frame, font_size_multiplier: float = 1.0) -> None: - """Add bullet list to slide text frame.""" + """Add bullet list to slide text frame with consistent formatting.""" try: from pptx.util import Pt from pptx.dml.color import RGBColor from pptx.enum.text import PP_ALIGN - # Extract from nested content structure content = element.get("content", {}) if not isinstance(content, dict): return - items = content.get("items", []) if not items: return - list_style = styles.get("bullet_list", {}) - base_font_size = list_style.get("font_size", 14) - calculated_size = max(10, int(base_font_size * font_size_multiplier)) # Minimum 10pt for readability + listStyle = styles.get("paragraph", {}) + fontSize = Pt(max(10, int(listStyle.get("font_size", 14) * font_size_multiplier))) + fontColor = RGBColor(*self._getSafeColor(listStyle.get("color", (47, 47, 47)))) - # Pre-calculate and cache style objects to avoid repeated parsing - font_size_pt = Pt(calculated_size) - text_color = self._getSafeColor(list_style.get("color", (47, 47, 47))) - text_color_rgb = RGBColor(*text_color) - space_before_pt = Pt(2) - space_after_pt = Pt(2) - - logger.debug(f"Rendering bullet list with {len(items)} items") - - for idx, item in enumerate(items): - try: - # Get text content first - if isinstance(item, dict): - item_text = item.get("text", "") - else: - item_text = str(item) - - # Skip empty items - if not item_text or len(item_text.strip()) == 0: - logger.debug(f"Skipping empty bullet item {idx}") - continue - - # Create new paragraph for each bullet item - p = text_frame.add_paragraph() - - # Set level to 1 for bullet points BEFORE setting text - # In python-pptx, setting level > 0 should automatically enable bullets - p.level = 1 - - # Set text content - p.text = item_text - - # Apply formatting - use cached objects - p.font.size = font_size_pt - p.font.color.rgb = text_color_rgb - p.alignment = PP_ALIGN.LEFT # Left align bullet lists - p.space_before = space_before_pt # Small spacing before - p.space_after = space_after_pt # Small spacing after - - # In python-pptx, setting level > 0 should enable bullets automatically - # However, some versions may not support paragraph_format, so we'll use manual bullets as fallback - # Always add manual bullet character to ensure visibility - if not (p.text.startswith('•') or p.text.startswith('-') or p.text.startswith('*') or p.text.startswith('◦')): - p.text = '• ' + p.text - logger.debug(f"Added manual bullet character to item {idx}") - - # Set proper indentation for multiline bullets (hanging indent) - # For multiline bullets: bullet at left margin, text indented, wrapped lines align with text - try: - # Try accessing paragraph_format - it may not exist in all python-pptx versions - if hasattr(p, 'paragraph_format'): - pf = p.paragraph_format - # Left indent: indents the entire paragraph (bullet + text) - pf.left_indent = Pt(18) - # First line indent: negative value creates hanging indent - # This brings the bullet back to the left while keeping text indented - pf.first_line_indent = Pt(-18) # Negative to create hanging indent - logger.debug(f"Set hanging indent for bullet item {idx}") - else: - # Try via _element if paragraph_format not available - try: - from pptx.util import Pt as PtUtil - pPr = p._element.get_or_add_pPr() - # Set left margin (indents entire paragraph) - pPr.left_margin = PtUtil(18) - # Set first line indent (negative for hanging indent) - pPr.first_line_indent = PtUtil(-18) - logger.debug(f"Set hanging indent via XML for bullet item {idx}") - except Exception as xml_error: - logger.debug(f"Could not set hanging indent via XML: {str(xml_error)}") - # Indentation is optional, continue without it - pass - except Exception as indent_error: - logger.debug(f"Could not set indent for item {idx}: {str(indent_error)}") - # Continue without indent - bullets will still show, but multiline won't be properly indented - - logger.debug(f"Successfully added bullet item {idx}: '{item_text[:50]}...'") - - except Exception as item_error: - logger.error(f"Error adding bullet item {idx}: {str(item_error)}", exc_info=True) - # Continue with next item even if one fails + for item in items: + itemText = item.get("text", "") if isinstance(item, dict) else str(item) + if not itemText or not itemText.strip(): continue - - logger.debug(f"Completed rendering bullet list, added {len(text_frame.paragraphs)} paragraphs") + + p = text_frame.add_paragraph() + p.level = 0 + p.alignment = PP_ALIGN.LEFT + p.space_before = Pt(2) + p.space_after = Pt(2) + + # Consistent bullet prefix + self._addMarkdownInlineRuns(p, f" • {itemText}", fontSize=fontSize, fontColor=fontColor, fontBold=False) + + # Subitems + if isinstance(item, dict): + for sub in item.get("subitems", []): + subText = sub.get("text", "") if isinstance(sub, dict) else str(sub) + if not subText: + continue + sp = text_frame.add_paragraph() + sp.level = 0 + sp.alignment = PP_ALIGN.LEFT + sp.space_before = Pt(1) + sp.space_after = Pt(1) + self._addMarkdownInlineRuns(sp, f" – {subText}", fontSize=fontSize, fontColor=fontColor, fontBold=False) except Exception as e: logger.warning(f"Error adding bullet list to slide: {str(e)}") @@ -1484,25 +1507,22 @@ JSON ONLY. NO OTHER TEXT.""" if text: p = text_frame.add_paragraph() - p.text = text - # Headings should be level 0 (no indentation) regardless of heading level p.level = 0 heading_style = styles.get("heading", {}) - # Different font sizes for different heading levels if level == 1: - base_font_size = heading_style.get("font_size", 28) # Largest for H1 + base_font_size = heading_style.get("font_size", 28) elif level == 2: - base_font_size = heading_style.get("font_size", 22) # Medium for H2 + base_font_size = heading_style.get("font_size", 22) elif level == 3: - base_font_size = heading_style.get("font_size", 18) # Smaller for H3 + base_font_size = heading_style.get("font_size", 18) else: - base_font_size = heading_style.get("font_size", 16) # Default for H4+ + base_font_size = heading_style.get("font_size", 16) - calculated_size = max(12, int(base_font_size * font_size_multiplier)) # Minimum 12pt for headings - p.font.size = Pt(calculated_size) - p.font.bold = heading_style.get("bold", True) - p.font.color.rgb = RGBColor(*self._getSafeColor(heading_style.get("color", (31, 78, 121)))) + calculated_size = max(12, int(base_font_size * font_size_multiplier)) + fSize = Pt(calculated_size) + fColor = RGBColor(*self._getSafeColor(heading_style.get("color", (31, 78, 121)))) + self._addMarkdownInlineRuns(p, text, fontSize=fSize, fontColor=fColor, fontBold=True) # Add spacing before and after headings p.space_before = Pt(12 if level == 1 else 8) # More space before H1 p.space_after = Pt(6) # Space after heading @@ -1528,11 +1548,8 @@ JSON ONLY. NO OTHER TEXT.""" if text: p = text_frame.add_paragraph() - p.text = text - # Explicitly set level to 0 for regular paragraphs (not bullets) p.level = 0 - # Ensure no bullet formatting try: if hasattr(p, 'paragraph_format'): p.paragraph_format.bullet.type = None @@ -1540,11 +1557,12 @@ JSON ONLY. NO OTHER TEXT.""" pass paragraph_style = styles.get("paragraph", {}) - base_font_size = paragraph_style.get("font_size", 14) # Smaller default for better readability - calculated_size = max(10, int(base_font_size * font_size_multiplier)) # Minimum 10pt for readability - p.font.size = Pt(calculated_size) - p.font.bold = paragraph_style.get("bold", False) - p.font.color.rgb = RGBColor(*self._getSafeColor(paragraph_style.get("color", (47, 47, 47)))) + base_font_size = paragraph_style.get("font_size", 14) + calculated_size = max(10, int(base_font_size * font_size_multiplier)) + fSize = Pt(calculated_size) + fColor = RGBColor(*self._getSafeColor(paragraph_style.get("color", (47, 47, 47)))) + fBold = paragraph_style.get("bold", False) + self._addMarkdownInlineRuns(p, text, fontSize=fSize, fontColor=fColor, fontBold=fBold) # Add proper spacing p.space_before = Pt(6) # Space before paragraph @@ -1604,261 +1622,31 @@ JSON ONLY. NO OTHER TEXT.""" return datetime.now(UTC).strftime("%Y-%m-%d %H:%M:%S UTC") def _renderSlideContentWithFrames(self, slide, slide_sections: List[Dict[str, Any]], slide_images: List[Dict[str, Any]], styles: Dict[str, Any], prs) -> None: - """ - Organize slide content into frames for better layout. - Groups content by type (images, bullet lists, paragraphs, tables) and renders each in appropriately sized frames. - """ + """Render all sections sequentially: text/bullets/headings into a shared + textbox, tables and images as separate shapes placed below.""" try: from pptx.util import Inches, Pt - from pptx.enum.text import PP_ALIGN - from pptx.dml.color import RGBColor - - # Extract images from sections first - images_to_render = list(slide_images) if slide_images else [] - text_sections = [] - table_sections = [] - - for section in slide_sections: - section_type = section.get("content_type", "paragraph") - elements = section.get("elements", []) - - if not elements: - # Skip empty sections - continue - - # Extract images from all sections - section_has_images = False - for element in elements: - if isinstance(element, dict) and element.get("type") == "image": - content = element.get("content", {}) - base64Data = None - - # Handle different content formats - if isinstance(content, dict): - base64Data = content.get("base64Data") - altText = content.get("altText", "Image") - caption = content.get("caption", "") - elif isinstance(content, str): - # If content is a string, it might be base64 data directly - # Check if it looks like base64 - if len(content) > 100 and all(c in "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/=" for c in content[:100]): - base64Data = content - altText = "Image" - caption = "" - else: - # Not base64, skip - continue - else: - # Try to get base64Data directly from element - base64Data = element.get("base64Data") - altText = element.get("altText", "Image") - caption = element.get("caption", "") - - if base64Data: - images_to_render.append({ - "base64Data": base64Data, - "altText": altText, - "caption": caption - }) - section_has_images = True - - # Skip image-only sections (they're already added to images_to_render) - if section_type == "image" and section_has_images: - continue - - # Categorize sections (excluding image elements) - has_table = False - non_image_elements = [] - - for element in elements: - if isinstance(element, dict): - element_type = element.get("type", "") - # Skip image elements when categorizing - if element_type == "image": - continue - if element_type == "table" or section_type == "table": - has_table = True - non_image_elements.append(element) - - # Only add sections that have non-image content - if non_image_elements: - if has_table: - # Create a copy of section without image elements for table rendering - table_section = { - **section, - "elements": non_image_elements - } - table_sections.append(table_section) - else: - # Create a copy of section without image elements for text rendering - text_section = { - **section, - "elements": non_image_elements - } - text_sections.append(text_section) - - # Calculate layout dimensions - title_height = Inches(1.5) - available_height = prs.slide_height - title_height - Inches(0.5) # Title + margin - available_width = prs.slide_width - Inches(1) # Margins + margin = Inches(0.5) - - current_y = title_height + Inches(0.3) - - # Determine layout strategy based on content types - has_images = len(images_to_render) > 0 - has_tables = len(table_sections) > 0 - has_text = len(text_sections) > 0 - - # Layout 1: Images + Text (horizontal split for landscape) - if has_images and has_text and not has_tables: - # Horizontal split: images on left, text on right (landscape format) - img_width = available_width * 0.48 - text_width = available_width * 0.48 - img_left = margin - text_left = margin + img_width + Inches(0.2) - - # Render images in left column (full height) - if images_to_render: - img_height = available_height - Inches(0.2) - self._addImagesToSlideInFrame(slide, images_to_render, styles, img_left, current_y, img_width, img_height) - - # Render text in right column (full height, adaptive font size) - if text_sections: - text_height = available_height - Inches(0.2) - self._renderTextSectionsInFrame(slide, text_sections, styles, text_left, current_y, text_width, text_height, adaptiveFontSize=True) - - # Layout 2: Tables + Text (horizontal split for landscape) - elif has_tables and has_text: - # Horizontal split: tables on left, text on right (landscape format) - table_width = available_width * 0.48 - text_width = available_width * 0.48 - table_left = margin - text_left = margin + table_width + Inches(0.2) - - # Render tables in left column (full height) - table_y = current_y - for table_section in table_sections: - elements = table_section.get("elements", []) - for element in elements: - if isinstance(element, dict) and element.get("type") == "table": - try: - self._addTableToSlide(slide, element, styles, table_y, max_width=table_width) - # Calculate actual table height - content = element.get("content", {}) - if isinstance(content, dict): - rows = content.get("rows", []) - num_rows = len(rows) + 1 # +1 for header - actual_height = Inches(0.4) * num_rows - table_y += actual_height + Inches(0.15) - else: - table_y += Inches(2) - except Exception as table_error: - logger.error(f"Error rendering table: {str(table_error)}") - # Continue with next table - break - - # Render text in right column (full height, adaptive font size) - if text_sections: - text_height = available_height - Inches(0.2) - self._renderTextSectionsInFrame(slide, text_sections, styles, text_left, current_y, text_width, text_height, adaptiveFontSize=True) - - # Layout 3: Images + Tables + Text (horizontal split for landscape) - elif has_images and has_tables and has_text: - # Horizontal split: Images (left), Tables (middle), Text (right) - img_width = available_width * 0.31 - table_width = available_width * 0.31 - text_width = available_width * 0.31 - img_left = margin - table_left = margin + img_width + Inches(0.15) - text_left = margin + img_width + table_width + Inches(0.3) - - # Render images in left column (full height) - if images_to_render: - img_height = available_height - Inches(0.2) - self._addImagesToSlideInFrame(slide, images_to_render, styles, img_left, current_y, img_width, img_height) - - # Render tables in middle column (full height) - table_y = current_y - for table_section in table_sections: - elements = table_section.get("elements", []) - for element in elements: - if isinstance(element, dict) and element.get("type") == "table": - try: - self._addTableToSlide(slide, element, styles, table_y, max_width=table_width) - content = element.get("content", {}) - if isinstance(content, dict): - rows = content.get("rows", []) - num_rows = len(rows) + 1 - actual_height = Inches(0.4) * num_rows - table_y += actual_height + Inches(0.15) - else: - table_y += Inches(2) - except Exception as table_error: - logger.error(f"Error rendering table: {str(table_error)}") - break - - # Render text in right column (full height, adaptive font size) - if text_sections: - text_height = available_height - Inches(0.2) - self._renderTextSectionsInFrame(slide, text_sections, styles, text_left, current_y, text_width, text_height, adaptiveFontSize=True) - - # Layout 4: Images only - elif has_images and not has_text and not has_tables: - img_width = available_width * 0.8 - img_height = available_height * 0.8 - img_left = (available_width - img_width) / 2 + margin - self._addImagesToSlideInFrame(slide, images_to_render, styles, img_left, current_y, img_width, img_height) - - # Layout 5: Text only (default, adaptive font size) - elif has_text and not has_images and not has_tables: - text_height = available_height - Inches(0.2) - self._renderTextSectionsInFrame(slide, text_sections, styles, margin, current_y, available_width, text_height, adaptiveFontSize=True) - - # Layout 6: Tables only - elif has_tables and not has_images and not has_text: - table_height = available_height / max(len(table_sections), 1) - table_width = available_width - for table_section in table_sections: - elements = table_section.get("elements", []) - for element in elements: - if isinstance(element, dict) and element.get("type") == "table": - try: - self._addTableToSlide(slide, element, styles, current_y, max_width=table_width) - # Calculate actual table height - content = element.get("content", {}) - if isinstance(content, dict): - rows = content.get("rows", []) - num_rows = len(rows) + 1 # +1 for header - actual_height = min(Inches(0.4) * num_rows, table_height) - current_y += actual_height + Inches(0.2) - else: - current_y += table_height + Inches(0.2) - except Exception as table_error: - logger.error(f"Error rendering table: {str(table_error)}") - # Continue with next table - break - - except Exception as e: - logger.error(f"Error rendering slide content with frames: {str(e)}") - # Fallback to simple rendering - try: - content_shape = slide.placeholders[1] - text_frame = content_shape.text_frame - text_frame.clear() - except (AttributeError, IndexError): - from pptx.util import Inches - left = Inches(0.5) - top = Inches(1.5) - width = prs.slide_width - Inches(1) - height = prs.slide_height - top - Inches(0.5) - textbox = slide.shapes.add_textbox(left, top, width, height) - text_frame = textbox.text_frame - text_frame.word_wrap = True - - # Simple fallback rendering + contentTop = Inches(1.3) + availableWidth = prs.slide_width - Inches(1) + availableHeight = prs.slide_height - contentTop - Inches(0.3) + + # Create a single textbox for all non-table, non-image content + textbox = slide.shapes.add_textbox(margin, contentTop, availableWidth, availableHeight) + textFrame = textbox.text_frame + textFrame.word_wrap = True + textFrame.auto_size = None + for section in slide_sections: - self._renderSectionToTextFrame(slide, section, styles, text_frame, font_size_multiplier=1.0) + self._renderSectionToTextFrame(slide, section, styles, textFrame, font_size_multiplier=1.0) + + # Render standalone images that were passed alongside sections + if slide_images: + self._addImagesToSlideInFrame(slide, slide_images, styles, margin, contentTop, availableWidth, availableHeight) + + except Exception as e: + logger.error(f"Error rendering slide content: {str(e)}") def _renderTextSectionsInFrame(self, slide, text_sections: List[Dict[str, Any]], styles: Dict[str, Any], left: float, top: float, width: float, height: float, adaptiveFontSize: bool = False) -> None: """Render text sections (paragraphs, lists, headings) in a text frame.""" @@ -1935,6 +1723,14 @@ JSON ONLY. NO OTHER TEXT.""" except Exception as e: logger.warning(f"Error rendering text sections in frame: {str(e)}") + @staticmethod + def _isHorizontalRule(element: Dict[str, Any]) -> bool: + """Detect markdown horizontal rules (---, ***, ___) that should be skipped on slides.""" + content = element.get("content", {}) + text = content.get("text", "") if isinstance(content, dict) else (content if isinstance(content, str) else "") + stripped = text.strip() + return bool(stripped) and all(c in "-*_ " for c in stripped) and len(stripped.replace(" ", "")) >= 3 + def _renderSectionToTextFrame(self, slide, section: Dict[str, Any], styles: Dict[str, Any], text_frame, font_size_multiplier: float = 1.0) -> None: """Render a single section to a text frame.""" try: @@ -1942,7 +1738,7 @@ JSON ONLY. NO OTHER TEXT.""" from pptx.enum.text import PP_ALIGN from pptx.dml.color import RGBColor - section_type = section.get("content_type", "paragraph") + sectionType = section.get("content_type", "paragraph") elements = section.get("elements", []) if not elements: @@ -1952,54 +1748,42 @@ JSON ONLY. NO OTHER TEXT.""" if not isinstance(element, dict): continue - element_type = element.get("type", "") - if not element_type: - element_type = section_type - - # Skip images - handled separately - if element_type == "image": + elementType = element.get("type", "") or sectionType + + if elementType == "image": + continue + + # Skip horizontal rules (---, ***, ___) + if elementType == "paragraph" and self._isHorizontalRule(element): continue - if element_type == "bullet_list" or element_type == "list": + if elementType == "table": + self._addTableToSlide(slide, element, styles) + elif elementType in ("bullet_list", "list"): self._addBulletListToSlide(slide, element, styles, text_frame, font_size_multiplier) - elif element_type == "heading": + elif elementType == "heading": self._addHeadingToSlide(slide, element, styles, text_frame, font_size_multiplier) - elif element_type == "paragraph": + elif elementType == "paragraph": self._addParagraphToSlide(slide, element, styles, text_frame, font_size_multiplier) - elif element_type == "code_block" or element_type == "code": + elif elementType in ("code_block", "code"): self._addCodeBlockToSlide(slide, element, styles, text_frame, font_size_multiplier) - elif element_type == "extracted_text": + elif elementType == "extracted_text": content = element.get("content", "") - source = element.get("source", "") if content: - paragraph_style = styles.get("paragraph", {}) p = text_frame.add_paragraph() - p.text = content - base_font_size = paragraph_style.get("font_size", 18) - p.font.size = Pt(int(base_font_size * font_size_multiplier)) - p.font.bold = paragraph_style.get("bold", False) - p.font.color.rgb = RGBColor(*self._getSafeColor(paragraph_style.get("color", (47, 47, 47)))) + pStyle = styles.get("paragraph", {}) + fSize = Pt(max(10, int(pStyle.get("font_size", 14) * font_size_multiplier))) + fColor = RGBColor(*self._getSafeColor(pStyle.get("color", (47, 47, 47)))) + self._addMarkdownInlineRuns(p, content, fontSize=fSize, fontColor=fColor) p.alignment = PP_ALIGN.LEFT - if source: - p.add_run(f" (Source: {source})").font.italic = True - elif element_type == "reference": + elif elementType == "reference": label = element.get("label", "Reference") p = text_frame.add_paragraph() p.text = f"[Reference: {label}]" p.font.italic = True p.alignment = PP_ALIGN.LEFT else: - # Fallback to paragraph - content = element.get("content", "") - if isinstance(content, dict): - text = content.get("text", "") - elif isinstance(content, str): - text = content - else: - text = "" - - if text: - self._addParagraphToSlide(slide, element, styles, text_frame, font_size_multiplier=1.0) + self._addParagraphToSlide(slide, element, styles, text_frame, font_size_multiplier) except Exception as e: logger.warning(f"Error rendering section to text frame: {str(e)}") diff --git a/tests/unit/services/test_renderer_pdf_smoke.py b/tests/unit/services/test_renderer_pdf_smoke.py new file mode 100644 index 00000000..a3a3a78d --- /dev/null +++ b/tests/unit/services/test_renderer_pdf_smoke.py @@ -0,0 +1,253 @@ +# Copyright (c) 2025 Patrick Motsch +# All rights reserved. +""" +Smoke test: RendererPdf with every JSON section/element shape the pipeline supports. + +Canonical section types (datamodelJson.supportedSectionTypes): table, bullet_list, heading, +paragraph, code_block, image. + +PDF renderer additionally handles element types: reference, extracted_text (Phase 5D). +""" + +from __future__ import annotations + +from types import SimpleNamespace + +import pytest + +from modules.serviceCenter.services.serviceGeneration.renderers.rendererPdf import ( + REPORTLAB_AVAILABLE, + RendererPdf, + _normalizePdfMonospaceText, + _prepareCodeBlockPlainText, +) + +# 1×1 transparent PNG +_MIN_PNG_B64 = ( + "iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR42mP8z8BQDwAEhQGAhKmMIQAAAABJRU5ErkJggg==" +) + + +def _fakeServices(): + """RendererPdf calls services.utils.debugLogToFile; avoid None.""" + + def _noop(msg, tag=None): + pass + + return SimpleNamespace(utils=SimpleNamespace(debugLogToFile=_noop)) + + +def _fullDocumentJson() -> dict: + """One document covering all supported content_type values plus reference/extracted_text elements.""" + return { + "metadata": { + "split_strategy": "single_document", + "source_documents": [], + "extraction_method": "smoke_test", + "title": "PDF Renderer Smoke", + "language": "de", + }, + "documents": [ + { + "id": "doc_smoke", + "title": "PDF Renderer Smoke", + "filename": "pdf_renderer_smoke.pdf", + "sections": [ + { + "id": "sec_h1", + "content_type": "heading", + "order": 1, + "elements": [ + { + "content": { + "text": "H1 with **bold** and a very long subtitle line that should wrap cleanly without overlapping", + "level": 1, + } + } + ], + }, + { + "id": "sec_h2", + "content_type": "heading", + "order": 2, + "elements": [{"content": {"text": "H2 *italic* and `inline code`", "level": 2}}], + }, + { + "id": "sec_para", + "content_type": "paragraph", + "order": 3, + "elements": [ + { + "content": { + "text": ( + "Paragraph: **strong**, *emphasis*, __under-like bold__, " + "_single underscores_, and `var = 1`." + ) + } + } + ], + }, + { + "id": "sec_bullets", + "content_type": "bullet_list", + "order": 4, + "elements": [ + { + "content": { + "items": [ + "Bullet **one**", + {"text": "Bullet two with *italic*"}, + ], + "list_type": "bullet", + } + } + ], + }, + { + "id": "sec_numbered", + "content_type": "bullet_list", + "order": 5, + "elements": [ + { + "content": { + "items": [{"text": "First numbered"}, {"text": "Second **numbered**"}], + "list_type": "numbered", + } + } + ], + }, + { + "id": "sec_table", + "content_type": "table", + "order": 6, + "elements": [ + { + "content": { + "headers": ["Col A", "Col B", "Col C"], + "rows": [ + ["Short", "Medium length cell", "**Bold** in cell"], + ["R2", "Data", "`code`"], + ], + } + } + ], + }, + { + "id": "sec_code", + "content_type": "code_block", + "order": 7, + "elements": [ + { + "content": { + "language": "python", + "code": ( + 'def hello():\n print(" & ampersand")\n return 42\n' + "\n# tree (Unicode box drawing must not produce tofu in PDF)\n" + "Reports/\n\u251c\u2500\u2500 2025/\n\u2502 \u2514\u2500\u2500 file.txt\n" + ), + } + } + ], + }, + { + "id": "sec_image", + "content_type": "image", + "order": 8, + "elements": [ + { + "content": { + "base64Data": _MIN_PNG_B64, + "altText": "Smoke pixel", + "caption": "Minimal PNG (1×1)", + } + } + ], + }, + { + "id": "sec_reference", + "content_type": "paragraph", + "order": 9, + "elements": [ + { + "type": "reference", + "label": "External spec", + "documentReference": "urn:smoke:ref", + } + ], + }, + { + "id": "sec_extracted", + "content_type": "paragraph", + "order": 10, + "elements": [ + { + "type": "extracted_text", + "content": "Extracted **body** with formatting.", + "source": "fixture/source.md", + } + ], + }, + ], + } + ], + } + + +@pytest.mark.asyncio +async def test_renderer_pdf_all_json_elements(tmp_path): + if not REPORTLAB_AVAILABLE: + pytest.skip("reportlab is not installed") + renderer = RendererPdf(services=_fakeServices()) + payload = _fullDocumentJson() + docs = await renderer.render( + extractedContent=payload, + title="PDF_Renderer_Smoke", + userPrompt=None, + aiService=None, + ) + assert len(docs) == 1 + out = docs[0] + assert out.mimeType == "application/pdf" + assert out.documentData[:4] == b"%PDF" + assert out.filename.endswith(".pdf") + + outPath = tmp_path / "pdf_renderer_smoke.pdf" + outPath.write_bytes(out.documentData) + assert outPath.stat().st_size > 500 + + +def test_prepare_code_block_preserves_indentation_spaces(): + raw = "def x():\n return 1\n two leading on line" + assert " return" in _prepareCodeBlockPlainText(raw) + assert "\t" not in _prepareCodeBlockPlainText("a\tb") + + +def test_normalize_pdf_monospace_replaces_box_drawing(): + raw = "\u2500\u2502\u251c\u2514\u252c\nReports/\n" + norm = _normalizePdfMonospaceText(raw) + assert "\u2500" not in norm + assert "\u2502" not in norm + assert "Reports/" in norm + + +def test_pdf_heading_font_sizes_strictly_decrease(): + """H3 must not fall back to H1 styles (previous bug: ## smaller than ###).""" + renderer = RendererPdf(services=_fakeServices()) + styles = renderer._getDefaultStyleSet() + assert styles["heading1"]["font_size"] > styles["heading2"]["font_size"] > styles["heading3"]["font_size"] + assert renderer._defaultHeadingStyleDef(2)["font_size"] > renderer._defaultHeadingStyleDef(3)["font_size"] + if REPORTLAB_AVAILABLE: + s1 = renderer._createHeadingStyle(styles, 1).fontSize + s2 = renderer._createHeadingStyle(styles, 2).fontSize + s3 = renderer._createHeadingStyle(styles, 3).fontSize + assert s1 > s2 > s3 + partial = {"heading1": styles["heading1"], "heading2": styles["heading2"]} + assert renderer._createHeadingStyle(partial, 3).fontSize < renderer._createHeadingStyle(partial, 2).fontSize + + +def test_inline_code_angle_brackets_escaped_in_font_span(): + """Paths like `...//` must not break ReportLab XML inside Courier.""" + renderer = RendererPdf(services=_fakeServices()) + xml = renderer._markdownInlineToReportlabXml("unter `Eingabe//` speichern") + assert 'name="Courier"' in xml + assert "<Slug>" in xml