From 9186c60ad210459529bcb81e635b34e79ed7e4e8 Mon Sep 17 00:00:00 2001
From: ValueOn AG
Date: Sun, 22 Mar 2026 11:09:48 +0100
Subject: [PATCH] fixed rendering issues
---
modules/datamodels/datamodelDataSource.py | 7 +-
.../workspace/routeFeatureWorkspace.py | 33 +-
modules/interfaces/interfaceAiObjects.py | 2 +
.../services/serviceAgent/agentLoop.py | 40 +-
.../services/serviceAgent/mainServiceAgent.py | 88 ++-
.../services/serviceChat/mainServiceChat.py | 3 +-
.../renderers/rendererDocx.py | 68 +-
.../renderers/rendererPdf.py | 411 +++++++---
.../renderers/rendererPptx.py | 740 +++++++-----------
.../unit/services/test_renderer_pdf_smoke.py | 253 ++++++
10 files changed, 1011 insertions(+), 634 deletions(-)
create mode 100644 tests/unit/services/test_renderer_pdf_smoke.py
diff --git a/modules/datamodels/datamodelDataSource.py b/modules/datamodels/datamodelDataSource.py
index 86e0c7ec..f8238fab 100644
--- a/modules/datamodels/datamodelDataSource.py
+++ b/modules/datamodels/datamodelDataSource.py
@@ -19,7 +19,11 @@ class DataSource(BaseModel):
connectionId: str = Field(description="FK to UserConnection")
sourceType: str = Field(description="sharepointFolder, googleDriveFolder, outlookFolder, ftpFolder")
path: str = Field(description="External path (e.g. '/sites/MySite/Documents/Reports')")
- label: str = Field(description="User-visible label")
+ label: str = Field(description="User-visible label (often the last path segment)")
+ displayPath: Optional[str] = Field(
+ default=None,
+ description="Human-readable full path for UI (connection-relative, slash-separated)",
+ )
featureInstanceId: Optional[str] = Field(default=None, description="Scoped to feature instance")
mandateId: Optional[str] = Field(default=None, description="Mandate scope")
userId: str = Field(default="", description="Owner user ID")
@@ -37,6 +41,7 @@ registerModelLabels(
"sourceType": {"en": "Source Type", "de": "Quellentyp", "fr": "Type de source"},
"path": {"en": "Path", "de": "Pfad", "fr": "Chemin"},
"label": {"en": "Label", "de": "Bezeichnung", "fr": "Libellé"},
+ "displayPath": {"en": "Display path", "de": "Anzeigepfad", "fr": "Chemin affiché"},
"featureInstanceId": {"en": "Feature Instance", "de": "Feature-Instanz", "fr": "Instance de fonctionnalité"},
"mandateId": {"en": "Mandate ID", "de": "Mandanten-ID", "fr": "ID du mandat"},
"userId": {"en": "User ID", "de": "Benutzer-ID", "fr": "ID utilisateur"},
diff --git a/modules/features/workspace/routeFeatureWorkspace.py b/modules/features/workspace/routeFeatureWorkspace.py
index cf8efc04..d0dd22da 100644
--- a/modules/features/workspace/routeFeatureWorkspace.py
+++ b/modules/features/workspace/routeFeatureWorkspace.py
@@ -1139,6 +1139,7 @@ class CreateDataSourceRequest(BaseModel):
sourceType: str = Field(description="Source type")
path: str = Field(description="Path")
label: str = Field(description="Label")
+ displayPath: Optional[str] = Field(default=None, description="Full human-readable path for tooltips")
@router.post("/{instanceId}/datasources")
@@ -1165,6 +1166,7 @@ async def createWorkspaceDataSource(
path=body.path,
label=body.label,
featureInstanceId=instanceId,
+ displayPath=body.displayPath,
)
return JSONResponse(dataSource if isinstance(dataSource, dict) else dataSource.model_dump())
@@ -1214,7 +1216,7 @@ async def listFeatureConnections(
userMandates = rootIf.getUserMandates(userId)
if not userMandates:
- return JSONResponse({"featureConnections": []})
+ return JSONResponse({"featureConnectionsByMandate": []})
mandateLabels: dict = {}
for um in userMandates:
@@ -1226,7 +1228,7 @@ async def listFeatureConnections(
except Exception:
mandateLabels[um.mandateId] = um.mandateId
- items = []
+ byMandate: dict = {}
seenIds: set = set()
for um in userMandates:
allInstances = rootIf.getFeatureInstancesByMandate(um.mandateId)
@@ -1244,20 +1246,33 @@ async def listFeatureConnections(
featureDef = catalog.getFeatureDefinition(inst.featureCode) or {}
dataObjects = catalog.getDataObjects(inst.featureCode)
- mLabel = mandateLabels.get(inst.mandateId, "")
label = inst.label or inst.featureCode
- if mLabel:
- label = f"{label} ({mLabel})"
- items.append({
+ mid = inst.mandateId
+ connItem = {
"featureInstanceId": inst.id,
"featureCode": inst.featureCode,
- "mandateId": inst.mandateId,
+ "mandateId": mid,
"label": label,
"icon": featureDef.get("icon", "mdi-database"),
"tableCount": len(dataObjects),
- })
+ }
+ if mid not in byMandate:
+ byMandate[mid] = []
+ byMandate[mid].append(connItem)
- return JSONResponse({"featureConnections": items})
+ def _sortKeyLabel(x: dict) -> str:
+ return (x.get("label") or "").lower()
+
+ groups = []
+ for mid in sorted(byMandate.keys(), key=lambda m: (mandateLabels.get(m, m) or "").lower()):
+ conns = sorted(byMandate[mid], key=_sortKeyLabel)
+ groups.append({
+ "mandateId": mid,
+ "mandateLabel": mandateLabels.get(mid, mid),
+ "featureConnections": conns,
+ })
+
+ return JSONResponse({"featureConnectionsByMandate": groups})
@router.get("/{instanceId}/feature-connections/{fiId}/tables")
diff --git a/modules/interfaces/interfaceAiObjects.py b/modules/interfaces/interfaceAiObjects.py
index 981a6b46..f0aedc87 100644
--- a/modules/interfaces/interfaceAiObjects.py
+++ b/modules/interfaces/interfaceAiObjects.py
@@ -332,6 +332,7 @@ class AiObjects:
errorCount=0,
toolCalls=responseToolCalls
)
+ response._modelMaxTokens = model.maxTokens
if self.billingCallback:
try:
@@ -470,6 +471,7 @@ class AiObjects:
errorCount=0,
toolCalls=responseToolCalls,
)
+ response._modelMaxTokens = model.maxTokens
if self.billingCallback:
try:
diff --git a/modules/serviceCenter/services/serviceAgent/agentLoop.py b/modules/serviceCenter/services/serviceAgent/agentLoop.py
index 69fe31b2..bee03424 100644
--- a/modules/serviceCenter/services/serviceAgent/agentLoop.py
+++ b/modules/serviceCenter/services/serviceAgent/agentLoop.py
@@ -276,6 +276,7 @@ async def runAgentLoop(
"userId": userId,
"featureInstanceId": featureInstanceId,
"mandateId": mandateId,
+ "modelMaxOutputTokens": getattr(aiResponse, "_modelMaxTokens", None) or 0,
})
state.totalToolCalls += len(results)
@@ -439,6 +440,29 @@ def _repairTruncatedJson(raw: str) -> Optional[Dict[str, Any]]:
return None
+def _validateRepairedToolArgs(toolName: str, args: Dict[str, Any]) -> Optional[str]:
+ """After closeJsonStructures + json.loads, args can be syntactically valid but useless (truncation
+ cut off before required fields). Return a user-facing _parseError message, or None if OK.
+
+ Without this, renderDocument runs with missing `content` and only returns \"content is required\",
+ hiding the real cause (output token limit).
+ """
+ if toolName == "renderDocument":
+ content = args.get("content")
+ sourceFileId = args.get("sourceFileId")
+ hasInline = isinstance(content, str) and bool(content.strip())
+ hasFile = isinstance(sourceFileId, str) and bool(sourceFileId.strip())
+ if not hasInline and not hasFile:
+ return (
+ "Your tool call JSON was repaired after truncation, but neither `content` nor `sourceFileId` is usable. "
+ "Large documents must not be inlined in the tool call (output limit).\n"
+ "Preferred: writeFile(mode='create') + writeFile(mode='append') to build a .md file, then "
+ "renderDocument(sourceFileId=<fileId>, outputFormat='pdf', title='...') — the tool call stays small.\n"
+ "Alternatives: replaceInFile for edits; shorter outline first."
+ )
+ return None
+
+
def _parseToolCalls(aiResponse: AiCallResponse) -> List[ToolCallRequest]:
"""Parse tool calls from AI response. Supports native function calling and text-based fallback."""
toolCalls = []
@@ -457,14 +481,20 @@ def _parseToolCalls(aiResponse: AiCallResponse) -> List[ToolCallRequest]:
logger.warning(f"Unrecoverable truncated JSON for '{tc['function']['name']}': {rawArgs[:200]}")
parsedArgs = {"_parseError": (
"Your tool call arguments were truncated (output cut off by token limit). "
- "The content is too large for a single tool call. Strategies:\n"
- "1. For new files: use writeFile(mode='create') with the first part, "
- "then writeFile(fileId=..., mode='append') for subsequent parts (~8000 chars each).\n"
- "2. For editing existing files: use replaceInFile to change only the specific parts.\n"
- "3. For documentation: split into multiple smaller files."
+ "Do not put the full document body in renderDocument JSON.\n"
+ "1. writeFile(create) + writeFile(append) to a .md file, then "
+ "renderDocument(sourceFileId=<fileId>, outputFormat=..., title=...) — tiny tool call.\n"
+ "2. Or replaceInFile for targeted edits.\n"
+ "3. Or split into multiple smaller files."
)}
else:
logger.info(f"Repaired truncated JSON for '{tc['function']['name']}'")
+ repairIssue = _validateRepairedToolArgs(tc["function"]["name"], parsedArgs)
+ if repairIssue:
+ logger.warning(
+ f"Repaired JSON for '{tc['function']['name']}' still invalid for execution: {repairIssue[:80]}..."
+ )
+ parsedArgs = {"_parseError": repairIssue}
else:
parsedArgs = rawArgs if rawArgs else {}
toolCalls.append(ToolCallRequest(
diff --git a/modules/serviceCenter/services/serviceAgent/mainServiceAgent.py b/modules/serviceCenter/services/serviceAgent/mainServiceAgent.py
index cec64813..03b8598e 100644
--- a/modules/serviceCenter/services/serviceAgent/mainServiceAgent.py
+++ b/modules/serviceCenter/services/serviceAgent/mainServiceAgent.py
@@ -259,7 +259,9 @@ class AgentService:
"Use `readFile(fileId)` to read text content, `readContentObjects(fileId)` for structured access, "
"or `describeImage(fileId)` for image analysis.\n"
"For folders, use `listFiles(folderId)` to get the files inside, then `readFile(fileId)` for each.\n"
- "When generating documents with `renderDocument`, embed images using `` in the markdown content.\n\n"
+ "For large PDFs/DOCX, avoid huge `renderDocument` tool JSON: build markdown with "
+ "`writeFile` (create + append), then `renderDocument(sourceFileId=that file id, outputFormat=...)`.\n"
+ "For small docs you may pass `content` inline. Embed images with `` in markdown.\n\n"
)
header += "\n\n".join(fileDescriptions)
return f"{header}\n\n---\n\nUser request: {prompt}"
@@ -2209,13 +2211,75 @@ def _registerCoreTools(registry: ToolRegistry, services):
async def _renderDocument(args: Dict[str, Any], context: Dict[str, Any]):
"""Render agent-produced markdown content into any document format via the RendererRegistry."""
import re as _re
+ sourceFileId = (args.get("sourceFileId") or "").strip()
content = args.get("content", "")
+ if not isinstance(content, str):
+ content = str(content) if content is not None else ""
outputFormat = args.get("outputFormat", "pdf")
title = args.get("title", "Document")
language = args.get("language", "de")
- if not content:
- return ToolResult(toolCallId="", toolName="renderDocument", success=False, error="content is required")
+ if sourceFileId:
+ try:
+ dbMgmt = services.chat.interfaceDbComponent
+ fileRow = dbMgmt.getFile(sourceFileId)
+ if not fileRow:
+ return ToolResult(
+ toolCallId="",
+ toolName="renderDocument",
+ success=False,
+ error=f"sourceFileId not found: {sourceFileId}",
+ )
+ rawBytes = dbMgmt.getFileData(sourceFileId)
+ if not rawBytes:
+ return ToolResult(
+ toolCallId="",
+ toolName="renderDocument",
+ success=False,
+ error=f"sourceFileId has no data: {sourceFileId}",
+ )
+ try:
+ content = rawBytes.decode("utf-8")
+ except UnicodeDecodeError:
+ content = rawBytes.decode("latin-1", errors="replace")
+ except Exception as e:
+ return ToolResult(
+ toolCallId="",
+ toolName="renderDocument",
+ success=False,
+ error=f"Could not read sourceFileId: {e}",
+ )
+
+ if not (content or "").strip():
+ return ToolResult(
+ toolCallId="",
+ toolName="renderDocument",
+ success=False,
+ error=(
+ "Provide non-empty `content` (markdown) or `sourceFileId` (id of a .md/.txt from writeFile). "
+ "For long documents use writeFile create+append, then renderDocument(sourceFileId=...)."
+ ),
+ )
+
+ modelMaxTokens = context.get("modelMaxOutputTokens", 0)
+ _inlineCharLimit = int(modelMaxTokens * 3 * 0.5) if modelMaxTokens > 0 else 6000
+ _inlineCharLimit = max(_inlineCharLimit, 3000)
+
+ if not sourceFileId and len(content) > _inlineCharLimit:
+ return ToolResult(
+ toolCallId="",
+ toolName="renderDocument",
+ success=False,
+ error=(
+ f"Inline `content` is {len(content)} chars — over the {_inlineCharLimit} char limit "
+ f"(derived from model output budget of {modelMaxTokens} tokens). "
+ "Large documents must use the file path:\n"
+ "1. writeFile(mode='create', name='draft.md', content=<first part>)\n"
+ "2. writeFile(mode='append', fileId=<fileId>, content=<next part>) — repeat as needed\n"
+ "3. renderDocument(sourceFileId=<fileId>, outputFormat='pdf', title='...')\n"
+ "This avoids output truncation entirely."
+ ),
+ )
try:
structuredContent = _markdownToDocumentJson(content, title, language)
@@ -2321,20 +2385,26 @@ def _registerCoreTools(registry: ToolRegistry, services):
registry.register(
"renderDocument", _renderDocument,
description=(
- "Render markdown content into a document file (PDF, DOCX, XLSX, PPTX, CSV, HTML, MD, JSON, TXT). "
- "You write the full document content as markdown, then this tool converts and renders it. "
- "To embed images from uploaded files, use markdown image syntax with the file ID: . "
- "The images will be resolved from the Knowledge Store and embedded in the output document."
+ "Render markdown into a document file (PDF, DOCX, XLSX, PPTX, CSV, HTML, MD, JSON, TXT). "
+ "For long documents: write markdown with writeFile (mode=create then append chunks), then call this tool with "
+ "`sourceFileId` only (tiny JSON — avoids model output truncation). For short docs you may pass `content` inline. "
+ "Images:  in the markdown."
),
parameters={
"type": "object",
"properties": {
- "content": {"type": "string", "description": "Full document content as markdown (headings, tables, lists, code blocks, paragraphs, images via )"},
+ "content": {
+ "type": "string",
+ "description": "Full markdown inline. Prefer `sourceFileId` when the document is large (many KB).",
+ },
+ "sourceFileId": {
+ "type": "string",
+ "description": "Chat file id of markdown saved via writeFile (create+append). Use this instead of `content` for long PDFs.",
+ },
"outputFormat": {"type": "string", "description": "Target format: pdf, docx, xlsx, pptx, csv, html, md, json, txt", "default": "pdf"},
"title": {"type": "string", "description": "Document title", "default": "Document"},
"language": {"type": "string", "description": "Document language (ISO 639-1)", "default": "de"},
},
- "required": ["content"],
},
readOnly=False,
)
diff --git a/modules/serviceCenter/services/serviceChat/mainServiceChat.py b/modules/serviceCenter/services/serviceChat/mainServiceChat.py
index 3ec1c504..5cc1eb66 100644
--- a/modules/serviceCenter/services/serviceChat/mainServiceChat.py
+++ b/modules/serviceCenter/services/serviceChat/mainServiceChat.py
@@ -508,7 +508,7 @@ class ChatService:
def createDataSource(
self, connectionId: str, sourceType: str, path: str, label: str,
- featureInstanceId: str = None
+ featureInstanceId: str = None, displayPath: str = None,
) -> Dict[str, Any]:
"""Create a new external data source reference."""
from modules.datamodels.datamodelDataSource import DataSource
@@ -517,6 +517,7 @@ class ChatService:
sourceType=sourceType,
path=path,
label=label,
+ displayPath=displayPath,
featureInstanceId=featureInstanceId or self._context.feature_instance_id or "",
mandateId=self._context.mandate_id or "",
userId=self.user.id if self.user else "",
diff --git a/modules/serviceCenter/services/serviceGeneration/renderers/rendererDocx.py b/modules/serviceCenter/services/serviceGeneration/renderers/rendererDocx.py
index 850f6aa8..733b9ade 100644
--- a/modules/serviceCenter/services/serviceGeneration/renderers/rendererDocx.py
+++ b/modules/serviceCenter/services/serviceGeneration/renderers/rendererDocx.py
@@ -281,7 +281,7 @@ class RendererDocx(BaseRenderer):
def _getDefaultStyleSet(self) -> Dict[str, Any]:
"""Default DOCX style set - used when no style instructions present."""
return {
- "title": {"font_size": 24, "color": "#1F4E79", "bold": True, "align": "center"},
+ "title": {"font_size": 24, "color": "#1F4E79", "bold": True, "align": "left"},
"heading1": {"font_size": 18, "color": "#2F2F2F", "bold": True, "align": "left"},
"heading2": {"font_size": 14, "color": "#4F4F4F", "bold": True, "align": "left"},
"paragraph": {"font_size": 11, "color": "#2F2F2F", "bold": False, "align": "left"},
@@ -349,11 +349,11 @@ class RendererDocx(BaseRenderer):
para.runs[0].italic = True
continue
elif element_type == "extracted_text":
- # Extracted text format - render as paragraph
content = element.get("content", "")
source = element.get("source", "")
if content:
- para = doc.add_paragraph(content)
+ para = doc.add_paragraph()
+ self._addMarkdownInlineRuns(para, content)
if source:
para.add_run(f" (Source: {source})").italic = True
continue
@@ -406,6 +406,37 @@ class RendererDocx(BaseRenderer):
# Add error paragraph as fallback
error_para = doc.add_paragraph(f"[Error rendering section: {str(e)}]")
+ # ── Markdown inline → python-docx runs ──────────────────────────────
+ _MD_INLINE_RE = re.compile(
+ r"(\*\*(.+?)\*\*)" # group 1,2: bold
+ r"|(__(.+?)__)" # group 3,4: bold (underscore)
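+ r"|(?<!\*)\*(?!\*)(.+?)(?<!\*)\*(?!\*)" # group 5: italic
+ r"|(?<!_)_(?!_)(.+?)(?<!_)_(?!_)" # group 6: italic (underscore)
+ r"|`(.+?)`" # group 7: inline code
+ )
+
+ def _addMarkdownInlineRuns(self, paragraph, text: str) -> None: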
+ """Parse markdown inline formatting and add corresponding Runs to a python-docx paragraph."""
+ pos = 0
+ for m in self._MD_INLINE_RE.finditer(text):
+ if m.start() > pos:
+ paragraph.add_run(text[pos:m.start()])
+ if m.group(2):
+ paragraph.add_run(m.group(2)).bold = True
+ elif m.group(4):
+ paragraph.add_run(m.group(4)).bold = True
+ elif m.group(5):
+ paragraph.add_run(m.group(5)).italic = True
+ elif m.group(6):
+ paragraph.add_run(m.group(6)).italic = True
+ elif m.group(7):
+ run = paragraph.add_run(m.group(7))
+ run.font.name = "Courier New"
+ run.font.size = Pt(9)
+ pos = m.end()
+ if pos < len(text):
+ paragraph.add_run(text[pos:])
+
def _renderJsonTable(self, doc: Document, table_data: Dict[str, Any], styles: Dict[str, Any]) -> None:
"""
Render a JSON table to DOCX using AI-generated styles.
@@ -480,9 +511,8 @@ class RendererDocx(BaseRenderer):
tblW.set(qn('w:w'), '0')
tblPr.append(tblW)
- # Center alignment
jc = OxmlElement('w:jc')
- jc.set(qn('w:val'), 'center')
+ jc.set(qn('w:val'), 'left')
tblPr.append(jc)
# Apply table borders directly (works without template styles)
@@ -821,10 +851,11 @@ class RendererDocx(BaseRenderer):
text_color_rgb = RGBColor(int(color_hex[0:2], 16), int(color_hex[2:4], 16), int(color_hex[4:6], 16))
for item in items:
- if isinstance(item, str):
- para = doc.add_paragraph(item, style='List Bullet')
- elif isinstance(item, dict) and "text" in item:
- para = doc.add_paragraph(item["text"], style='List Bullet')
+ itemText = item if isinstance(item, str) else (item.get("text", "") if isinstance(item, dict) else "")
+ if not itemText:
+ continue
+ para = doc.add_paragraph(style='List Bullet')
+ self._addMarkdownInlineRuns(para, itemText)
# Apply bullet list styling from style set - use cached objects
if bullet_style and para.runs:
@@ -849,7 +880,6 @@ class RendererDocx(BaseRenderer):
def _renderJsonHeading(self, doc: Document, heading_data: Dict[str, Any], styles: Dict[str, Any]) -> None:
"""Render a JSON heading to DOCX using AI-generated styles."""
try:
- # Extract from nested content structure
content = heading_data.get("content", {})
if not isinstance(content, dict):
return
@@ -858,13 +888,13 @@ class RendererDocx(BaseRenderer):
if text:
level = max(1, min(6, level))
- # Use custom heading style if available, otherwise use built-in
- style_name = f"Heading {level}" if level <= 2 else "Heading 1"
+ # python-docx supports Heading 1 – Heading 9 as built-in styles
try:
- para = doc.add_paragraph(text, style=style_name)
- except KeyError:
- # Fallback to built-in heading if custom style doesn't exist
- doc.add_heading(text, level=level)
+ para = doc.add_heading("", level=level)
+ para.clear()
+ self._addMarkdownInlineRuns(para, text)
+ except (KeyError, ValueError):
+ para = doc.add_paragraph(text)
except Exception as e:
self.logger.warning(f"Error rendering heading: {str(e)}")
@@ -893,8 +923,8 @@ class RendererDocx(BaseRenderer):
return
if text:
- para = doc.add_paragraph(text)
- # Apply paragraph styling from style set - OPTIMIZED: pre-calculate style objects
+ para = doc.add_paragraph()
+ self._addMarkdownInlineRuns(para, text)
paragraph_style = styles.get("paragraph", {})
if paragraph_style:
# Pre-calculate and cache style objects
@@ -1345,7 +1375,7 @@ class RendererDocx(BaseRenderer):
# Create table
table = doc.add_table(rows=len(table_data), cols=len(table_data[0]))
- table.alignment = WD_TABLE_ALIGNMENT.CENTER
+ table.alignment = WD_TABLE_ALIGNMENT.LEFT
# Add data to table
for row_idx, row_data in enumerate(table_data):
diff --git a/modules/serviceCenter/services/serviceGeneration/renderers/rendererPdf.py b/modules/serviceCenter/services/serviceGeneration/renderers/rendererPdf.py
index 6cbc8a9c..a5c9dc93 100644
--- a/modules/serviceCenter/services/serviceGeneration/renderers/rendererPdf.py
+++ b/modules/serviceCenter/services/serviceGeneration/renderers/rendererPdf.py
@@ -4,6 +4,10 @@
PDF renderer for report generation using reportlab.
"""
+from __future__ import annotations
+
+import unicodedata
+
from .documentRendererBaseTemplate import BaseRenderer
from modules.datamodels.datamodelDocument import RenderedDocument
from typing import Dict, Any, List, Optional
@@ -11,8 +15,8 @@ import io
import base64
try:
- from reportlab.lib.pagesizes import letter, A4
- from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, Table, TableStyle, PageBreak
+ from reportlab.lib.pagesizes import A4
+ from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, Table, TableStyle, Preformatted
from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
from reportlab.lib.units import inch
from reportlab.lib import colors
@@ -21,6 +25,53 @@ try:
except ImportError:
REPORTLAB_AVAILABLE = False
+import re as _re_pdf
+
+# A4 width in pt; margins must match SimpleDocTemplate(leftMargin/rightMargin)
+_PDF_MARGIN_LR_PT = 72.0
+_PDF_A4_WIDTH_PT = 595.27
+_PDF_CONTENT_WIDTH_PT = _PDF_A4_WIDTH_PT - (2 * _PDF_MARGIN_LR_PT)
+
+
+def _boxDrawingCharToAscii(ch: str) -> str:
+ """Map one box-drawing character to ASCII (Courier has no glyphs for U+2500–U+257F)."""
+ nm = unicodedata.name(ch, "")
+ v = "VERTICAL" in nm
+ h = "HORIZONTAL" in nm
+ and_ = "AND" in nm
+ if v and h:
+ return "+"
+ if v and not h and not and_:
+ return "|"
+ if h and not v and not and_:
+ return "-"
+ return "+"
+
+
+def _normalizePdfMonospaceText(text: str) -> str:
+ """Replace Unicode box/block drawing with ASCII so PDF core fonts render readable code/trees."""
+ if not text:
+ return ""
+ out: List[str] = []
+ for ch in text:
+ o = ord(ch)
+ if 0x2500 <= o <= 0x257F:
+ out.append(_boxDrawingCharToAscii(ch))
+ elif 0x2580 <= o <= 0x259F:
+ out.append("#")
+ else:
+ out.append(ch)
+ return "".join(out)
+
+
+def _prepareCodeBlockPlainText(text: str) -> str:
+ """Normalize newlines/tabs for preformatted code (no HTML/XML; spaces must stay significant)."""
+ if not text:
+ return ""
+ text = text.replace("\r\n", "\n").replace("\r", "\n")
+ return text.expandtabs(4)
+
+
class RendererPdf(BaseRenderer):
"""Renders content to PDF format using reportlab."""
@@ -122,15 +173,6 @@ class RendererPdf(BaseRenderer):
# Extract sections and metadata from standardized schema
sections = self._extractSections(json_content)
- metadata = self._extractMetadata(json_content)
-
- # Use provided title (which comes from documents[].title) as primary source
- # Fallback to metadata.title only if title parameter is empty
- document_title = title if title else metadata.get("title", "Generated Document")
-
- # Make title shorter to prevent wrapping/overlapping
- if len(document_title) > 40:
- document_title = "PowerOn - Consent Agreement"
# Create a buffer to hold the PDF
buffer = io.BytesIO()
@@ -145,17 +187,9 @@ class RendererPdf(BaseRenderer):
bottomMargin=18
)
- # Build PDF content
+ # Build PDF content (no cover page — body starts on page 1; filename still uses `title`)
story = []
- # Title page
- title_style = self._createTitleStyle(styles)
- story.append(Paragraph(document_title, title_style))
- story.append(Spacer(1, 50)) # Increased spacing to prevent overlap
- story.append(Paragraph(f"Generated: {self._formatTimestamp()}", self._createNormalStyle(styles)))
- story.append(Spacer(1, 30)) # Add spacing before page break
- story.append(PageBreak())
-
# Process each section (sections already extracted above)
self.services.utils.debugLogToFile(f"PDF SECTIONS TO PROCESS: {len(sections)} sections", "PDF_RENDERER")
for i, section in enumerate(sections):
@@ -164,10 +198,9 @@ class RendererPdf(BaseRenderer):
self.services.utils.debugLogToFile(f"PDF SECTION {i} ELEMENTS: {len(section_elements)} elements", "PDF_RENDERER")
story.extend(section_elements)
- # Build PDF
- doc.build(story)
+ # Build PDF — retry with oversized flowables removed on LayoutError
+ self._buildPdfWithOverflowGuard(doc, story, buffer)
- # Get PDF content as base64
buffer.seek(0)
pdf_bytes = buffer.getvalue()
pdf_base64 = base64.b64encode(pdf_bytes).decode('utf-8')
@@ -177,6 +210,42 @@ class RendererPdf(BaseRenderer):
except Exception as e:
self.logger.error(f"Error generating PDF from JSON: {str(e)}")
raise Exception(f"PDF generation failed: {str(e)}")
+
+ def _buildPdfWithOverflowGuard(self, doc, story: List[Any], buffer) -> None:
+ """Try doc.build(); on 'too large on page' LayoutError, drop the offending
+ flowable, log a warning, and retry (up to 5 times)."""
+ maxRetries = 5
+ for attempt in range(maxRetries + 1):
+ try:
+ buffer.seek(0)
+ buffer.truncate()
+ doc.build(story)
+ return
+ except Exception as e:
+ msg = str(e)
+ if "too large on page" not in msg or attempt == maxRetries:
+ raise
+ # Identify the offending flowable from the error repr
+ self.logger.warning(f"PDF overflow (attempt {attempt + 1}): {msg} — removing oversized element and retrying")
+ removed = False
+ for idx, flowable in enumerate(story):
+ fRepr = repr(flowable)
+ if "Table" in fRepr and hasattr(flowable, '_cellvalues'):
+ try:
+ nRows = len(flowable._cellvalues)
+ nCols = len(flowable._cellvalues[0]) if flowable._cellvalues else 0
+ if nRows == 1 and nCols == 1:
+ errPara = Paragraph(
+ "[Code block omitted — content too large for PDF page]",
+ self._createNormalStyle({}),
+ )
+ story[idx] = errPara
+ removed = True
+ break
+ except Exception:
+ pass
+ if not removed:
+ raise
async def _getStyleSet(self, extractedContent: Dict[str, Any] = None, userPrompt: str = None, aiService=None, templateName: str = None) -> Dict[str, Any]:
"""Get style set - use styles from document generation metadata if available,
@@ -269,13 +338,18 @@ class RendererPdf(BaseRenderer):
"""Default PDF style set - used when no style instructions present."""
return {
"title": {"font_size": 24, "color": "#1F4E79", "bold": True, "align": "center", "space_after": 30},
+ # Markdown #..###### — sizes must strictly decrease (H1 largest … H6 smallest).
"heading1": {"font_size": 18, "color": "#2F2F2F", "bold": True, "align": "left", "space_after": 12, "space_before": 12},
- "heading2": {"font_size": 14, "color": "#4F4F4F", "bold": True, "align": "left", "space_after": 8, "space_before": 8},
+ "heading2": {"font_size": 15, "color": "#2F2F2F", "bold": True, "align": "left", "space_after": 10, "space_before": 10},
+ "heading3": {"font_size": 13, "color": "#4F4F4F", "bold": True, "align": "left", "space_after": 8, "space_before": 8},
+ "heading4": {"font_size": 12, "color": "#4F4F4F", "bold": True, "align": "left", "space_after": 6, "space_before": 6},
+ "heading5": {"font_size": 11, "color": "#4F4F4F", "bold": True, "align": "left", "space_after": 6, "space_before": 6},
+ "heading6": {"font_size": 10, "color": "#4F4F4F", "bold": True, "align": "left", "space_after": 4, "space_before": 4},
"paragraph": {"font_size": 11, "color": "#2F2F2F", "bold": False, "align": "left", "space_after": 6, "line_height": 1.2},
- "table_header": {"background": "#4F4F4F", "text_color": "#FFFFFF", "bold": True, "align": "center", "font_size": 12},
+ "table_header": {"background": "#4F4F4F", "text_color": "#FFFFFF", "bold": True, "align": "left", "font_size": 12},
"table_cell": {"background": "#FFFFFF", "text_color": "#2F2F2F", "bold": False, "align": "left", "font_size": 10},
"bullet_list": {"font_size": 11, "color": "#2F2F2F", "space_after": 3},
- "code_block": {"font": "Courier", "font_size": 9, "color": "#2F2F2F", "background": "#F5F5F5", "space_after": 6}
+ "code_block": {"font": "Courier", "font_size": 9, "color": "#2F2F2F", "background": "#F5F5F5", "space_after": 6, "align": "left"}
}
async def _getAiStylesWithPdfColors(self, ai_service, style_template: str, default_styles: Dict[str, Any]) -> Dict[str, Any]:
@@ -441,39 +515,35 @@ class RendererPdf(BaseRenderer):
return color_value
return default
-
- def _createTitleStyle(self, styles: Dict[str, Any]) -> ParagraphStyle:
- """Create title style from style definitions."""
- title_style_def = styles.get("title", {})
-
- # DEBUG: Show what color and spacing is being used for title
- title_color = title_style_def.get("color", "#1F4E79")
- title_space_after = title_style_def.get("space_after", 30)
- self.services.utils.debugLogToFile(f"PDF TITLE COLOR: {title_color} -> {self._hexToColor(title_color)}", "PDF_RENDERER")
- self.services.utils.debugLogToFile(f"PDF TITLE SPACE_AFTER: {title_space_after}", "PDF_RENDERER")
-
- return ParagraphStyle(
- 'CustomTitle',
- fontSize=title_style_def.get("font_size", 20), # Reduced from 24 to 20
- spaceAfter=title_style_def.get("space_after", 30),
- alignment=self._getAlignment(title_style_def.get("align", "center")),
- textColor=self._hexToColor(title_color),
- leading=title_style_def.get("font_size", 20) * 1.4, # Add line spacing for multi-line titles
- spaceBefore=0 # Ensure no space before title
- )
-
+ def _defaultHeadingStyleDef(self, level: int) -> Dict[str, Any]:
+ """When heading{N} is missing from styles, never fall back to heading1 (that made H3 > H2)."""
+ sizes = {1: 18, 2: 15, 3: 13, 4: 12, 5: 11, 6: 10}
+ fs = sizes.get(level, 10)
+ sb = max(4, 14 - level)
+ return {
+ "font_size": fs,
+ "color": "#2F2F2F" if level <= 2 else "#4F4F4F",
+ "bold": True,
+ "align": "left",
+ "space_after": sb,
+ "space_before": sb,
+ }
+
def _createHeadingStyle(self, styles: Dict[str, Any], level: int) -> ParagraphStyle:
"""Create heading style from style definitions."""
heading_key = f"heading{level}"
- heading_style_def = styles.get(heading_key, styles.get("heading1", {}))
-
+ heading_style_def = styles.get(heading_key) or self._defaultHeadingStyleDef(level)
+ fs = heading_style_def.get("font_size", self._defaultHeadingStyleDef(level)["font_size"])
+ bold = heading_style_def.get("bold", True)
return ParagraphStyle(
f'CustomHeading{level}',
- fontSize=heading_style_def.get("font_size", 18 - level * 2),
+ fontName="Helvetica-Bold" if bold else "Helvetica",
+ fontSize=fs,
spaceAfter=heading_style_def.get("space_after", 12),
spaceBefore=heading_style_def.get("space_before", 12),
alignment=self._getAlignment(heading_style_def.get("align", "left")),
- textColor=self._hexToColor(heading_style_def.get("color", "#2F2F2F"))
+ textColor=self._hexToColor(heading_style_def.get("color", "#2F2F2F")),
+ leading=fs * 1.35,
)
def _createNormalStyle(self, styles: Dict[str, Any]) -> ParagraphStyle:
@@ -505,22 +575,6 @@ class RendererPdf(BaseRenderer):
}
return align_map.get(align.lower().strip(), TA_LEFT)
- def _getTableAlignment(self, align: str) -> str:
- """Convert alignment string to ReportLab table alignment string."""
- if not align or not isinstance(align, str):
- return 'LEFT'
-
- align_map = {
- "center": 'CENTER',
- "left": 'LEFT',
- "justify": 'LEFT', # Tables don't support justify, use LEFT
- "right": 'RIGHT',
- "0": 'LEFT', # Handle numeric strings
- "1": 'CENTER',
- "2": 'LEFT' # Tables don't support justify, use LEFT
- }
- return align_map.get(align.lower().strip(), 'LEFT')
-
def _hexToColor(self, hex_color: str) -> colors.Color:
"""Convert hex color to reportlab color."""
try:
@@ -542,7 +596,66 @@ class RendererPdf(BaseRenderer):
return colors.black
except:
return colors.black
-
+
+ def _escapeReportlabXml(self, text: str) -> str:
+ """Escape text for ReportLab Paragraph markup."""
+ if not text:
+ return ""
+ return (
+ text.replace("&", "&amp;")
+ .replace("<", "&lt;")
+ .replace(">", "&gt;")
+ )
+
+ def _applyInlineMarkdownToEscapedPlain(self, text: str) -> str:
+ """Escape XML then apply bold/italic to a segment with no `code` spans (code is handled separately)."""
+ if not text:
+ return ""
+ s = self._escapeReportlabXml(text)
+ s = _re_pdf.sub(r"\*\*(.+?)\*\*", r"<b>\1</b>", s, flags=_re_pdf.DOTALL)
+ s = _re_pdf.sub(r"__(.+?)__", r"<b>\1</b>", s, flags=_re_pdf.DOTALL)
+ s = _re_pdf.sub(r"(?<!\*)\*(?!\*)(.+?)(?<!\*)\*(?!\*)", r"<i>\1</i>", s)
+ s = _re_pdf.sub(r"(?<!_)_(?!_)(.+?)(?<!_)_(?!_)", r"<i>\1</i>", s)
+ return s
+
+ def _markdownInlineToReportlabXml(self, text: str) -> str:
+ """Turn common markdown inline (**bold**, *italic*, `code`) into ReportLab XML.
+ Backtick spans are extracted first so paths like `...//...` are not corrupted by
+ markdown patterns and XML escaping stays well-formed inside <font>.
+ """
+ if not text:
+ return ""
+ text = _normalizePdfMonospaceText(text)
+ out: List[str] = []
+ pos = 0
+ for m in _re_pdf.finditer(r"`([^`]*)`", text):
+ before = text[pos:m.start()]
+ out.append(self._applyInlineMarkdownToEscapedPlain(before))
+ code = m.group(1)
+ out.append(f'<font name="Courier">{self._escapeReportlabXml(code)}</font>')
+ pos = m.end()
+ out.append(self._applyInlineMarkdownToEscapedPlain(text[pos:]))
+ return "".join(out)
+
+ def _paragraphFromInlineMarkdown(self, text: str, style: ParagraphStyle) -> Paragraph:
+ return Paragraph(self._markdownInlineToReportlabXml(text), style)
+
+ def _createTableCellParagraphStyle(
+ self, styles: Dict[str, Any], *, header: bool, tableStyleKey: str
+ ) -> ParagraphStyle:
+ """Paragraph style for table cells (word wrap within colWidth)."""
+ tdef = styles.get(tableStyleKey, {})
+ fs = tdef.get("font_size", 12 if header else 10)
+ defaultTc = "#FFFFFF" if header else "#2F2F2F"
+ return ParagraphStyle(
+ f"TblCell{'H' if header else 'B'}{tableStyleKey}",
+ fontSize=fs,
+ leading=fs * 1.25,
+ alignment=TA_LEFT,
+ textColor=self._hexToColor(tdef.get("text_color", defaultTc)),
+ fontName="Helvetica-Bold" if header and tdef.get("bold", True) else "Helvetica",
+ )
+
def _renderJsonSection(self, section: Dict[str, Any], styles: Dict[str, Any]) -> List[Any]:
"""Render a single JSON section to PDF elements using AI-generated styles.
Supports three content formats: reference, object (base64), extracted_text.
@@ -575,8 +688,10 @@ class RendererPdf(BaseRenderer):
content = element.get("content", "")
source = element.get("source", "")
if content:
- source_text = f" (Source: {source})" if source else ""
- all_elements.append(Paragraph(f"{content}{source_text}", self._createNormalStyle(styles)))
+ bodyXml = self._markdownInlineToReportlabXml(content)
+ if source:
+ bodyXml = f"{bodyXml} (Source: {self._escapeReportlabXml(source)})"
+ all_elements.append(Paragraph(bodyXml, self._createNormalStyle(styles)))
all_elements.append(Spacer(1, 6))
continue
@@ -618,10 +733,8 @@ class RendererPdf(BaseRenderer):
return [Paragraph(f"[Error rendering section: {str(e)}]", self._createNormalStyle(styles))]
def _renderJsonTable(self, table_data: Dict[str, Any], styles: Dict[str, Any]) -> List[Any]:
- """Render a JSON table to PDF elements using AI-generated styles."""
+ """Render a JSON table: left-aligned, width capped to printable area, cells wrap."""
try:
- # Handle nested content structure: element.content.headers vs element.headers
- # Extract from nested content structure
content = table_data.get("content", {})
if not isinstance(content, dict):
return []
@@ -631,30 +744,43 @@ class RendererPdf(BaseRenderer):
if not headers or not rows:
return []
- # Prepare table data
- table_data_list = [headers] + rows
-
- # Create table
- table = Table(table_data_list)
-
- # Apply styling
+ numCols = len(headers)
+ colWidth = _PDF_CONTENT_WIDTH_PT / max(numCols, 1)
+ colWidths = [colWidth] * numCols
+
+ hdrPs = self._createTableCellParagraphStyle(styles, header=True, tableStyleKey="table_header")
+ cellPs = self._createTableCellParagraphStyle(styles, header=False, tableStyleKey="table_cell")
+
+ def _cellPara(val, ps):
+ return self._paragraphFromInlineMarkdown(str(val) if val is not None else "", ps)
+
+ headerRow = [_cellPara(h, hdrPs) for h in headers]
+ bodyRows = []
+ for row in rows:
+ padded = list(row) + [""] * max(0, numCols - len(row))
+ padded = padded[:numCols]
+ bodyRows.append([_cellPara(c, cellPs) for c in padded])
+
+ table_matrix = [headerRow] + bodyRows
+ table = Table(table_matrix, colWidths=colWidths, repeatRows=1)
+
table_header_style = styles.get("table_header", {})
table_cell_style = styles.get("table_cell", {})
-
+
table_style = [
- ('BACKGROUND', (0, 0), (-1, 0), self._hexToColor(table_header_style.get("background", "#4F4F4F"))),
- ('TEXTCOLOR', (0, 0), (-1, 0), self._hexToColor(table_header_style.get("text_color", "#FFFFFF"))),
- ('ALIGN', (0, 0), (-1, -1), self._getTableAlignment(table_cell_style.get("align", "left"))),
- ('FONTNAME', (0, 0), (-1, 0), 'Helvetica-Bold' if table_header_style.get("bold", True) else 'Helvetica'),
- ('FONTSIZE', (0, 0), (-1, 0), table_header_style.get("font_size", 12)),
- ('BOTTOMPADDING', (0, 0), (-1, 0), 12),
- ('BACKGROUND', (0, 1), (-1, -1), self._hexToColor(table_cell_style.get("background", "#FFFFFF"))),
- ('FONTSIZE', (0, 1), (-1, -1), table_cell_style.get("font_size", 10)),
- ('GRID', (0, 0), (-1, -1), 1, colors.black)
+ ("BACKGROUND", (0, 0), (-1, 0), self._hexToColor(table_header_style.get("background", "#4F4F4F"))),
+ ("BACKGROUND", (0, 1), (-1, -1), self._hexToColor(table_cell_style.get("background", "#FFFFFF"))),
+ ("ALIGN", (0, 0), (-1, -1), "LEFT"),
+ ("VALIGN", (0, 0), (-1, -1), "TOP"),
+ ("LEFTPADDING", (0, 0), (-1, -1), 4),
+ ("RIGHTPADDING", (0, 0), (-1, -1), 4),
+ ("TOPPADDING", (0, 0), (-1, 0), 6),
+ ("BOTTOMPADDING", (0, 0), (-1, 0), 8),
+ ("TOPPADDING", (0, 1), (-1, -1), 4),
+ ("BOTTOMPADDING", (0, 1), (-1, -1), 4),
+ ("GRID", (0, 0), (-1, -1), 0.5, colors.black),
]
-
table.setStyle(TableStyle(table_style))
-
return [table, Spacer(1, 12)]
except Exception as e:
@@ -674,9 +800,16 @@ class RendererPdf(BaseRenderer):
elements = []
for item in items:
if isinstance(item, str):
- elements.append(Paragraph(f"• {item}", self._createNormalStyle(styles)))
+ elements.append(
+ Paragraph(f"• {self._markdownInlineToReportlabXml(item)}", self._createNormalStyle(styles))
+ )
elif isinstance(item, dict) and "text" in item:
- elements.append(Paragraph(f"• {item['text']}", self._createNormalStyle(styles)))
+ elements.append(
+ Paragraph(
+ f"• {self._markdownInlineToReportlabXml(item['text'])}",
+ self._createNormalStyle(styles),
+ )
+ )
if elements:
elements.append(Spacer(1, bullet_style_def.get("space_after", 3)))
@@ -700,7 +833,7 @@ class RendererPdf(BaseRenderer):
if text:
level = max(1, min(6, level))
heading_style = self._createHeadingStyle(styles, level)
- return [Paragraph(text, heading_style)]
+ return [self._paragraphFromInlineMarkdown(text, heading_style)]
return []
@@ -721,7 +854,7 @@ class RendererPdf(BaseRenderer):
text = ""
if text:
- return [Paragraph(text, self._createNormalStyle(styles))]
+ return [self._paragraphFromInlineMarkdown(text, self._createNormalStyle(styles))]
return []
@@ -741,27 +874,81 @@ class RendererPdf(BaseRenderer):
code_style_def = styles.get("code_block", {})
if code:
+ code = _prepareCodeBlockPlainText(code)
+ code = _normalizePdfMonospaceText(code)
elements = []
-
+ fs = code_style_def.get("font_size", 9)
+ mono = code_style_def.get("font", "Courier")
+
if language:
lang_style = ParagraphStyle(
- 'CodeLanguage',
- fontSize=code_style_def.get("font_size", 9),
+ "CodeLanguage",
+ fontSize=fs,
textColor=self._hexToColor(code_style_def.get("color", "#2F2F2F")),
- fontName='Helvetica-Bold'
+ fontName="Helvetica-Bold",
+ alignment=TA_LEFT,
)
- elements.append(Paragraph(f"Code ({language}):", lang_style))
-
- code_style = ParagraphStyle(
- 'CodeBlock',
- fontSize=code_style_def.get("font_size", 9),
- textColor=self._hexToColor(code_style_def.get("color", "#2F2F2F")),
- fontName=code_style_def.get("font", "Courier"),
- backColor=self._hexToColor(code_style_def.get("background", "#F5F5F5")),
- spaceAfter=code_style_def.get("space_after", 6)
- )
- elements.append(Paragraph(code, code_style))
-
+ elements.append(
+ Paragraph(
+ self._escapeReportlabXml(f"Code ({language}):"),
+ lang_style,
+ )
+ )
+
+ approxCharWPt = max(fs * 0.52, 4.5)
+ usableWidth = _PDF_CONTENT_WIDTH_PT - 16 # left+right padding
+ maxLineChars = max(48, int(usableWidth / approxCharWPt))
+ bg_col = self._hexToColor(code_style_def.get("background", "#F5F5F5"))
+ leading = fs * 1.2
+ spaceAfter = code_style_def.get("space_after", 6)
+
+ # Each source line may wrap to ceil(len/maxLineChars) visual lines.
+ # Frame height ~740pt minus padding → keep rendered height < 600pt.
+ maxVisualLinesPerChunk = max(8, int(600 / leading))
+ srcLines = code.split("\n")
+ chunks: List[List[str]] = []
+ curChunk: List[str] = []
+ curVisual = 0
+ for sl in srcLines:
+ wrapped = max(1, -(-len(sl) // maxLineChars)) if sl else 1
+ if curVisual + wrapped > maxVisualLinesPerChunk and curChunk:
+ chunks.append(curChunk)
+ curChunk = []
+ curVisual = 0
+ curChunk.append(sl)
+ curVisual += wrapped
+ if curChunk:
+ chunks.append(curChunk)
+
+ for ci, chunkLines in enumerate(chunks):
+ chunkText = "\n".join(chunkLines)
+ styleId = f"CodePre_{id(code_data) & 0xFFFFFFFF}_{ci}"
+ codePrStyle = ParagraphStyle(
+ styleId,
+ fontName=mono,
+ fontSize=fs,
+ leading=leading,
+ textColor=self._hexToColor(code_style_def.get("color", "#2F2F2F")),
+ alignment=TA_LEFT,
+ leftIndent=0,
+ rightIndent=0,
+ )
+ pf = Preformatted(chunkText, codePrStyle, dedent=0, maxLineLength=maxLineChars)
+ tbl = Table([[pf]], colWidths=[_PDF_CONTENT_WIDTH_PT])
+ tbl.setStyle(
+ TableStyle(
+ [
+ ("BACKGROUND", (0, 0), (-1, -1), bg_col),
+ ("VALIGN", (0, 0), (-1, -1), "TOP"),
+ ("LEFTPADDING", (0, 0), (-1, -1), 8),
+ ("RIGHTPADDING", (0, 0), (-1, -1), 8),
+ ("TOPPADDING", (0, 0), (-1, -1), 6),
+ ("BOTTOMPADDING", (0, 0), (-1, -1), 6),
+ ]
+ )
+ )
+ tbl.spaceAfter = 0 if ci < len(chunks) - 1 else spaceAfter
+ elements.append(tbl)
return elements
return []
diff --git a/modules/serviceCenter/services/serviceGeneration/renderers/rendererPptx.py b/modules/serviceCenter/services/serviceGeneration/renderers/rendererPptx.py
index 800b21ba..3bdff7f1 100644
--- a/modules/serviceCenter/services/serviceGeneration/renderers/rendererPptx.py
+++ b/modules/serviceCenter/services/serviceGeneration/renderers/rendererPptx.py
@@ -13,6 +13,15 @@ from modules.datamodels.datamodelDocument import RenderedDocument
logger = logging.getLogger(__name__)
+_PPTX_MD_INLINE_RE = re.compile(
+ r"(\*\*(.+?)\*\*)"
+ r"|(__(.+?)__)"
+ r"|(? 0
hasImages = len(slide_images) > 0
+ isTitleSlide = slide_data.get("_isTitleSlide", False)
- logger.info(f"Slide {i+1}: '{slide_data.get('title', 'No title')}' - sections: {len(slide_sections)}, images: {len(slide_images)}, content: {len(slide_content)} chars")
+ logger.info(f"Slide {i+1}: '{slide_data.get('title', 'No title')}' - sections: {len(slide_sections)}, images: {len(slide_images)}, content: {len(slide_content)} chars, titleSlide={isTitleSlide}")
- # Use blank layout for all slides to avoid placeholder interference
- # Find blank layout (typically index 6, fallback to 5)
+ # Title slide uses the built-in Title Slide layout (index 0)
+ if isTitleSlide:
+ titleLayout = prs.slide_layouts[0]
+ slide = prs.slides.add_slide(titleLayout)
+ try:
+ titleShape = slide.shapes.title
+ titleShape.text = slide_data.get("title", "")
+ titleStyle = styles.get("title", {})
+ tf = titleShape.text_frame
+ if tf.paragraphs:
+ p = tf.paragraphs[0]
+ p.font.size = Pt(titleStyle.get("font_size", 36))
+ p.font.bold = titleStyle.get("bold", True)
+ tColor = self._getSafeColor(titleStyle.get("color", (31, 78, 121)))
+ p.font.color.rgb = RGBColor(*tColor)
+ except Exception as titleErr:
+ logger.warning(f"Could not style title slide: {titleErr}")
+ # Clear subtitle placeholder
+ try:
+ sub = slide.placeholders[1]
+ sub.text = ""
+ except (KeyError, IndexError):
+ pass
+ continue
+
+ # Content slides: use blank layout
slideLayoutIndex = None
for idx in [6, 5]:
if idx < len(prs.slide_layouts):
try:
layout = prs.slide_layouts[idx]
- # Check if it's a blank layout (no placeholders)
if len(layout.placeholders) == 0:
slideLayoutIndex = idx
break
except (AttributeError, IndexError):
continue
- # If no blank layout found, use layout with fewest placeholders
if slideLayoutIndex is None:
- min_placeholders = float('inf')
+ minPh = float('inf')
for idx in range(len(prs.slide_layouts)):
try:
layout = prs.slide_layouts[idx]
- placeholder_count = len(layout.placeholders) if hasattr(layout, 'placeholders') else 0
- if placeholder_count < min_placeholders:
- min_placeholders = placeholder_count
+ phCount = len(layout.placeholders) if hasattr(layout, 'placeholders') else 0
+ if phCount < minPh:
+ minPh = phCount
slideLayoutIndex = idx
except:
continue
- # Fallback to first layout if still None
if slideLayoutIndex is None:
slideLayoutIndex = 0
slide_layout = prs.slide_layouts[slideLayoutIndex]
slide = prs.slides.add_slide(slide_layout)
- # Clear placeholder text instead of removing placeholders (safer approach)
- # This avoids corrupting the PPTX file structure
try:
for shape in slide.shapes:
if hasattr(shape, 'is_placeholder') and shape.is_placeholder:
try:
if hasattr(shape, 'text_frame'):
shape.text_frame.clear()
- # Set text to empty string to remove "Click to add text"
if len(shape.text_frame.paragraphs) > 0:
shape.text_frame.paragraphs[0].text = ""
except:
@@ -156,7 +184,7 @@ class RendererPptx(BaseRenderer):
except Exception as placeholder_error:
logger.warning(f"Could not clear placeholders: {str(placeholder_error)}")
- # Add title as textbox (smaller size for slides)
+ # Add title as textbox
from pptx.util import Inches
titleBox = slide.shapes.add_textbox(Inches(0.5), Inches(0.2), prs.slide_width - Inches(1), Inches(0.6))
titleFrame = titleBox.text_frame
@@ -232,15 +260,14 @@ class RendererPptx(BaseRenderer):
else:
p.alignment = PP_ALIGN.LEFT
- # If no slides were created, create a default slide
+ # If no slides were created, create a single slide with the document title
if not slidesData:
- slide_layout = prs.slide_layouts[0] # Title slide layout
+ slide_layout = prs.slide_layouts[0]
slide = prs.slides.add_slide(slide_layout)
title_shape = slide.shapes.title
title_shape.text = title
- # Apply title styling to default slide
title_style = styles.get("title", {})
if title_shape.text_frame.paragraphs[0].font:
title_shape.text_frame.paragraphs[0].font.size = Pt(title_style.get("font_size", 48))
@@ -248,16 +275,12 @@ class RendererPptx(BaseRenderer):
title_color = self._getSafeColor(title_style.get("color", (31, 78, 121)))
title_shape.text_frame.paragraphs[0].font.color.rgb = RGBColor(*title_color)
- subtitle_shape = slide.placeholders[1]
- subtitle_shape.text = "Generated by PowerOn AI System"
-
- # Apply subtitle styling
- paragraph_style = styles.get("paragraph", {})
- if subtitle_shape.text_frame.paragraphs[0].font:
- subtitle_shape.text_frame.paragraphs[0].font.size = Pt(paragraph_style.get("font_size", 20))
- subtitle_shape.text_frame.paragraphs[0].font.bold = paragraph_style.get("bold", False)
- paragraph_color = self._get_safe_color(paragraph_style.get("color", (47, 47, 47)))
- subtitle_shape.text_frame.paragraphs[0].font.color.rgb = RGBColor(*paragraph_color)
+ # Clear subtitle placeholder instead of adding filler text
+ try:
+ subtitle_shape = slide.placeholders[1]
+ subtitle_shape.text = ""
+ except (KeyError, IndexError):
+ pass
# Save to buffer
buffer = io.BytesIO()
@@ -625,24 +648,23 @@ JSON ONLY. NO OTHER TEXT."""
sections = self._extractSections(json_content)
metadata = self._extractMetadata(json_content)
- # Use provided title (which comes from documents[].title) as primary source
- # Fallback to metadata.title only if title parameter is empty
document_title = title if title else metadata.get("title", "Generated Document")
- # Create title slide
+ # Title slide (clean — just the document title, no filler text)
slides.append({
"title": document_title,
- "content": "Generated by PowerOn AI System\n\n" + self._formatTimestamp()
+ "content": "",
+ "_isTitleSlide": True,
})
- # Process sections into slides based on content and user intent
- slides.extend(self._createSlidesFromSections(sections, styles))
-
- # If no content slides were created, create a default content slide
- if len(slides) == 1: # Only title slide
+ # Content slides split by chapter headings
+ contentSlides = self._createSlidesFromSections(sections, styles)
+ if contentSlides:
+ slides.extend(contentSlides)
+ else:
slides.append({
"title": "Content Overview",
- "content": "No structured content found in the source documents.\n\nPlease check the source documents and try again."
+ "content": ""
})
return slides
@@ -941,9 +963,8 @@ JSON ONLY. NO OTHER TEXT."""
content = slide_data.get("content", "")
title = slide_data.get("title", "")
- # Check if it's a title slide (first slide)
- if not content or "Generated by PowerOn AI System" in content:
- return 0 # Title slide layout
+ if not content:
+ return 0
# Professional layout selection based on content
if "|" in content and "-" in content:
@@ -970,67 +991,71 @@ JSON ONLY. NO OTHER TEXT."""
return 1 # Default to title and content layout
def _createSlidesFromSections(self, sections: List[Dict[str, Any]], styles: Dict[str, Any]) -> List[Dict[str, Any]]:
- """Create slides from sections: each heading level 1 (chapter) creates a new slide, content accumulates until next level 1 heading."""
+ """Create slides from sections: each top-level heading creates a new slide.
+
+ The split level is determined dynamically: if there is exactly one H1 (the
+ document title), chapters are H2; otherwise chapters are H1.
+ """
try:
+ # First pass: discover heading levels to choose the split level
+ headingLevels: List[int] = []
+ for section in sections:
+ if section.get("content_type") == "heading":
+ for el in section.get("elements", []):
+ if isinstance(el, dict):
+ c = el.get("content", {})
+ if isinstance(c, dict):
+ headingLevels.append(c.get("level", 1))
+
+ h1Count = headingLevels.count(1)
+ h2Count = headingLevels.count(2)
+ # If there's at most one H1 but multiple H2s, split on H2
+ splitLevel = 2 if h1Count <= 1 and h2Count > 1 else 1
+
slides = []
- current_slide_sections = [] # Store sections (not formatted text) for proper rendering
- current_slide_title = "Content Overview"
+ currentSlideSections = []
+ currentSlideTitle = "Content Overview"
for section in sections:
- section_type = section.get("content_type", "paragraph")
+ sectionType = section.get("content_type", "paragraph")
elements = section.get("elements", [])
- # Skip sections with no elements (unless they're headings that should create new slides)
- if not elements and section_type != "heading":
+ if not elements and sectionType != "heading":
continue
- if section_type == "heading":
- # Extract heading level
- level = 1 # Default
- heading_text = ""
+ if sectionType == "heading":
+ level = 1
+ headingText = ""
for element in elements:
if isinstance(element, dict):
- # Extract from nested content structure
content = element.get("content", {})
if isinstance(content, dict):
- heading_text = content.get("text", "")
+ headingText = content.get("text", "")
level = content.get("level", 1)
elif isinstance(content, str):
- heading_text = content
+ headingText = content
level = 1
- # Only level 1 headings (chapters) create new slides
- if level == 1:
- # If we have accumulated content, create a slide
- if current_slide_sections:
+ if level <= splitLevel:
+ if currentSlideSections:
slides.append({
- "title": current_slide_title,
- "sections": current_slide_sections.copy(), # Store sections for proper rendering
+ "title": currentSlideTitle,
+ "sections": currentSlideSections.copy(),
"images": []
})
- current_slide_sections = []
-
- # Start new slide with heading as title
- if heading_text:
- current_slide_title = heading_text
- else:
- # If no heading text found but this is a heading section, use section ID or default
- current_slide_title = section.get("id", "Untitled Section")
+ currentSlideSections = []
+ currentSlideTitle = headingText or section.get("id", "Untitled Section")
else:
- # Level 2+ headings are added as sections to current slide
- current_slide_sections.append(section)
- elif section_type == "image":
- # Images are added to current slide (will be organized in frames)
- current_slide_sections.append(section)
+ currentSlideSections.append(section)
+ elif sectionType == "image":
+ currentSlideSections.append(section)
else:
- # Add section to current slide (will be rendered properly)
- current_slide_sections.append(section)
+ currentSlideSections.append(section)
- # Add final slide if there's content
- if current_slide_sections:
+ if currentSlideSections:
slides.append({
- "title": current_slide_title,
- "sections": current_slide_sections.copy(),
+ "title": currentSlideTitle,
+ "sections": currentSlideSections.copy(),
"images": []
})
@@ -1225,14 +1250,66 @@ JSON ONLY. NO OTHER TEXT."""
import traceback
logger.error(f"Traceback: {traceback.format_exc()}")
- def _addTableToSlide(self, slide, element: Dict[str, Any], styles: Dict[str, Any], top: float, max_width: float = None) -> None:
+ def _addMarkdownInlineRuns(self, paragraph, text: str, fontSize=None, fontColor=None, fontBold=None) -> None:
+ """Parse markdown inline formatting and add Runs to a pptx paragraph.
+
+ Every piece of text is added as an explicit Run with font properties set,
+ so the paragraph never falls back to the slide-master default font.
+ """
+ from pptx.util import Pt
+
+ paragraph.text = ""
+
+ def _applyBase(run, bold=None):
+ if fontSize:
+ run.font.size = fontSize
+ if fontColor:
+ run.font.color.rgb = fontColor
+ if bold is not None:
+ run.font.bold = bold
+ elif fontBold is not None:
+ run.font.bold = fontBold
+
+ pos = 0
+ for m in _PPTX_MD_INLINE_RE.finditer(text):
+ if m.start() > pos:
+ r = paragraph.add_run()
+ r.text = text[pos:m.start()]
+ _applyBase(r)
+ if m.group(2) or m.group(4):
+ r = paragraph.add_run()
+ r.text = m.group(2) or m.group(4)
+ _applyBase(r, bold=True)
+ elif m.group(5) or m.group(6):
+ r = paragraph.add_run()
+ r.text = m.group(5) or m.group(6)
+ r.font.italic = True
+ _applyBase(r)
+ elif m.group(7):
+ r = paragraph.add_run()
+ r.text = m.group(7)
+ r.font.name = "Courier New"
+ if fontSize and hasattr(fontSize, 'pt'):
+ r.font.size = Pt(max(8, int(fontSize.pt * 0.85)))
+ elif fontSize:
+ r.font.size = fontSize
+ if fontColor:
+ r.font.color.rgb = fontColor
+ pos = m.end()
+
+ # Remaining tail (or entire string if no matches)
+ if pos < len(text):
+ r = paragraph.add_run()
+ r.text = text[pos:]
+ _applyBase(r)
+
+ def _addTableToSlide(self, slide, element: Dict[str, Any], styles: Dict[str, Any], top: float = None, max_width: float = None) -> None:
"""Add a PowerPoint table to slide."""
try:
from pptx.util import Inches, Pt
from pptx.enum.text import PP_ALIGN
from pptx.dml.color import RGBColor
- # Extract from nested content structure
content = element.get("content", {})
if not isinstance(content, dict):
return
@@ -1243,11 +1320,9 @@ JSON ONLY. NO OTHER TEXT."""
if not headers:
return
- # Calculate table dimensions
- num_cols = int(len(headers)) # Ensure integer
- num_rows = int(len(rows) + 1) # +1 for header row, ensure integer
+ num_cols = int(len(headers))
+ num_rows = int(len(rows) + 1)
left = Inches(0.5)
- # Get presentation from stored reference or slide
if hasattr(self, '_currentPresentation'):
prs = self._currentPresentation
else:
@@ -1255,7 +1330,15 @@ JSON ONLY. NO OTHER TEXT."""
width = max_width if max_width is not None else (prs.slide_width - Inches(1))
row_height = Inches(0.4)
- # Create table - ensure all parameters are proper types
+ # Auto-calculate top from existing shapes when not specified
+ if top is None:
+ maxBottom = Inches(1.5)
+ for shape in slide.shapes:
+ shapeBottom = shape.top + shape.height
+ if shapeBottom > maxBottom:
+ maxBottom = shapeBottom
+ top = maxBottom + Inches(0.15)
+
table_height = row_height * num_rows
table_shape = slide.shapes.add_table(num_rows, num_cols, left, top, width, table_height)
table = table_shape.table
@@ -1361,109 +1444,49 @@ JSON ONLY. NO OTHER TEXT."""
logger.warning(f"Error adding table to slide: {str(e)}")
def _addBulletListToSlide(self, slide, element: Dict[str, Any], styles: Dict[str, Any], text_frame, font_size_multiplier: float = 1.0) -> None:
- """Add bullet list to slide text frame."""
+ """Add bullet list to slide text frame with consistent formatting."""
try:
from pptx.util import Pt
from pptx.dml.color import RGBColor
from pptx.enum.text import PP_ALIGN
- # Extract from nested content structure
content = element.get("content", {})
if not isinstance(content, dict):
return
-
items = content.get("items", [])
if not items:
return
- list_style = styles.get("bullet_list", {})
- base_font_size = list_style.get("font_size", 14)
- calculated_size = max(10, int(base_font_size * font_size_multiplier)) # Minimum 10pt for readability
+ listStyle = styles.get("paragraph", {})
+ fontSize = Pt(max(10, int(listStyle.get("font_size", 14) * font_size_multiplier)))
+ fontColor = RGBColor(*self._getSafeColor(listStyle.get("color", (47, 47, 47))))
- # Pre-calculate and cache style objects to avoid repeated parsing
- font_size_pt = Pt(calculated_size)
- text_color = self._getSafeColor(list_style.get("color", (47, 47, 47)))
- text_color_rgb = RGBColor(*text_color)
- space_before_pt = Pt(2)
- space_after_pt = Pt(2)
-
- logger.debug(f"Rendering bullet list with {len(items)} items")
-
- for idx, item in enumerate(items):
- try:
- # Get text content first
- if isinstance(item, dict):
- item_text = item.get("text", "")
- else:
- item_text = str(item)
-
- # Skip empty items
- if not item_text or len(item_text.strip()) == 0:
- logger.debug(f"Skipping empty bullet item {idx}")
- continue
-
- # Create new paragraph for each bullet item
- p = text_frame.add_paragraph()
-
- # Set level to 1 for bullet points BEFORE setting text
- # In python-pptx, setting level > 0 should automatically enable bullets
- p.level = 1
-
- # Set text content
- p.text = item_text
-
- # Apply formatting - use cached objects
- p.font.size = font_size_pt
- p.font.color.rgb = text_color_rgb
- p.alignment = PP_ALIGN.LEFT # Left align bullet lists
- p.space_before = space_before_pt # Small spacing before
- p.space_after = space_after_pt # Small spacing after
-
- # In python-pptx, setting level > 0 should enable bullets automatically
- # However, some versions may not support paragraph_format, so we'll use manual bullets as fallback
- # Always add manual bullet character to ensure visibility
- if not (p.text.startswith('•') or p.text.startswith('-') or p.text.startswith('*') or p.text.startswith('◦')):
- p.text = '• ' + p.text
- logger.debug(f"Added manual bullet character to item {idx}")
-
- # Set proper indentation for multiline bullets (hanging indent)
- # For multiline bullets: bullet at left margin, text indented, wrapped lines align with text
- try:
- # Try accessing paragraph_format - it may not exist in all python-pptx versions
- if hasattr(p, 'paragraph_format'):
- pf = p.paragraph_format
- # Left indent: indents the entire paragraph (bullet + text)
- pf.left_indent = Pt(18)
- # First line indent: negative value creates hanging indent
- # This brings the bullet back to the left while keeping text indented
- pf.first_line_indent = Pt(-18) # Negative to create hanging indent
- logger.debug(f"Set hanging indent for bullet item {idx}")
- else:
- # Try via _element if paragraph_format not available
- try:
- from pptx.util import Pt as PtUtil
- pPr = p._element.get_or_add_pPr()
- # Set left margin (indents entire paragraph)
- pPr.left_margin = PtUtil(18)
- # Set first line indent (negative for hanging indent)
- pPr.first_line_indent = PtUtil(-18)
- logger.debug(f"Set hanging indent via XML for bullet item {idx}")
- except Exception as xml_error:
- logger.debug(f"Could not set hanging indent via XML: {str(xml_error)}")
- # Indentation is optional, continue without it
- pass
- except Exception as indent_error:
- logger.debug(f"Could not set indent for item {idx}: {str(indent_error)}")
- # Continue without indent - bullets will still show, but multiline won't be properly indented
-
- logger.debug(f"Successfully added bullet item {idx}: '{item_text[:50]}...'")
-
- except Exception as item_error:
- logger.error(f"Error adding bullet item {idx}: {str(item_error)}", exc_info=True)
- # Continue with next item even if one fails
+ for item in items:
+ itemText = item.get("text", "") if isinstance(item, dict) else str(item)
+ if not itemText or not itemText.strip():
continue
-
- logger.debug(f"Completed rendering bullet list, added {len(text_frame.paragraphs)} paragraphs")
+
+ p = text_frame.add_paragraph()
+ p.level = 0
+ p.alignment = PP_ALIGN.LEFT
+ p.space_before = Pt(2)
+ p.space_after = Pt(2)
+
+ # Consistent bullet prefix
+ self._addMarkdownInlineRuns(p, f" • {itemText}", fontSize=fontSize, fontColor=fontColor, fontBold=False)
+
+ # Subitems
+ if isinstance(item, dict):
+ for sub in item.get("subitems", []):
+ subText = sub.get("text", "") if isinstance(sub, dict) else str(sub)
+ if not subText:
+ continue
+ sp = text_frame.add_paragraph()
+ sp.level = 0
+ sp.alignment = PP_ALIGN.LEFT
+ sp.space_before = Pt(1)
+ sp.space_after = Pt(1)
+ self._addMarkdownInlineRuns(sp, f" – {subText}", fontSize=fontSize, fontColor=fontColor, fontBold=False)
except Exception as e:
logger.warning(f"Error adding bullet list to slide: {str(e)}")
@@ -1484,25 +1507,22 @@ JSON ONLY. NO OTHER TEXT."""
if text:
p = text_frame.add_paragraph()
- p.text = text
- # Headings should be level 0 (no indentation) regardless of heading level
p.level = 0
heading_style = styles.get("heading", {})
- # Different font sizes for different heading levels
if level == 1:
- base_font_size = heading_style.get("font_size", 28) # Largest for H1
+ base_font_size = heading_style.get("font_size", 28)
elif level == 2:
- base_font_size = heading_style.get("font_size", 22) # Medium for H2
+ base_font_size = heading_style.get("font_size", 22)
elif level == 3:
- base_font_size = heading_style.get("font_size", 18) # Smaller for H3
+ base_font_size = heading_style.get("font_size", 18)
else:
- base_font_size = heading_style.get("font_size", 16) # Default for H4+
+ base_font_size = heading_style.get("font_size", 16)
- calculated_size = max(12, int(base_font_size * font_size_multiplier)) # Minimum 12pt for headings
- p.font.size = Pt(calculated_size)
- p.font.bold = heading_style.get("bold", True)
- p.font.color.rgb = RGBColor(*self._getSafeColor(heading_style.get("color", (31, 78, 121))))
+ calculated_size = max(12, int(base_font_size * font_size_multiplier))
+ fSize = Pt(calculated_size)
+ fColor = RGBColor(*self._getSafeColor(heading_style.get("color", (31, 78, 121))))
+ self._addMarkdownInlineRuns(p, text, fontSize=fSize, fontColor=fColor, fontBold=True)
# Add spacing before and after headings
p.space_before = Pt(12 if level == 1 else 8) # More space before H1
p.space_after = Pt(6) # Space after heading
@@ -1528,11 +1548,8 @@ JSON ONLY. NO OTHER TEXT."""
if text:
p = text_frame.add_paragraph()
- p.text = text
- # Explicitly set level to 0 for regular paragraphs (not bullets)
p.level = 0
- # Ensure no bullet formatting
try:
if hasattr(p, 'paragraph_format'):
p.paragraph_format.bullet.type = None
@@ -1540,11 +1557,12 @@ JSON ONLY. NO OTHER TEXT."""
pass
paragraph_style = styles.get("paragraph", {})
- base_font_size = paragraph_style.get("font_size", 14) # Smaller default for better readability
- calculated_size = max(10, int(base_font_size * font_size_multiplier)) # Minimum 10pt for readability
- p.font.size = Pt(calculated_size)
- p.font.bold = paragraph_style.get("bold", False)
- p.font.color.rgb = RGBColor(*self._getSafeColor(paragraph_style.get("color", (47, 47, 47))))
+ base_font_size = paragraph_style.get("font_size", 14)
+ calculated_size = max(10, int(base_font_size * font_size_multiplier))
+ fSize = Pt(calculated_size)
+ fColor = RGBColor(*self._getSafeColor(paragraph_style.get("color", (47, 47, 47))))
+ fBold = paragraph_style.get("bold", False)
+ self._addMarkdownInlineRuns(p, text, fontSize=fSize, fontColor=fColor, fontBold=fBold)
# Add proper spacing
p.space_before = Pt(6) # Space before paragraph
@@ -1604,261 +1622,31 @@ JSON ONLY. NO OTHER TEXT."""
return datetime.now(UTC).strftime("%Y-%m-%d %H:%M:%S UTC")
def _renderSlideContentWithFrames(self, slide, slide_sections: List[Dict[str, Any]], slide_images: List[Dict[str, Any]], styles: Dict[str, Any], prs) -> None:
- """
- Organize slide content into frames for better layout.
- Groups content by type (images, bullet lists, paragraphs, tables) and renders each in appropriately sized frames.
- """
+ """Render all sections sequentially: text/bullets/headings into a shared
+ textbox, tables and images as separate shapes placed below."""
try:
from pptx.util import Inches, Pt
- from pptx.enum.text import PP_ALIGN
- from pptx.dml.color import RGBColor
-
- # Extract images from sections first
- images_to_render = list(slide_images) if slide_images else []
- text_sections = []
- table_sections = []
-
- for section in slide_sections:
- section_type = section.get("content_type", "paragraph")
- elements = section.get("elements", [])
-
- if not elements:
- # Skip empty sections
- continue
-
- # Extract images from all sections
- section_has_images = False
- for element in elements:
- if isinstance(element, dict) and element.get("type") == "image":
- content = element.get("content", {})
- base64Data = None
-
- # Handle different content formats
- if isinstance(content, dict):
- base64Data = content.get("base64Data")
- altText = content.get("altText", "Image")
- caption = content.get("caption", "")
- elif isinstance(content, str):
- # If content is a string, it might be base64 data directly
- # Check if it looks like base64
- if len(content) > 100 and all(c in "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/=" for c in content[:100]):
- base64Data = content
- altText = "Image"
- caption = ""
- else:
- # Not base64, skip
- continue
- else:
- # Try to get base64Data directly from element
- base64Data = element.get("base64Data")
- altText = element.get("altText", "Image")
- caption = element.get("caption", "")
-
- if base64Data:
- images_to_render.append({
- "base64Data": base64Data,
- "altText": altText,
- "caption": caption
- })
- section_has_images = True
-
- # Skip image-only sections (they're already added to images_to_render)
- if section_type == "image" and section_has_images:
- continue
-
- # Categorize sections (excluding image elements)
- has_table = False
- non_image_elements = []
-
- for element in elements:
- if isinstance(element, dict):
- element_type = element.get("type", "")
- # Skip image elements when categorizing
- if element_type == "image":
- continue
- if element_type == "table" or section_type == "table":
- has_table = True
- non_image_elements.append(element)
-
- # Only add sections that have non-image content
- if non_image_elements:
- if has_table:
- # Create a copy of section without image elements for table rendering
- table_section = {
- **section,
- "elements": non_image_elements
- }
- table_sections.append(table_section)
- else:
- # Create a copy of section without image elements for text rendering
- text_section = {
- **section,
- "elements": non_image_elements
- }
- text_sections.append(text_section)
-
- # Calculate layout dimensions
- title_height = Inches(1.5)
- available_height = prs.slide_height - title_height - Inches(0.5) # Title + margin
- available_width = prs.slide_width - Inches(1) # Margins
+
margin = Inches(0.5)
-
- current_y = title_height + Inches(0.3)
-
- # Determine layout strategy based on content types
- has_images = len(images_to_render) > 0
- has_tables = len(table_sections) > 0
- has_text = len(text_sections) > 0
-
- # Layout 1: Images + Text (horizontal split for landscape)
- if has_images and has_text and not has_tables:
- # Horizontal split: images on left, text on right (landscape format)
- img_width = available_width * 0.48
- text_width = available_width * 0.48
- img_left = margin
- text_left = margin + img_width + Inches(0.2)
-
- # Render images in left column (full height)
- if images_to_render:
- img_height = available_height - Inches(0.2)
- self._addImagesToSlideInFrame(slide, images_to_render, styles, img_left, current_y, img_width, img_height)
-
- # Render text in right column (full height, adaptive font size)
- if text_sections:
- text_height = available_height - Inches(0.2)
- self._renderTextSectionsInFrame(slide, text_sections, styles, text_left, current_y, text_width, text_height, adaptiveFontSize=True)
-
- # Layout 2: Tables + Text (horizontal split for landscape)
- elif has_tables and has_text:
- # Horizontal split: tables on left, text on right (landscape format)
- table_width = available_width * 0.48
- text_width = available_width * 0.48
- table_left = margin
- text_left = margin + table_width + Inches(0.2)
-
- # Render tables in left column (full height)
- table_y = current_y
- for table_section in table_sections:
- elements = table_section.get("elements", [])
- for element in elements:
- if isinstance(element, dict) and element.get("type") == "table":
- try:
- self._addTableToSlide(slide, element, styles, table_y, max_width=table_width)
- # Calculate actual table height
- content = element.get("content", {})
- if isinstance(content, dict):
- rows = content.get("rows", [])
- num_rows = len(rows) + 1 # +1 for header
- actual_height = Inches(0.4) * num_rows
- table_y += actual_height + Inches(0.15)
- else:
- table_y += Inches(2)
- except Exception as table_error:
- logger.error(f"Error rendering table: {str(table_error)}")
- # Continue with next table
- break
-
- # Render text in right column (full height, adaptive font size)
- if text_sections:
- text_height = available_height - Inches(0.2)
- self._renderTextSectionsInFrame(slide, text_sections, styles, text_left, current_y, text_width, text_height, adaptiveFontSize=True)
-
- # Layout 3: Images + Tables + Text (horizontal split for landscape)
- elif has_images and has_tables and has_text:
- # Horizontal split: Images (left), Tables (middle), Text (right)
- img_width = available_width * 0.31
- table_width = available_width * 0.31
- text_width = available_width * 0.31
- img_left = margin
- table_left = margin + img_width + Inches(0.15)
- text_left = margin + img_width + table_width + Inches(0.3)
-
- # Render images in left column (full height)
- if images_to_render:
- img_height = available_height - Inches(0.2)
- self._addImagesToSlideInFrame(slide, images_to_render, styles, img_left, current_y, img_width, img_height)
-
- # Render tables in middle column (full height)
- table_y = current_y
- for table_section in table_sections:
- elements = table_section.get("elements", [])
- for element in elements:
- if isinstance(element, dict) and element.get("type") == "table":
- try:
- self._addTableToSlide(slide, element, styles, table_y, max_width=table_width)
- content = element.get("content", {})
- if isinstance(content, dict):
- rows = content.get("rows", [])
- num_rows = len(rows) + 1
- actual_height = Inches(0.4) * num_rows
- table_y += actual_height + Inches(0.15)
- else:
- table_y += Inches(2)
- except Exception as table_error:
- logger.error(f"Error rendering table: {str(table_error)}")
- break
-
- # Render text in right column (full height, adaptive font size)
- if text_sections:
- text_height = available_height - Inches(0.2)
- self._renderTextSectionsInFrame(slide, text_sections, styles, text_left, current_y, text_width, text_height, adaptiveFontSize=True)
-
- # Layout 4: Images only
- elif has_images and not has_text and not has_tables:
- img_width = available_width * 0.8
- img_height = available_height * 0.8
- img_left = (available_width - img_width) / 2 + margin
- self._addImagesToSlideInFrame(slide, images_to_render, styles, img_left, current_y, img_width, img_height)
-
- # Layout 5: Text only (default, adaptive font size)
- elif has_text and not has_images and not has_tables:
- text_height = available_height - Inches(0.2)
- self._renderTextSectionsInFrame(slide, text_sections, styles, margin, current_y, available_width, text_height, adaptiveFontSize=True)
-
- # Layout 6: Tables only
- elif has_tables and not has_images and not has_text:
- table_height = available_height / max(len(table_sections), 1)
- table_width = available_width
- for table_section in table_sections:
- elements = table_section.get("elements", [])
- for element in elements:
- if isinstance(element, dict) and element.get("type") == "table":
- try:
- self._addTableToSlide(slide, element, styles, current_y, max_width=table_width)
- # Calculate actual table height
- content = element.get("content", {})
- if isinstance(content, dict):
- rows = content.get("rows", [])
- num_rows = len(rows) + 1 # +1 for header
- actual_height = min(Inches(0.4) * num_rows, table_height)
- current_y += actual_height + Inches(0.2)
- else:
- current_y += table_height + Inches(0.2)
- except Exception as table_error:
- logger.error(f"Error rendering table: {str(table_error)}")
- # Continue with next table
- break
-
- except Exception as e:
- logger.error(f"Error rendering slide content with frames: {str(e)}")
- # Fallback to simple rendering
- try:
- content_shape = slide.placeholders[1]
- text_frame = content_shape.text_frame
- text_frame.clear()
- except (AttributeError, IndexError):
- from pptx.util import Inches
- left = Inches(0.5)
- top = Inches(1.5)
- width = prs.slide_width - Inches(1)
- height = prs.slide_height - top - Inches(0.5)
- textbox = slide.shapes.add_textbox(left, top, width, height)
- text_frame = textbox.text_frame
- text_frame.word_wrap = True
-
- # Simple fallback rendering
+ contentTop = Inches(1.3)
+ availableWidth = prs.slide_width - Inches(1)
+ availableHeight = prs.slide_height - contentTop - Inches(0.3)
+
+ # Create a single textbox for all non-table, non-image content
+ textbox = slide.shapes.add_textbox(margin, contentTop, availableWidth, availableHeight)
+ textFrame = textbox.text_frame
+ textFrame.word_wrap = True
+ textFrame.auto_size = None
+
for section in slide_sections:
- self._renderSectionToTextFrame(slide, section, styles, text_frame, font_size_multiplier=1.0)
+ self._renderSectionToTextFrame(slide, section, styles, textFrame, font_size_multiplier=1.0)
+
+ # Render standalone images that were passed alongside sections
+ if slide_images:
+ self._addImagesToSlideInFrame(slide, slide_images, styles, margin, contentTop, availableWidth, availableHeight)
+
+ except Exception as e:
+ logger.error(f"Error rendering slide content: {str(e)}")
def _renderTextSectionsInFrame(self, slide, text_sections: List[Dict[str, Any]], styles: Dict[str, Any], left: float, top: float, width: float, height: float, adaptiveFontSize: bool = False) -> None:
"""Render text sections (paragraphs, lists, headings) in a text frame."""
@@ -1935,6 +1723,14 @@ JSON ONLY. NO OTHER TEXT."""
except Exception as e:
logger.warning(f"Error rendering text sections in frame: {str(e)}")
+    @staticmethod
+    def _isHorizontalRule(element: Dict[str, Any]) -> bool:
+        """Return True if the element's text is a markdown rule (---, ***, ___: 3+ of ONE marker, spaces allowed) to skip on slides."""
+        content = element.get("content", {})
+        text = content.get("text", "") if isinstance(content, dict) else content
+        markers = text.strip().replace(" ", "") if isinstance(text, str) else ""  # None / non-string text is never a rule
+        return len(markers) >= 3 and markers[0] in "-*_" and markers.count(markers[0]) == len(markers)
+
def _renderSectionToTextFrame(self, slide, section: Dict[str, Any], styles: Dict[str, Any], text_frame, font_size_multiplier: float = 1.0) -> None:
"""Render a single section to a text frame."""
try:
@@ -1942,7 +1738,7 @@ JSON ONLY. NO OTHER TEXT."""
from pptx.enum.text import PP_ALIGN
from pptx.dml.color import RGBColor
- section_type = section.get("content_type", "paragraph")
+ sectionType = section.get("content_type", "paragraph")
elements = section.get("elements", [])
if not elements:
@@ -1952,54 +1748,42 @@ JSON ONLY. NO OTHER TEXT."""
if not isinstance(element, dict):
continue
- element_type = element.get("type", "")
- if not element_type:
- element_type = section_type
-
- # Skip images - handled separately
- if element_type == "image":
+ elementType = element.get("type", "") or sectionType
+
+ if elementType == "image":
+ continue
+
+ # Skip horizontal rules (---, ***, ___)
+ if elementType == "paragraph" and self._isHorizontalRule(element):
continue
- if element_type == "bullet_list" or element_type == "list":
+ if elementType == "table":
+ self._addTableToSlide(slide, element, styles)
+ elif elementType in ("bullet_list", "list"):
self._addBulletListToSlide(slide, element, styles, text_frame, font_size_multiplier)
- elif element_type == "heading":
+ elif elementType == "heading":
self._addHeadingToSlide(slide, element, styles, text_frame, font_size_multiplier)
- elif element_type == "paragraph":
+ elif elementType == "paragraph":
self._addParagraphToSlide(slide, element, styles, text_frame, font_size_multiplier)
- elif element_type == "code_block" or element_type == "code":
+ elif elementType in ("code_block", "code"):
self._addCodeBlockToSlide(slide, element, styles, text_frame, font_size_multiplier)
- elif element_type == "extracted_text":
+ elif elementType == "extracted_text":
content = element.get("content", "")
- source = element.get("source", "")
if content:
- paragraph_style = styles.get("paragraph", {})
p = text_frame.add_paragraph()
- p.text = content
- base_font_size = paragraph_style.get("font_size", 18)
- p.font.size = Pt(int(base_font_size * font_size_multiplier))
- p.font.bold = paragraph_style.get("bold", False)
- p.font.color.rgb = RGBColor(*self._getSafeColor(paragraph_style.get("color", (47, 47, 47))))
+ pStyle = styles.get("paragraph", {})
+ fSize = Pt(max(10, int(pStyle.get("font_size", 14) * font_size_multiplier)))
+ fColor = RGBColor(*self._getSafeColor(pStyle.get("color", (47, 47, 47))))
+ self._addMarkdownInlineRuns(p, content, fontSize=fSize, fontColor=fColor)
p.alignment = PP_ALIGN.LEFT
- if source:
- p.add_run(f" (Source: {source})").font.italic = True
- elif element_type == "reference":
+ elif elementType == "reference":
label = element.get("label", "Reference")
p = text_frame.add_paragraph()
p.text = f"[Reference: {label}]"
p.font.italic = True
p.alignment = PP_ALIGN.LEFT
else:
- # Fallback to paragraph
- content = element.get("content", "")
- if isinstance(content, dict):
- text = content.get("text", "")
- elif isinstance(content, str):
- text = content
- else:
- text = ""
-
- if text:
- self._addParagraphToSlide(slide, element, styles, text_frame, font_size_multiplier=1.0)
+ self._addParagraphToSlide(slide, element, styles, text_frame, font_size_multiplier)
except Exception as e:
logger.warning(f"Error rendering section to text frame: {str(e)}")
diff --git a/tests/unit/services/test_renderer_pdf_smoke.py b/tests/unit/services/test_renderer_pdf_smoke.py
new file mode 100644
index 00000000..a3a3a78d
--- /dev/null
+++ b/tests/unit/services/test_renderer_pdf_smoke.py
@@ -0,0 +1,253 @@
+# Copyright (c) 2025 Patrick Motsch
+# All rights reserved.
+"""
+Smoke test: RendererPdf with every JSON section/element shape the pipeline supports.
+
+Canonical section types (datamodelJson.supportedSectionTypes): table, bullet_list, heading,
+paragraph, code_block, image.
+
+PDF renderer additionally handles element types: reference, extracted_text (Phase 5D).
+"""
+
+from __future__ import annotations
+
+from types import SimpleNamespace
+
+import pytest
+
+from modules.serviceCenter.services.serviceGeneration.renderers.rendererPdf import (
+ REPORTLAB_AVAILABLE,
+ RendererPdf,
+ _normalizePdfMonospaceText,
+ _prepareCodeBlockPlainText,
+)
+
+# Base64-encoded 1×1 transparent PNG; used as the image payload of the smoke document
+_MIN_PNG_B64 = (
+ "iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR42mP8z8BQDwAEhQGAhKmMIQAAAABJRU5ErkJggg=="
+)
+
+
+def _fakeServices():
+    """Build a stand-in services object exposing a no-op ``utils.debugLogToFile``."""
+    def _discardLog(msg, tag=None):
+        """Accept a debug log call and ignore it."""
+        return None
+    fakeUtils = SimpleNamespace(debugLogToFile=_discardLog)
+    return SimpleNamespace(utils=fakeUtils)
+
+
+def _fullDocumentJson() -> dict:
+ """Build one document whose sections cover every supported content_type, plus reference and extracted_text elements."""
+ return {
+ "metadata": {
+ "split_strategy": "single_document",
+ "source_documents": [],
+ "extraction_method": "smoke_test",
+ "title": "PDF Renderer Smoke",
+ "language": "de",
+ },
+ "documents": [
+ {
+ "id": "doc_smoke",
+ "title": "PDF Renderer Smoke",
+ "filename": "pdf_renderer_smoke.pdf",
+ "sections": [
+ {  # heading, level 1: inline bold plus a deliberately long line
+ "id": "sec_h1",
+ "content_type": "heading",
+ "order": 1,
+ "elements": [
+ {
+ "content": {
+ "text": "H1 with **bold** and a very long subtitle line that should wrap cleanly without overlapping",
+ "level": 1,
+ }
+ }
+ ],
+ },
+ {  # heading, level 2: inline italic and inline code
+ "id": "sec_h2",
+ "content_type": "heading",
+ "order": 2,
+ "elements": [{"content": {"text": "H2 *italic* and `inline code`", "level": 2}}],
+ },
+ {  # paragraph mixing asterisk, underscore and backtick inline markers
+ "id": "sec_para",
+ "content_type": "paragraph",
+ "order": 3,
+ "elements": [
+ {
+ "content": {
+ "text": (
+ "Paragraph: **strong**, *emphasis*, __under-like bold__, "
+ "_single underscores_, and `var = 1`."
+ )
+ }
+ }
+ ],
+ },
+ {  # bullet list with list_type "bullet"
+ "id": "sec_bullets",
+ "content_type": "bullet_list",
+ "order": 4,
+ "elements": [
+ {
+ "content": {
+ "items": [  # items mix a bare string and a {"text": ...} dict
+ "Bullet **one**",
+ {"text": "Bullet two with *italic*"},
+ ],
+ "list_type": "bullet",
+ }
+ }
+ ],
+ },
+ {  # same content_type as the bullets above, but list_type "numbered"
+ "id": "sec_numbered",
+ "content_type": "bullet_list",
+ "order": 5,
+ "elements": [
+ {
+ "content": {
+ "items": [{"text": "First numbered"}, {"text": "Second **numbered**"}],
+ "list_type": "numbered",
+ }
+ }
+ ],
+ },
+ {  # table: three headers, two rows, inline markers inside cells
+ "id": "sec_table",
+ "content_type": "table",
+ "order": 6,
+ "elements": [
+ {
+ "content": {
+ "headers": ["Col A", "Col B", "Col C"],
+ "rows": [
+ ["Short", "Medium length cell", "**Bold** in cell"],
+ ["R2", "Data", "`code`"],
+ ],
+ }
+ }
+ ],
+ },
+ {  # code block containing an ampersand and Unicode box-drawing characters
+ "id": "sec_code",
+ "content_type": "code_block",
+ "order": 7,
+ "elements": [
+ {
+ "content": {
+ "language": "python",
+ "code": (
+ 'def hello():\n print(" & ampersand")\n return 42\n'
+ "\n# tree (Unicode box drawing must not produce tofu in PDF)\n"
+ "Reports/\n\u251c\u2500\u2500 2025/\n\u2502 \u2514\u2500\u2500 file.txt\n"
+ ),
+ }
+ }
+ ],
+ },
+ {  # image section fed from the module-level base64 PNG constant
+ "id": "sec_image",
+ "content_type": "image",
+ "order": 8,
+ "elements": [
+ {
+ "content": {
+ "base64Data": _MIN_PNG_B64,
+ "altText": "Smoke pixel",
+ "caption": "Minimal PNG (1×1)",
+ }
+ }
+ ],
+ },
+ {  # element-level type "reference" inside a section declared as "paragraph"
+ "id": "sec_reference",
+ "content_type": "paragraph",
+ "order": 9,
+ "elements": [
+ {
+ "type": "reference",
+ "label": "External spec",
+ "documentReference": "urn:smoke:ref",
+ }
+ ],
+ },
+ {  # element-level type "extracted_text"; its content is a plain string, not a dict
+ "id": "sec_extracted",
+ "content_type": "paragraph",
+ "order": 10,
+ "elements": [
+ {
+ "type": "extracted_text",
+ "content": "Extracted **body** with formatting.",
+ "source": "fixture/source.md",
+ }
+ ],
+ },
+ ],
+ }
+ ],
+ }
+
+
+@pytest.mark.asyncio
+async def test_renderer_pdf_all_json_elements(tmp_path):
+    """Render the full smoke document and check that exactly one plausible PDF comes back."""
+    if not REPORTLAB_AVAILABLE:
+        pytest.skip("reportlab is not installed")
+    pdfRenderer = RendererPdf(services=_fakeServices())
+    rendered = await pdfRenderer.render(
+        extractedContent=_fullDocumentJson(),
+        title="PDF_Renderer_Smoke",
+        userPrompt=None,
+        aiService=None,
+    )
+    assert len(rendered) == 1
+    pdfDoc = rendered[0]
+    assert pdfDoc.mimeType == "application/pdf"
+    assert pdfDoc.documentData[:4] == b"%PDF"
+    assert pdfDoc.filename.endswith(".pdf")
+    # Persist the bytes so the size check runs against a real file on disk.
+    pdfFile = tmp_path / "pdf_renderer_smoke.pdf"
+    pdfFile.write_bytes(pdfDoc.documentData)
+    assert pdfFile.stat().st_size > 500
+
+
+def test_prepare_code_block_preserves_indentation_spaces():
+    """Multi-space indentation of code lines is kept verbatim, and tab characters never survive preparation."""
+    raw = "def x():\n    return 1\n  two leading on line"
+    assert "    return" in _prepareCodeBlockPlainText(raw) and "\t" not in _prepareCodeBlockPlainText("a\tb")
+
+
+def test_normalize_pdf_monospace_replaces_box_drawing():
+    """Box-drawing glyphs disappear during normalization while ordinary text is kept."""
+    normalized = _normalizePdfMonospaceText("\u2500\u2502\u251c\u2514\u252c\nReports/\n")
+    for glyph in ("\u2500", "\u2502"):
+        assert glyph not in normalized
+    assert "Reports/" in normalized
+
+
+def test_pdf_heading_font_sizes_strictly_decrease():
+    """Heading sizes shrink from H1 to H3, so H3 never inherits the H1 style (old bug: ## smaller than ###)."""
+    pdfRenderer = RendererPdf(services=_fakeServices())
+    defaults = pdfRenderer._getDefaultStyleSet()
+    h1, h2, h3 = (defaults[f"heading{lvl}"]["font_size"] for lvl in (1, 2, 3))
+    assert h1 > h2 > h3
+    assert pdfRenderer._defaultHeadingStyleDef(2)["font_size"] > pdfRenderer._defaultHeadingStyleDef(3)["font_size"]
+    if not REPORTLAB_AVAILABLE:
+        return
+    built = [pdfRenderer._createHeadingStyle(defaults, lvl).fontSize for lvl in (1, 2, 3)]
+    assert built[0] > built[1] > built[2]
+    withoutH3 = {key: defaults[key] for key in ("heading1", "heading2")}
+    assert pdfRenderer._createHeadingStyle(withoutH3, 3).fontSize < pdfRenderer._createHeadingStyle(withoutH3, 2).fontSize
+
+
+def test_inline_code_angle_brackets_escaped_in_font_span():
+    """Paths like `.../<Slug>/` inside inline code must be XML-escaped so they cannot break ReportLab markup in the Courier span."""
+    renderer = RendererPdf(services=_fakeServices())
+    xml = renderer._markdownInlineToReportlabXml("unter `Eingabe/<Slug>/` speichern")
+    assert 'name="Courier"' in xml
+    assert "&lt;Slug&gt;" in xml and "<Slug>" not in xml  # the raw tag must not reach the paragraph parser