Merge pull request #112 from valueonag/int

fixed rendering issues
This commit is contained in:
Patrick Motsch 2026-03-22 11:12:56 +01:00 committed by GitHub
commit d8cf4b993e
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
10 changed files with 1011 additions and 634 deletions

View file

@ -19,7 +19,11 @@ class DataSource(BaseModel):
connectionId: str = Field(description="FK to UserConnection") connectionId: str = Field(description="FK to UserConnection")
sourceType: str = Field(description="sharepointFolder, googleDriveFolder, outlookFolder, ftpFolder") sourceType: str = Field(description="sharepointFolder, googleDriveFolder, outlookFolder, ftpFolder")
path: str = Field(description="External path (e.g. '/sites/MySite/Documents/Reports')") path: str = Field(description="External path (e.g. '/sites/MySite/Documents/Reports')")
label: str = Field(description="User-visible label") label: str = Field(description="User-visible label (often the last path segment)")
displayPath: Optional[str] = Field(
default=None,
description="Human-readable full path for UI (connection-relative, slash-separated)",
)
featureInstanceId: Optional[str] = Field(default=None, description="Scoped to feature instance") featureInstanceId: Optional[str] = Field(default=None, description="Scoped to feature instance")
mandateId: Optional[str] = Field(default=None, description="Mandate scope") mandateId: Optional[str] = Field(default=None, description="Mandate scope")
userId: str = Field(default="", description="Owner user ID") userId: str = Field(default="", description="Owner user ID")
@ -37,6 +41,7 @@ registerModelLabels(
"sourceType": {"en": "Source Type", "de": "Quellentyp", "fr": "Type de source"}, "sourceType": {"en": "Source Type", "de": "Quellentyp", "fr": "Type de source"},
"path": {"en": "Path", "de": "Pfad", "fr": "Chemin"}, "path": {"en": "Path", "de": "Pfad", "fr": "Chemin"},
"label": {"en": "Label", "de": "Bezeichnung", "fr": "Libellé"}, "label": {"en": "Label", "de": "Bezeichnung", "fr": "Libellé"},
"displayPath": {"en": "Display path", "de": "Anzeigepfad", "fr": "Chemin affiché"},
"featureInstanceId": {"en": "Feature Instance", "de": "Feature-Instanz", "fr": "Instance de fonctionnalité"}, "featureInstanceId": {"en": "Feature Instance", "de": "Feature-Instanz", "fr": "Instance de fonctionnalité"},
"mandateId": {"en": "Mandate ID", "de": "Mandanten-ID", "fr": "ID du mandat"}, "mandateId": {"en": "Mandate ID", "de": "Mandanten-ID", "fr": "ID du mandat"},
"userId": {"en": "User ID", "de": "Benutzer-ID", "fr": "ID utilisateur"}, "userId": {"en": "User ID", "de": "Benutzer-ID", "fr": "ID utilisateur"},

View file

@ -1139,6 +1139,7 @@ class CreateDataSourceRequest(BaseModel):
sourceType: str = Field(description="Source type") sourceType: str = Field(description="Source type")
path: str = Field(description="Path") path: str = Field(description="Path")
label: str = Field(description="Label") label: str = Field(description="Label")
displayPath: Optional[str] = Field(default=None, description="Full human-readable path for tooltips")
@router.post("/{instanceId}/datasources") @router.post("/{instanceId}/datasources")
@ -1165,6 +1166,7 @@ async def createWorkspaceDataSource(
path=body.path, path=body.path,
label=body.label, label=body.label,
featureInstanceId=instanceId, featureInstanceId=instanceId,
displayPath=body.displayPath,
) )
return JSONResponse(dataSource if isinstance(dataSource, dict) else dataSource.model_dump()) return JSONResponse(dataSource if isinstance(dataSource, dict) else dataSource.model_dump())
@ -1214,7 +1216,7 @@ async def listFeatureConnections(
userMandates = rootIf.getUserMandates(userId) userMandates = rootIf.getUserMandates(userId)
if not userMandates: if not userMandates:
return JSONResponse({"featureConnections": []}) return JSONResponse({"featureConnectionsByMandate": []})
mandateLabels: dict = {} mandateLabels: dict = {}
for um in userMandates: for um in userMandates:
@ -1226,7 +1228,7 @@ async def listFeatureConnections(
except Exception: except Exception:
mandateLabels[um.mandateId] = um.mandateId mandateLabels[um.mandateId] = um.mandateId
items = [] byMandate: dict = {}
seenIds: set = set() seenIds: set = set()
for um in userMandates: for um in userMandates:
allInstances = rootIf.getFeatureInstancesByMandate(um.mandateId) allInstances = rootIf.getFeatureInstancesByMandate(um.mandateId)
@ -1244,20 +1246,33 @@ async def listFeatureConnections(
featureDef = catalog.getFeatureDefinition(inst.featureCode) or {} featureDef = catalog.getFeatureDefinition(inst.featureCode) or {}
dataObjects = catalog.getDataObjects(inst.featureCode) dataObjects = catalog.getDataObjects(inst.featureCode)
mLabel = mandateLabels.get(inst.mandateId, "")
label = inst.label or inst.featureCode label = inst.label or inst.featureCode
if mLabel: mid = inst.mandateId
label = f"{label} ({mLabel})" connItem = {
items.append({
"featureInstanceId": inst.id, "featureInstanceId": inst.id,
"featureCode": inst.featureCode, "featureCode": inst.featureCode,
"mandateId": inst.mandateId, "mandateId": mid,
"label": label, "label": label,
"icon": featureDef.get("icon", "mdi-database"), "icon": featureDef.get("icon", "mdi-database"),
"tableCount": len(dataObjects), "tableCount": len(dataObjects),
}) }
if mid not in byMandate:
byMandate[mid] = []
byMandate[mid].append(connItem)
return JSONResponse({"featureConnections": items}) def _sortKeyLabel(x: dict) -> str:
return (x.get("label") or "").lower()
groups = []
for mid in sorted(byMandate.keys(), key=lambda m: (mandateLabels.get(m, m) or "").lower()):
conns = sorted(byMandate[mid], key=_sortKeyLabel)
groups.append({
"mandateId": mid,
"mandateLabel": mandateLabels.get(mid, mid),
"featureConnections": conns,
})
return JSONResponse({"featureConnectionsByMandate": groups})
@router.get("/{instanceId}/feature-connections/{fiId}/tables") @router.get("/{instanceId}/feature-connections/{fiId}/tables")

View file

@ -332,6 +332,7 @@ class AiObjects:
errorCount=0, errorCount=0,
toolCalls=responseToolCalls toolCalls=responseToolCalls
) )
response._modelMaxTokens = model.maxTokens
if self.billingCallback: if self.billingCallback:
try: try:
@ -470,6 +471,7 @@ class AiObjects:
errorCount=0, errorCount=0,
toolCalls=responseToolCalls, toolCalls=responseToolCalls,
) )
response._modelMaxTokens = model.maxTokens
if self.billingCallback: if self.billingCallback:
try: try:

View file

@ -276,6 +276,7 @@ async def runAgentLoop(
"userId": userId, "userId": userId,
"featureInstanceId": featureInstanceId, "featureInstanceId": featureInstanceId,
"mandateId": mandateId, "mandateId": mandateId,
"modelMaxOutputTokens": getattr(aiResponse, "_modelMaxTokens", None) or 0,
}) })
state.totalToolCalls += len(results) state.totalToolCalls += len(results)
@ -439,6 +440,29 @@ def _repairTruncatedJson(raw: str) -> Optional[Dict[str, Any]]:
return None return None
def _validateRepairedToolArgs(toolName: str, args: Dict[str, Any]) -> Optional[str]:
    """Check that tool arguments recovered from truncated JSON are still usable.

    A truncated payload can be closed and parsed successfully and yet be missing
    the fields the tool needs, because the cut happened before they were emitted.
    Only ``renderDocument`` is inspected: it needs a non-blank string in either
    ``content`` or ``sourceFileId``.

    Args:
        toolName: Name of the tool the repaired arguments belong to.
        args: The repaired, already parsed argument mapping.

    Returns:
        A message to be used as ``_parseError`` when the arguments are unusable,
        otherwise ``None``.
    """
    if toolName != "renderDocument":
        return None

    def _isNonBlankString(value: Any) -> bool:
        # Non-string values (None, numbers, dicts) never count as usable input.
        return isinstance(value, str) and bool(value.strip())

    if _isNonBlankString(args.get("content")) or _isNonBlankString(args.get("sourceFileId")):
        return None

    # Neither an inline body nor a file reference survived the truncation;
    # report the real cause instead of letting the tool fail with a generic error.
    return (
        "Your tool call JSON was repaired after truncation, but neither `content` nor `sourceFileId` is usable. "
        "Large documents must not be inlined in the tool call (output limit).\n"
        "Preferred: writeFile(mode='create') + writeFile(mode='append') to build a .md file, then "
        "renderDocument(sourceFileId=<that file id>, outputFormat='pdf', title='...') — the tool call stays small.\n"
        "Alternatives: replaceInFile for edits; shorter outline first."
    )
def _parseToolCalls(aiResponse: AiCallResponse) -> List[ToolCallRequest]: def _parseToolCalls(aiResponse: AiCallResponse) -> List[ToolCallRequest]:
"""Parse tool calls from AI response. Supports native function calling and text-based fallback.""" """Parse tool calls from AI response. Supports native function calling and text-based fallback."""
toolCalls = [] toolCalls = []
@ -457,14 +481,20 @@ def _parseToolCalls(aiResponse: AiCallResponse) -> List[ToolCallRequest]:
logger.warning(f"Unrecoverable truncated JSON for '{tc['function']['name']}': {rawArgs[:200]}") logger.warning(f"Unrecoverable truncated JSON for '{tc['function']['name']}': {rawArgs[:200]}")
parsedArgs = {"_parseError": ( parsedArgs = {"_parseError": (
"Your tool call arguments were truncated (output cut off by token limit). " "Your tool call arguments were truncated (output cut off by token limit). "
"The content is too large for a single tool call. Strategies:\n" "Do not put the full document body in renderDocument JSON.\n"
"1. For new files: use writeFile(mode='create') with the first part, " "1. writeFile(create) + writeFile(append) to a .md file, then "
"then writeFile(fileId=..., mode='append') for subsequent parts (~8000 chars each).\n" "renderDocument(sourceFileId=<file id>, outputFormat=..., title=...) — tiny tool call.\n"
"2. For editing existing files: use replaceInFile to change only the specific parts.\n" "2. Or replaceInFile for targeted edits.\n"
"3. For documentation: split into multiple smaller files." "3. Or split into multiple smaller files."
)} )}
else: else:
logger.info(f"Repaired truncated JSON for '{tc['function']['name']}'") logger.info(f"Repaired truncated JSON for '{tc['function']['name']}'")
repairIssue = _validateRepairedToolArgs(tc["function"]["name"], parsedArgs)
if repairIssue:
logger.warning(
f"Repaired JSON for '{tc['function']['name']}' still invalid for execution: {repairIssue[:80]}..."
)
parsedArgs = {"_parseError": repairIssue}
else: else:
parsedArgs = rawArgs if rawArgs else {} parsedArgs = rawArgs if rawArgs else {}
toolCalls.append(ToolCallRequest( toolCalls.append(ToolCallRequest(

View file

@ -259,7 +259,9 @@ class AgentService:
"Use `readFile(fileId)` to read text content, `readContentObjects(fileId)` for structured access, " "Use `readFile(fileId)` to read text content, `readContentObjects(fileId)` for structured access, "
"or `describeImage(fileId)` for image analysis.\n" "or `describeImage(fileId)` for image analysis.\n"
"For folders, use `listFiles(folderId)` to get the files inside, then `readFile(fileId)` for each.\n" "For folders, use `listFiles(folderId)` to get the files inside, then `readFile(fileId)` for each.\n"
"When generating documents with `renderDocument`, embed images using `![alt text](file:fileId)` in the markdown content.\n\n" "For large PDFs/DOCX, avoid huge `renderDocument` tool JSON: build markdown with "
"`writeFile` (create + append), then `renderDocument(sourceFileId=that file id, outputFormat=...)`.\n"
"For small docs you may pass `content` inline. Embed images with `![alt](file:fileId)` in markdown.\n\n"
) )
header += "\n\n".join(fileDescriptions) header += "\n\n".join(fileDescriptions)
return f"{header}\n\n---\n\nUser request: {prompt}" return f"{header}\n\n---\n\nUser request: {prompt}"
@ -2209,13 +2211,75 @@ def _registerCoreTools(registry: ToolRegistry, services):
async def _renderDocument(args: Dict[str, Any], context: Dict[str, Any]): async def _renderDocument(args: Dict[str, Any], context: Dict[str, Any]):
"""Render agent-produced markdown content into any document format via the RendererRegistry.""" """Render agent-produced markdown content into any document format via the RendererRegistry."""
import re as _re import re as _re
sourceFileId = (args.get("sourceFileId") or "").strip()
content = args.get("content", "") content = args.get("content", "")
if not isinstance(content, str):
content = str(content) if content is not None else ""
outputFormat = args.get("outputFormat", "pdf") outputFormat = args.get("outputFormat", "pdf")
title = args.get("title", "Document") title = args.get("title", "Document")
language = args.get("language", "de") language = args.get("language", "de")
if not content: if sourceFileId:
return ToolResult(toolCallId="", toolName="renderDocument", success=False, error="content is required") try:
dbMgmt = services.chat.interfaceDbComponent
fileRow = dbMgmt.getFile(sourceFileId)
if not fileRow:
return ToolResult(
toolCallId="",
toolName="renderDocument",
success=False,
error=f"sourceFileId not found: {sourceFileId}",
)
rawBytes = dbMgmt.getFileData(sourceFileId)
if not rawBytes:
return ToolResult(
toolCallId="",
toolName="renderDocument",
success=False,
error=f"sourceFileId has no data: {sourceFileId}",
)
try:
content = rawBytes.decode("utf-8")
except UnicodeDecodeError:
content = rawBytes.decode("latin-1", errors="replace")
except Exception as e:
return ToolResult(
toolCallId="",
toolName="renderDocument",
success=False,
error=f"Could not read sourceFileId: {e}",
)
if not (content or "").strip():
return ToolResult(
toolCallId="",
toolName="renderDocument",
success=False,
error=(
"Provide non-empty `content` (markdown) or `sourceFileId` (id of a .md/.txt from writeFile). "
"For long documents use writeFile create+append, then renderDocument(sourceFileId=...)."
),
)
modelMaxTokens = context.get("modelMaxOutputTokens", 0)
_inlineCharLimit = int(modelMaxTokens * 3 * 0.5) if modelMaxTokens > 0 else 6000
_inlineCharLimit = max(_inlineCharLimit, 3000)
if not sourceFileId and len(content) > _inlineCharLimit:
return ToolResult(
toolCallId="",
toolName="renderDocument",
success=False,
error=(
f"Inline `content` is {len(content)} chars — over the {_inlineCharLimit} char limit "
f"(derived from model output budget of {modelMaxTokens} tokens). "
"Large documents must use the file path:\n"
"1. writeFile(mode='create', name='draft.md', content=<first ~5000 chars>)\n"
"2. writeFile(mode='append', fileId=<id>, content=<next chunk>) — repeat as needed\n"
"3. renderDocument(sourceFileId=<id>, outputFormat='pdf', title='...')\n"
"This avoids output truncation entirely."
),
)
try: try:
structuredContent = _markdownToDocumentJson(content, title, language) structuredContent = _markdownToDocumentJson(content, title, language)
@ -2321,20 +2385,26 @@ def _registerCoreTools(registry: ToolRegistry, services):
registry.register( registry.register(
"renderDocument", _renderDocument, "renderDocument", _renderDocument,
description=( description=(
"Render markdown content into a document file (PDF, DOCX, XLSX, PPTX, CSV, HTML, MD, JSON, TXT). " "Render markdown into a document file (PDF, DOCX, XLSX, PPTX, CSV, HTML, MD, JSON, TXT). "
"You write the full document content as markdown, then this tool converts and renders it. " "For long documents: write markdown with writeFile (mode=create then append chunks), then call this tool with "
"To embed images from uploaded files, use markdown image syntax with the file ID: ![alt text](file:fileId). " "`sourceFileId` only (tiny JSON — avoids model output truncation). For short docs you may pass `content` inline. "
"The images will be resolved from the Knowledge Store and embedded in the output document." "Images: ![alt text](file:fileId) in the markdown."
), ),
parameters={ parameters={
"type": "object", "type": "object",
"properties": { "properties": {
"content": {"type": "string", "description": "Full document content as markdown (headings, tables, lists, code blocks, paragraphs, images via ![alt](file:fileId))"}, "content": {
"type": "string",
"description": "Full markdown inline. Prefer `sourceFileId` when the document is large (many KB).",
},
"sourceFileId": {
"type": "string",
"description": "Chat file id of markdown saved via writeFile (create+append). Use this instead of `content` for long PDFs.",
},
"outputFormat": {"type": "string", "description": "Target format: pdf, docx, xlsx, pptx, csv, html, md, json, txt", "default": "pdf"}, "outputFormat": {"type": "string", "description": "Target format: pdf, docx, xlsx, pptx, csv, html, md, json, txt", "default": "pdf"},
"title": {"type": "string", "description": "Document title", "default": "Document"}, "title": {"type": "string", "description": "Document title", "default": "Document"},
"language": {"type": "string", "description": "Document language (ISO 639-1)", "default": "de"}, "language": {"type": "string", "description": "Document language (ISO 639-1)", "default": "de"},
}, },
"required": ["content"],
}, },
readOnly=False, readOnly=False,
) )

View file

@ -508,7 +508,7 @@ class ChatService:
def createDataSource( def createDataSource(
self, connectionId: str, sourceType: str, path: str, label: str, self, connectionId: str, sourceType: str, path: str, label: str,
featureInstanceId: str = None featureInstanceId: str = None, displayPath: str = None,
) -> Dict[str, Any]: ) -> Dict[str, Any]:
"""Create a new external data source reference.""" """Create a new external data source reference."""
from modules.datamodels.datamodelDataSource import DataSource from modules.datamodels.datamodelDataSource import DataSource
@ -517,6 +517,7 @@ class ChatService:
sourceType=sourceType, sourceType=sourceType,
path=path, path=path,
label=label, label=label,
displayPath=displayPath,
featureInstanceId=featureInstanceId or self._context.feature_instance_id or "", featureInstanceId=featureInstanceId or self._context.feature_instance_id or "",
mandateId=self._context.mandate_id or "", mandateId=self._context.mandate_id or "",
userId=self.user.id if self.user else "", userId=self.user.id if self.user else "",

View file

@ -281,7 +281,7 @@ class RendererDocx(BaseRenderer):
def _getDefaultStyleSet(self) -> Dict[str, Any]: def _getDefaultStyleSet(self) -> Dict[str, Any]:
"""Default DOCX style set - used when no style instructions present.""" """Default DOCX style set - used when no style instructions present."""
return { return {
"title": {"font_size": 24, "color": "#1F4E79", "bold": True, "align": "center"}, "title": {"font_size": 24, "color": "#1F4E79", "bold": True, "align": "left"},
"heading1": {"font_size": 18, "color": "#2F2F2F", "bold": True, "align": "left"}, "heading1": {"font_size": 18, "color": "#2F2F2F", "bold": True, "align": "left"},
"heading2": {"font_size": 14, "color": "#4F4F4F", "bold": True, "align": "left"}, "heading2": {"font_size": 14, "color": "#4F4F4F", "bold": True, "align": "left"},
"paragraph": {"font_size": 11, "color": "#2F2F2F", "bold": False, "align": "left"}, "paragraph": {"font_size": 11, "color": "#2F2F2F", "bold": False, "align": "left"},
@ -349,11 +349,11 @@ class RendererDocx(BaseRenderer):
para.runs[0].italic = True para.runs[0].italic = True
continue continue
elif element_type == "extracted_text": elif element_type == "extracted_text":
# Extracted text format - render as paragraph
content = element.get("content", "") content = element.get("content", "")
source = element.get("source", "") source = element.get("source", "")
if content: if content:
para = doc.add_paragraph(content) para = doc.add_paragraph()
self._addMarkdownInlineRuns(para, content)
if source: if source:
para.add_run(f" (Source: {source})").italic = True para.add_run(f" (Source: {source})").italic = True
continue continue
@ -406,6 +406,37 @@ class RendererDocx(BaseRenderer):
# Add error paragraph as fallback # Add error paragraph as fallback
error_para = doc.add_paragraph(f"[Error rendering section: {str(e)}]") error_para = doc.add_paragraph(f"[Error rendering section: {str(e)}]")
# ── Markdown inline → python-docx runs ──────────────────────────────
# One alternation that recognises the supported inline markdown spans.
# The group numbers are fixed by the order of the alternatives below, so any
# code that reads groups 2/4/5/6/7 must be updated if an alternative is added,
# removed or reordered.
# The lookarounds on the single-asterisk form keep it from matching next to
# another `*` (i.e. inside `**`); the lookarounds on the single-underscore form
# reject underscores that touch a word character or `/`, so identifiers such as
# snake_case and slash-separated paths are left as plain text.
_MD_INLINE_RE = re.compile(
    r"(\*\*(.+?)\*\*)" # group 1,2: bold
    r"|(__(.+?)__)" # group 3,4: bold (underscore)
    r"|(?<!\*)\*([^*\n]+?)\*(?!\*)" # group 5: italic
    r"|(?<![\w/])_([^_\n]+?)_(?![\w/])" # group 6: italic (underscore)
    r"|`([^`]+)`" # group 7: inline code
)
def _addMarkdownInlineRuns(self, paragraph, text: str) -> None:
    """Append ``text`` to ``paragraph`` as runs, honouring inline markdown.

    Every match of ``self._MD_INLINE_RE`` becomes its own run: groups 2 and 4
    are added bold, groups 5 and 6 italic, and group 7 in 9pt "Courier New".
    Text between and after matches is added as unstyled runs, so the markers
    themselves never reach the document.

    Args:
        paragraph: Object exposing ``add_run(text)`` (a python-docx paragraph).
        text: Source text that may contain inline markdown.
    """
    cursor = 0
    for match in self._MD_INLINE_RE.finditer(text):
        start, end = match.span()
        # Plain text between the previous match and this one.
        if start > cursor:
            paragraph.add_run(text[cursor:start])

        # Only one alternative of the pattern can match, so at most one of
        # these three values is non-empty.
        boldText = match.group(2) or match.group(4)
        italicText = match.group(5) or match.group(6)
        codeText = match.group(7)

        if boldText:
            paragraph.add_run(boldText).bold = True
        elif italicText:
            paragraph.add_run(italicText).italic = True
        elif codeText:
            codeRun = paragraph.add_run(codeText)
            codeRun.font.name = "Courier New"
            codeRun.font.size = Pt(9)
        cursor = end

    # Trailing plain text after the last match (or the whole text if none matched).
    if cursor < len(text):
        paragraph.add_run(text[cursor:])
def _renderJsonTable(self, doc: Document, table_data: Dict[str, Any], styles: Dict[str, Any]) -> None: def _renderJsonTable(self, doc: Document, table_data: Dict[str, Any], styles: Dict[str, Any]) -> None:
""" """
Render a JSON table to DOCX using AI-generated styles. Render a JSON table to DOCX using AI-generated styles.
@ -480,9 +511,8 @@ class RendererDocx(BaseRenderer):
tblW.set(qn('w:w'), '0') tblW.set(qn('w:w'), '0')
tblPr.append(tblW) tblPr.append(tblW)
# Center alignment
jc = OxmlElement('w:jc') jc = OxmlElement('w:jc')
jc.set(qn('w:val'), 'center') jc.set(qn('w:val'), 'left')
tblPr.append(jc) tblPr.append(jc)
# Apply table borders directly (works without template styles) # Apply table borders directly (works without template styles)
@ -821,10 +851,11 @@ class RendererDocx(BaseRenderer):
text_color_rgb = RGBColor(int(color_hex[0:2], 16), int(color_hex[2:4], 16), int(color_hex[4:6], 16)) text_color_rgb = RGBColor(int(color_hex[0:2], 16), int(color_hex[2:4], 16), int(color_hex[4:6], 16))
for item in items: for item in items:
if isinstance(item, str): itemText = item if isinstance(item, str) else (item.get("text", "") if isinstance(item, dict) else "")
para = doc.add_paragraph(item, style='List Bullet') if not itemText:
elif isinstance(item, dict) and "text" in item: continue
para = doc.add_paragraph(item["text"], style='List Bullet') para = doc.add_paragraph(style='List Bullet')
self._addMarkdownInlineRuns(para, itemText)
# Apply bullet list styling from style set - use cached objects # Apply bullet list styling from style set - use cached objects
if bullet_style and para.runs: if bullet_style and para.runs:
@ -849,7 +880,6 @@ class RendererDocx(BaseRenderer):
def _renderJsonHeading(self, doc: Document, heading_data: Dict[str, Any], styles: Dict[str, Any]) -> None: def _renderJsonHeading(self, doc: Document, heading_data: Dict[str, Any], styles: Dict[str, Any]) -> None:
"""Render a JSON heading to DOCX using AI-generated styles.""" """Render a JSON heading to DOCX using AI-generated styles."""
try: try:
# Extract from nested content structure
content = heading_data.get("content", {}) content = heading_data.get("content", {})
if not isinstance(content, dict): if not isinstance(content, dict):
return return
@ -858,13 +888,13 @@ class RendererDocx(BaseRenderer):
if text: if text:
level = max(1, min(6, level)) level = max(1, min(6, level))
# Use custom heading style if available, otherwise use built-in # python-docx supports Heading 1 Heading 9 as built-in styles
style_name = f"Heading {level}" if level <= 2 else "Heading 1"
try: try:
para = doc.add_paragraph(text, style=style_name) para = doc.add_heading("", level=level)
except KeyError: para.clear()
# Fallback to built-in heading if custom style doesn't exist self._addMarkdownInlineRuns(para, text)
doc.add_heading(text, level=level) except (KeyError, ValueError):
para = doc.add_paragraph(text)
except Exception as e: except Exception as e:
self.logger.warning(f"Error rendering heading: {str(e)}") self.logger.warning(f"Error rendering heading: {str(e)}")
@ -893,8 +923,8 @@ class RendererDocx(BaseRenderer):
return return
if text: if text:
para = doc.add_paragraph(text) para = doc.add_paragraph()
# Apply paragraph styling from style set - OPTIMIZED: pre-calculate style objects self._addMarkdownInlineRuns(para, text)
paragraph_style = styles.get("paragraph", {}) paragraph_style = styles.get("paragraph", {})
if paragraph_style: if paragraph_style:
# Pre-calculate and cache style objects # Pre-calculate and cache style objects
@ -1345,7 +1375,7 @@ class RendererDocx(BaseRenderer):
# Create table # Create table
table = doc.add_table(rows=len(table_data), cols=len(table_data[0])) table = doc.add_table(rows=len(table_data), cols=len(table_data[0]))
table.alignment = WD_TABLE_ALIGNMENT.CENTER table.alignment = WD_TABLE_ALIGNMENT.LEFT
# Add data to table # Add data to table
for row_idx, row_data in enumerate(table_data): for row_idx, row_data in enumerate(table_data):

View file

@ -4,6 +4,10 @@
PDF renderer for report generation using reportlab. PDF renderer for report generation using reportlab.
""" """
from __future__ import annotations
import unicodedata
from .documentRendererBaseTemplate import BaseRenderer from .documentRendererBaseTemplate import BaseRenderer
from modules.datamodels.datamodelDocument import RenderedDocument from modules.datamodels.datamodelDocument import RenderedDocument
from typing import Dict, Any, List, Optional from typing import Dict, Any, List, Optional
@ -11,8 +15,8 @@ import io
import base64 import base64
try: try:
from reportlab.lib.pagesizes import letter, A4 from reportlab.lib.pagesizes import A4
from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, Table, TableStyle, PageBreak from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, Table, TableStyle, Preformatted
from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
from reportlab.lib.units import inch from reportlab.lib.units import inch
from reportlab.lib import colors from reportlab.lib import colors
@ -21,6 +25,53 @@ try:
except ImportError: except ImportError:
REPORTLAB_AVAILABLE = False REPORTLAB_AVAILABLE = False
import re as _re_pdf
# A4 width in pt; margins must match SimpleDocTemplate(leftMargin/rightMargin)
_PDF_MARGIN_LR_PT = 72.0
_PDF_A4_WIDTH_PT = 595.27
_PDF_CONTENT_WIDTH_PT = _PDF_A4_WIDTH_PT - (2 * _PDF_MARGIN_LR_PT)
def _boxDrawingCharToAscii(ch: str) -> str:
    """Return an ASCII stand-in for one box-drawing character (U+2500-U+257F).

    The decision is made from the character's Unicode name: a purely vertical
    line becomes ``|``, a purely horizontal line becomes ``-``, and everything
    else (junctions, corners, names containing "AND", unnamed characters)
    becomes ``+``.
    """
    charName = unicodedata.name(ch, "")
    isVertical = "VERTICAL" in charName
    isHorizontal = "HORIZONTAL" in charName

    # Compound names ("... AND ...") are junctions or corners; a name that is
    # both or neither vertical/horizontal has no single line direction either.
    if "AND" in charName or isVertical == isHorizontal:
        return "+"
    return "|" if isVertical else "-"
def _normalizePdfMonospaceText(text: str) -> str:
    """Swap Unicode box-drawing and block-element characters for ASCII.

    Characters in U+2500-U+257F are mapped through ``_boxDrawingCharToAscii``,
    characters in U+2580-U+259F become ``#``, and everything else is kept
    unchanged. An empty or falsy ``text`` yields an empty string.
    """
    if not text:
        return ""

    def _toAscii(ch: str) -> str:
        codePoint = ord(ch)
        if 0x2500 <= codePoint <= 0x257F:
            return _boxDrawingCharToAscii(ch)
        if 0x2580 <= codePoint <= 0x259F:
            return "#"
        return ch

    return "".join(_toAscii(ch) for ch in text)
def _prepareCodeBlockPlainText(text: str) -> str:
    """Return ``text`` with LF-only line endings and tabs expanded to 4-column stops.

    No escaping or markup is applied; whitespace is kept as-is apart from the
    newline and tab normalisation. An empty or falsy ``text`` yields "".
    """
    # CRLF must be collapsed before lone CR so a Windows line ending does not
    # turn into two newlines.
    return text.replace("\r\n", "\n").replace("\r", "\n").expandtabs(4) if text else ""
class RendererPdf(BaseRenderer): class RendererPdf(BaseRenderer):
"""Renders content to PDF format using reportlab.""" """Renders content to PDF format using reportlab."""
@ -122,15 +173,6 @@ class RendererPdf(BaseRenderer):
# Extract sections and metadata from standardized schema # Extract sections and metadata from standardized schema
sections = self._extractSections(json_content) sections = self._extractSections(json_content)
metadata = self._extractMetadata(json_content)
# Use provided title (which comes from documents[].title) as primary source
# Fallback to metadata.title only if title parameter is empty
document_title = title if title else metadata.get("title", "Generated Document")
# Make title shorter to prevent wrapping/overlapping
if len(document_title) > 40:
document_title = "PowerOn - Consent Agreement"
# Create a buffer to hold the PDF # Create a buffer to hold the PDF
buffer = io.BytesIO() buffer = io.BytesIO()
@ -145,17 +187,9 @@ class RendererPdf(BaseRenderer):
bottomMargin=18 bottomMargin=18
) )
# Build PDF content # Build PDF content (no cover page — body starts on page 1; filename still uses `title`)
story = [] story = []
# Title page
title_style = self._createTitleStyle(styles)
story.append(Paragraph(document_title, title_style))
story.append(Spacer(1, 50)) # Increased spacing to prevent overlap
story.append(Paragraph(f"Generated: {self._formatTimestamp()}", self._createNormalStyle(styles)))
story.append(Spacer(1, 30)) # Add spacing before page break
story.append(PageBreak())
# Process each section (sections already extracted above) # Process each section (sections already extracted above)
self.services.utils.debugLogToFile(f"PDF SECTIONS TO PROCESS: {len(sections)} sections", "PDF_RENDERER") self.services.utils.debugLogToFile(f"PDF SECTIONS TO PROCESS: {len(sections)} sections", "PDF_RENDERER")
for i, section in enumerate(sections): for i, section in enumerate(sections):
@ -164,10 +198,9 @@ class RendererPdf(BaseRenderer):
self.services.utils.debugLogToFile(f"PDF SECTION {i} ELEMENTS: {len(section_elements)} elements", "PDF_RENDERER") self.services.utils.debugLogToFile(f"PDF SECTION {i} ELEMENTS: {len(section_elements)} elements", "PDF_RENDERER")
story.extend(section_elements) story.extend(section_elements)
# Build PDF # Build PDF — retry with oversized flowables removed on LayoutError
doc.build(story) self._buildPdfWithOverflowGuard(doc, story, buffer)
# Get PDF content as base64
buffer.seek(0) buffer.seek(0)
pdf_bytes = buffer.getvalue() pdf_bytes = buffer.getvalue()
pdf_base64 = base64.b64encode(pdf_bytes).decode('utf-8') pdf_base64 = base64.b64encode(pdf_bytes).decode('utf-8')
@ -177,6 +210,42 @@ class RendererPdf(BaseRenderer):
except Exception as e: except Exception as e:
self.logger.error(f"Error generating PDF from JSON: {str(e)}") self.logger.error(f"Error generating PDF from JSON: {str(e)}")
raise Exception(f"PDF generation failed: {str(e)}") raise Exception(f"PDF generation failed: {str(e)}")
def _buildPdfWithOverflowGuard(self, doc, story: List[Any], buffer) -> None:
    """Build the PDF, swapping out oversized single-cell tables when layout fails.

    ``doc.build`` is attempted up to ``maxRetries + 1`` times. When it fails
    with a "too large on page" error, the first remaining 1x1 table in
    ``story`` is replaced by a short placeholder paragraph and the build is
    retried. Any other error, an overflow on the last attempt, or an overflow
    with no 1x1 table left to replace is re-raised unchanged.

    Args:
        doc: Document template exposing ``build(flowables)``.
        story: Flowables to render. Placeholders are written into this list
            in place; its length never changes.
        buffer: Seekable, truncatable output buffer the document writes to.
            It is reset before every attempt so no partial output remains.

    Raises:
        Exception: Whatever ``doc.build`` raised when no retry is possible.
    """
    maxRetries = 5
    for attempt in range(maxRetries + 1):
        try:
            buffer.seek(0)
            buffer.truncate()
            # build() removes flowables from the list it is handed while laying
            # them out. Passing a copy keeps `story` complete, so a retry
            # renders the whole document and the search below can still find
            # the flowable that overflowed.
            doc.build(list(story))
            return
        except Exception as e:
            msg = str(e)
            if "too large on page" not in msg or attempt == maxRetries:
                raise
            self.logger.warning(f"PDF overflow (attempt {attempt + 1}): {msg} — removing oversized element and retrying")

            # Heuristic: the first single-cell table still in the story is
            # treated as the oversized element and replaced.
            removed = False
            for idx, flowable in enumerate(story):
                if "Table" not in repr(flowable) or not hasattr(flowable, '_cellvalues'):
                    continue
                try:
                    cellValues = flowable._cellvalues
                    nRows = len(cellValues)
                    nCols = len(cellValues[0]) if cellValues else 0
                except (TypeError, IndexError):
                    # Unexpected cell layout: not a candidate, keep looking.
                    continue
                if nRows == 1 and nCols == 1:
                    story[idx] = Paragraph(
                        "[Code block omitted — content too large for PDF page]",
                        self._createNormalStyle({}),
                    )
                    removed = True
                    break
            if not removed:
                # Nothing left that can be dropped; surface the layout error.
                raise
async def _getStyleSet(self, extractedContent: Dict[str, Any] = None, userPrompt: str = None, aiService=None, templateName: str = None) -> Dict[str, Any]: async def _getStyleSet(self, extractedContent: Dict[str, Any] = None, userPrompt: str = None, aiService=None, templateName: str = None) -> Dict[str, Any]:
"""Get style set - use styles from document generation metadata if available, """Get style set - use styles from document generation metadata if available,
@ -269,13 +338,18 @@ class RendererPdf(BaseRenderer):
"""Default PDF style set - used when no style instructions present.""" """Default PDF style set - used when no style instructions present."""
return { return {
"title": {"font_size": 24, "color": "#1F4E79", "bold": True, "align": "center", "space_after": 30}, "title": {"font_size": 24, "color": "#1F4E79", "bold": True, "align": "center", "space_after": 30},
# Markdown #..###### — sizes must strictly decrease (H1 largest … H6 smallest).
"heading1": {"font_size": 18, "color": "#2F2F2F", "bold": True, "align": "left", "space_after": 12, "space_before": 12}, "heading1": {"font_size": 18, "color": "#2F2F2F", "bold": True, "align": "left", "space_after": 12, "space_before": 12},
"heading2": {"font_size": 14, "color": "#4F4F4F", "bold": True, "align": "left", "space_after": 8, "space_before": 8}, "heading2": {"font_size": 15, "color": "#2F2F2F", "bold": True, "align": "left", "space_after": 10, "space_before": 10},
"heading3": {"font_size": 13, "color": "#4F4F4F", "bold": True, "align": "left", "space_after": 8, "space_before": 8},
"heading4": {"font_size": 12, "color": "#4F4F4F", "bold": True, "align": "left", "space_after": 6, "space_before": 6},
"heading5": {"font_size": 11, "color": "#4F4F4F", "bold": True, "align": "left", "space_after": 6, "space_before": 6},
"heading6": {"font_size": 10, "color": "#4F4F4F", "bold": True, "align": "left", "space_after": 4, "space_before": 4},
"paragraph": {"font_size": 11, "color": "#2F2F2F", "bold": False, "align": "left", "space_after": 6, "line_height": 1.2}, "paragraph": {"font_size": 11, "color": "#2F2F2F", "bold": False, "align": "left", "space_after": 6, "line_height": 1.2},
"table_header": {"background": "#4F4F4F", "text_color": "#FFFFFF", "bold": True, "align": "center", "font_size": 12}, "table_header": {"background": "#4F4F4F", "text_color": "#FFFFFF", "bold": True, "align": "left", "font_size": 12},
"table_cell": {"background": "#FFFFFF", "text_color": "#2F2F2F", "bold": False, "align": "left", "font_size": 10}, "table_cell": {"background": "#FFFFFF", "text_color": "#2F2F2F", "bold": False, "align": "left", "font_size": 10},
"bullet_list": {"font_size": 11, "color": "#2F2F2F", "space_after": 3}, "bullet_list": {"font_size": 11, "color": "#2F2F2F", "space_after": 3},
"code_block": {"font": "Courier", "font_size": 9, "color": "#2F2F2F", "background": "#F5F5F5", "space_after": 6} "code_block": {"font": "Courier", "font_size": 9, "color": "#2F2F2F", "background": "#F5F5F5", "space_after": 6, "align": "left"}
} }
async def _getAiStylesWithPdfColors(self, ai_service, style_template: str, default_styles: Dict[str, Any]) -> Dict[str, Any]: async def _getAiStylesWithPdfColors(self, ai_service, style_template: str, default_styles: Dict[str, Any]) -> Dict[str, Any]:
@ -441,39 +515,35 @@ class RendererPdf(BaseRenderer):
return color_value return color_value
return default return default
def _defaultHeadingStyleDef(self, level: int) -> Dict[str, Any]:
def _createTitleStyle(self, styles: Dict[str, Any]) -> ParagraphStyle: """When heading{N} is missing from styles, never fall back to heading1 (that made H3 > H2)."""
"""Create title style from style definitions.""" sizes = {1: 18, 2: 15, 3: 13, 4: 12, 5: 11, 6: 10}
title_style_def = styles.get("title", {}) fs = sizes.get(level, 10)
sb = max(4, 14 - level)
# DEBUG: Show what color and spacing is being used for title return {
title_color = title_style_def.get("color", "#1F4E79") "font_size": fs,
title_space_after = title_style_def.get("space_after", 30) "color": "#2F2F2F" if level <= 2 else "#4F4F4F",
self.services.utils.debugLogToFile(f"PDF TITLE COLOR: {title_color} -> {self._hexToColor(title_color)}", "PDF_RENDERER") "bold": True,
self.services.utils.debugLogToFile(f"PDF TITLE SPACE_AFTER: {title_space_after}", "PDF_RENDERER") "align": "left",
"space_after": sb,
return ParagraphStyle( "space_before": sb,
'CustomTitle', }
fontSize=title_style_def.get("font_size", 20), # Reduced from 24 to 20
spaceAfter=title_style_def.get("space_after", 30),
alignment=self._getAlignment(title_style_def.get("align", "center")),
textColor=self._hexToColor(title_color),
leading=title_style_def.get("font_size", 20) * 1.4, # Add line spacing for multi-line titles
spaceBefore=0 # Ensure no space before title
)
def _createHeadingStyle(self, styles: Dict[str, Any], level: int) -> ParagraphStyle: def _createHeadingStyle(self, styles: Dict[str, Any], level: int) -> ParagraphStyle:
"""Create heading style from style definitions.""" """Create heading style from style definitions."""
heading_key = f"heading{level}" heading_key = f"heading{level}"
heading_style_def = styles.get(heading_key, styles.get("heading1", {})) heading_style_def = styles.get(heading_key) or self._defaultHeadingStyleDef(level)
fs = heading_style_def.get("font_size", self._defaultHeadingStyleDef(level)["font_size"])
bold = heading_style_def.get("bold", True)
return ParagraphStyle( return ParagraphStyle(
f'CustomHeading{level}', f'CustomHeading{level}',
fontSize=heading_style_def.get("font_size", 18 - level * 2), fontName="Helvetica-Bold" if bold else "Helvetica",
fontSize=fs,
spaceAfter=heading_style_def.get("space_after", 12), spaceAfter=heading_style_def.get("space_after", 12),
spaceBefore=heading_style_def.get("space_before", 12), spaceBefore=heading_style_def.get("space_before", 12),
alignment=self._getAlignment(heading_style_def.get("align", "left")), alignment=self._getAlignment(heading_style_def.get("align", "left")),
textColor=self._hexToColor(heading_style_def.get("color", "#2F2F2F")) textColor=self._hexToColor(heading_style_def.get("color", "#2F2F2F")),
leading=fs * 1.35,
) )
def _createNormalStyle(self, styles: Dict[str, Any]) -> ParagraphStyle: def _createNormalStyle(self, styles: Dict[str, Any]) -> ParagraphStyle:
@ -505,22 +575,6 @@ class RendererPdf(BaseRenderer):
} }
return align_map.get(align.lower().strip(), TA_LEFT) return align_map.get(align.lower().strip(), TA_LEFT)
def _getTableAlignment(self, align: str) -> str:
"""Convert alignment string to ReportLab table alignment string."""
if not align or not isinstance(align, str):
return 'LEFT'
align_map = {
"center": 'CENTER',
"left": 'LEFT',
"justify": 'LEFT', # Tables don't support justify, use LEFT
"right": 'RIGHT',
"0": 'LEFT', # Handle numeric strings
"1": 'CENTER',
"2": 'LEFT' # Tables don't support justify, use LEFT
}
return align_map.get(align.lower().strip(), 'LEFT')
def _hexToColor(self, hex_color: str) -> colors.Color: def _hexToColor(self, hex_color: str) -> colors.Color:
"""Convert hex color to reportlab color.""" """Convert hex color to reportlab color."""
try: try:
@ -542,7 +596,66 @@ class RendererPdf(BaseRenderer):
return colors.black return colors.black
except: except:
return colors.black return colors.black
def _escapeReportlabXml(self, text: str) -> str:
"""Escape text for ReportLab Paragraph markup."""
if not text:
return ""
return (
text.replace("&", "&amp;")
.replace("<", "&lt;")
.replace(">", "&gt;")
)
def _applyInlineMarkdownToEscapedPlain(self, text: str) -> str:
"""Escape XML then apply bold/italic to a segment with no `code` spans (code is handled separately)."""
if not text:
return ""
s = self._escapeReportlabXml(text)
s = _re_pdf.sub(r"\*\*(.+?)\*\*", r"<b>\1</b>", s, flags=_re_pdf.DOTALL)
s = _re_pdf.sub(r"__(.+?)__", r"<b>\1</b>", s, flags=_re_pdf.DOTALL)
s = _re_pdf.sub(r"(?<!\*)\*([^*\n]+?)\*(?!\*)", r"<i>\1</i>", s)
s = _re_pdf.sub(r"(?<![\w/])_([^_\n]+?)_(?![\w/])", r"<i>\1</i>", s)
return s
def _markdownInlineToReportlabXml(self, text: str) -> str:
    """Convert inline markdown (**bold**, *italic*, `code`) to ReportLab paragraph XML.

    Backtick spans are cut out first and emitted as escaped Courier text, so
    their content (for example paths such as `.../<Slug>/...`) is never
    touched by the bold/italic patterns. The text between code spans is
    escaped and then given bold/italic markup. Falsy input yields "".
    """
    if not text:
        return ""
    normalized = _normalizePdfMonospaceText(text)
    pieces: List[str] = []
    cursor = 0
    for codeSpan in _re_pdf.finditer(r"`([^`]*)`", normalized):
        spanStart, spanEnd = codeSpan.span()
        # Plain text preceding this code span.
        pieces.append(self._applyInlineMarkdownToEscapedPlain(normalized[cursor:spanStart]))
        pieces.append(f'<font name="Courier">{self._escapeReportlabXml(codeSpan.group(1))}</font>')
        cursor = spanEnd
    # Whatever follows the last code span (or the whole text if there was none).
    pieces.append(self._applyInlineMarkdownToEscapedPlain(normalized[cursor:]))
    return "".join(pieces)
def _paragraphFromInlineMarkdown(self, text: str, style: ParagraphStyle) -> Paragraph:
    """Build a Paragraph in ``style`` from ``text`` after converting its inline markdown to ReportLab XML."""
    markup = self._markdownInlineToReportlabXml(text)
    return Paragraph(markup, style)
def _createTableCellParagraphStyle(
    self, styles: Dict[str, Any], *, header: bool, tableStyleKey: str
) -> ParagraphStyle:
    """Create the left-aligned ParagraphStyle used for text inside a table cell.

    Font size and text colour are read from ``styles[tableStyleKey]``.
    Header cells default to 12pt white text and are bold unless the style
    sets ``bold`` to a falsy value; body cells default to 10pt dark grey and
    always use the regular face.
    """
    cellDef = styles.get(tableStyleKey, {})
    if header:
        variant, fallbackSize, fallbackColor = "H", 12, "#FFFFFF"
    else:
        variant, fallbackSize, fallbackColor = "B", 10, "#2F2F2F"
    fontSize = cellDef.get("font_size", fallbackSize)
    useBold = header and cellDef.get("bold", True)
    return ParagraphStyle(
        f"TblCell{variant}{tableStyleKey}",
        fontSize=fontSize,
        leading=fontSize * 1.25,
        alignment=TA_LEFT,
        textColor=self._hexToColor(cellDef.get("text_color", fallbackColor)),
        fontName="Helvetica-Bold" if useBold else "Helvetica",
    )
def _renderJsonSection(self, section: Dict[str, Any], styles: Dict[str, Any]) -> List[Any]: def _renderJsonSection(self, section: Dict[str, Any], styles: Dict[str, Any]) -> List[Any]:
"""Render a single JSON section to PDF elements using AI-generated styles. """Render a single JSON section to PDF elements using AI-generated styles.
Supports three content formats: reference, object (base64), extracted_text. Supports three content formats: reference, object (base64), extracted_text.
@ -575,8 +688,10 @@ class RendererPdf(BaseRenderer):
content = element.get("content", "") content = element.get("content", "")
source = element.get("source", "") source = element.get("source", "")
if content: if content:
source_text = f" <i>(Source: {source})</i>" if source else "" bodyXml = self._markdownInlineToReportlabXml(content)
all_elements.append(Paragraph(f"{content}{source_text}", self._createNormalStyle(styles))) if source:
bodyXml = f"{bodyXml} <i>(Source: {self._escapeReportlabXml(source)})</i>"
all_elements.append(Paragraph(bodyXml, self._createNormalStyle(styles)))
all_elements.append(Spacer(1, 6)) all_elements.append(Spacer(1, 6))
continue continue
@ -618,10 +733,8 @@ class RendererPdf(BaseRenderer):
return [Paragraph(f"[Error rendering section: {str(e)}]", self._createNormalStyle(styles))] return [Paragraph(f"[Error rendering section: {str(e)}]", self._createNormalStyle(styles))]
def _renderJsonTable(self, table_data: Dict[str, Any], styles: Dict[str, Any]) -> List[Any]: def _renderJsonTable(self, table_data: Dict[str, Any], styles: Dict[str, Any]) -> List[Any]:
"""Render a JSON table to PDF elements using AI-generated styles.""" """Render a JSON table: left-aligned, width capped to printable area, cells wrap."""
try: try:
# Handle nested content structure: element.content.headers vs element.headers
# Extract from nested content structure
content = table_data.get("content", {}) content = table_data.get("content", {})
if not isinstance(content, dict): if not isinstance(content, dict):
return [] return []
@ -631,30 +744,43 @@ class RendererPdf(BaseRenderer):
if not headers or not rows: if not headers or not rows:
return [] return []
# Prepare table data numCols = len(headers)
table_data_list = [headers] + rows colWidth = _PDF_CONTENT_WIDTH_PT / max(numCols, 1)
colWidths = [colWidth] * numCols
# Create table
table = Table(table_data_list) hdrPs = self._createTableCellParagraphStyle(styles, header=True, tableStyleKey="table_header")
cellPs = self._createTableCellParagraphStyle(styles, header=False, tableStyleKey="table_cell")
# Apply styling
def _cellPara(val, ps):
return self._paragraphFromInlineMarkdown(str(val) if val is not None else "", ps)
headerRow = [_cellPara(h, hdrPs) for h in headers]
bodyRows = []
for row in rows:
padded = list(row) + [""] * max(0, numCols - len(row))
padded = padded[:numCols]
bodyRows.append([_cellPara(c, cellPs) for c in padded])
table_matrix = [headerRow] + bodyRows
table = Table(table_matrix, colWidths=colWidths, repeatRows=1)
table_header_style = styles.get("table_header", {}) table_header_style = styles.get("table_header", {})
table_cell_style = styles.get("table_cell", {}) table_cell_style = styles.get("table_cell", {})
table_style = [ table_style = [
('BACKGROUND', (0, 0), (-1, 0), self._hexToColor(table_header_style.get("background", "#4F4F4F"))), ("BACKGROUND", (0, 0), (-1, 0), self._hexToColor(table_header_style.get("background", "#4F4F4F"))),
('TEXTCOLOR', (0, 0), (-1, 0), self._hexToColor(table_header_style.get("text_color", "#FFFFFF"))), ("BACKGROUND", (0, 1), (-1, -1), self._hexToColor(table_cell_style.get("background", "#FFFFFF"))),
('ALIGN', (0, 0), (-1, -1), self._getTableAlignment(table_cell_style.get("align", "left"))), ("ALIGN", (0, 0), (-1, -1), "LEFT"),
('FONTNAME', (0, 0), (-1, 0), 'Helvetica-Bold' if table_header_style.get("bold", True) else 'Helvetica'), ("VALIGN", (0, 0), (-1, -1), "TOP"),
('FONTSIZE', (0, 0), (-1, 0), table_header_style.get("font_size", 12)), ("LEFTPADDING", (0, 0), (-1, -1), 4),
('BOTTOMPADDING', (0, 0), (-1, 0), 12), ("RIGHTPADDING", (0, 0), (-1, -1), 4),
('BACKGROUND', (0, 1), (-1, -1), self._hexToColor(table_cell_style.get("background", "#FFFFFF"))), ("TOPPADDING", (0, 0), (-1, 0), 6),
('FONTSIZE', (0, 1), (-1, -1), table_cell_style.get("font_size", 10)), ("BOTTOMPADDING", (0, 0), (-1, 0), 8),
('GRID', (0, 0), (-1, -1), 1, colors.black) ("TOPPADDING", (0, 1), (-1, -1), 4),
("BOTTOMPADDING", (0, 1), (-1, -1), 4),
("GRID", (0, 0), (-1, -1), 0.5, colors.black),
] ]
table.setStyle(TableStyle(table_style)) table.setStyle(TableStyle(table_style))
return [table, Spacer(1, 12)] return [table, Spacer(1, 12)]
except Exception as e: except Exception as e:
@ -674,9 +800,16 @@ class RendererPdf(BaseRenderer):
elements = [] elements = []
for item in items: for item in items:
if isinstance(item, str): if isinstance(item, str):
elements.append(Paragraph(f"{item}", self._createNormalStyle(styles))) elements.append(
Paragraph(f"{self._markdownInlineToReportlabXml(item)}", self._createNormalStyle(styles))
)
elif isinstance(item, dict) and "text" in item: elif isinstance(item, dict) and "text" in item:
elements.append(Paragraph(f"{item['text']}", self._createNormalStyle(styles))) elements.append(
Paragraph(
f"{self._markdownInlineToReportlabXml(item['text'])}",
self._createNormalStyle(styles),
)
)
if elements: if elements:
elements.append(Spacer(1, bullet_style_def.get("space_after", 3))) elements.append(Spacer(1, bullet_style_def.get("space_after", 3)))
@ -700,7 +833,7 @@ class RendererPdf(BaseRenderer):
if text: if text:
level = max(1, min(6, level)) level = max(1, min(6, level))
heading_style = self._createHeadingStyle(styles, level) heading_style = self._createHeadingStyle(styles, level)
return [Paragraph(text, heading_style)] return [self._paragraphFromInlineMarkdown(text, heading_style)]
return [] return []
@ -721,7 +854,7 @@ class RendererPdf(BaseRenderer):
text = "" text = ""
if text: if text:
return [Paragraph(text, self._createNormalStyle(styles))] return [self._paragraphFromInlineMarkdown(text, self._createNormalStyle(styles))]
return [] return []
@ -741,27 +874,81 @@ class RendererPdf(BaseRenderer):
code_style_def = styles.get("code_block", {}) code_style_def = styles.get("code_block", {})
if code: if code:
code = _prepareCodeBlockPlainText(code)
code = _normalizePdfMonospaceText(code)
elements = [] elements = []
fs = code_style_def.get("font_size", 9)
mono = code_style_def.get("font", "Courier")
if language: if language:
lang_style = ParagraphStyle( lang_style = ParagraphStyle(
'CodeLanguage', "CodeLanguage",
fontSize=code_style_def.get("font_size", 9), fontSize=fs,
textColor=self._hexToColor(code_style_def.get("color", "#2F2F2F")), textColor=self._hexToColor(code_style_def.get("color", "#2F2F2F")),
fontName='Helvetica-Bold' fontName="Helvetica-Bold",
alignment=TA_LEFT,
) )
elements.append(Paragraph(f"Code ({language}):", lang_style)) elements.append(
Paragraph(
code_style = ParagraphStyle( self._escapeReportlabXml(f"Code ({language}):"),
'CodeBlock', lang_style,
fontSize=code_style_def.get("font_size", 9), )
textColor=self._hexToColor(code_style_def.get("color", "#2F2F2F")), )
fontName=code_style_def.get("font", "Courier"),
backColor=self._hexToColor(code_style_def.get("background", "#F5F5F5")), approxCharWPt = max(fs * 0.52, 4.5)
spaceAfter=code_style_def.get("space_after", 6) usableWidth = _PDF_CONTENT_WIDTH_PT - 16 # left+right padding
) maxLineChars = max(48, int(usableWidth / approxCharWPt))
elements.append(Paragraph(code, code_style)) bg_col = self._hexToColor(code_style_def.get("background", "#F5F5F5"))
leading = fs * 1.2
spaceAfter = code_style_def.get("space_after", 6)
# Each source line may wrap to ceil(len/maxLineChars) visual lines.
# Frame height ~740pt minus padding → keep rendered height < 600pt.
maxVisualLinesPerChunk = max(8, int(600 / leading))
srcLines = code.split("\n")
chunks: List[List[str]] = []
curChunk: List[str] = []
curVisual = 0
for sl in srcLines:
wrapped = max(1, -(-len(sl) // maxLineChars)) if sl else 1
if curVisual + wrapped > maxVisualLinesPerChunk and curChunk:
chunks.append(curChunk)
curChunk = []
curVisual = 0
curChunk.append(sl)
curVisual += wrapped
if curChunk:
chunks.append(curChunk)
for ci, chunkLines in enumerate(chunks):
chunkText = "\n".join(chunkLines)
styleId = f"CodePre_{id(code_data) & 0xFFFFFFFF}_{ci}"
codePrStyle = ParagraphStyle(
styleId,
fontName=mono,
fontSize=fs,
leading=leading,
textColor=self._hexToColor(code_style_def.get("color", "#2F2F2F")),
alignment=TA_LEFT,
leftIndent=0,
rightIndent=0,
)
pf = Preformatted(chunkText, codePrStyle, dedent=0, maxLineLength=maxLineChars)
tbl = Table([[pf]], colWidths=[_PDF_CONTENT_WIDTH_PT])
tbl.setStyle(
TableStyle(
[
("BACKGROUND", (0, 0), (-1, -1), bg_col),
("VALIGN", (0, 0), (-1, -1), "TOP"),
("LEFTPADDING", (0, 0), (-1, -1), 8),
("RIGHTPADDING", (0, 0), (-1, -1), 8),
("TOPPADDING", (0, 0), (-1, -1), 6),
("BOTTOMPADDING", (0, 0), (-1, -1), 6),
]
)
)
tbl.spaceAfter = 0 if ci < len(chunks) - 1 else spaceAfter
elements.append(tbl)
return elements return elements
return [] return []

View file

@ -13,6 +13,15 @@ from modules.datamodels.datamodelDocument import RenderedDocument
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
# Inline-markdown tokenizer used when building PowerPoint text runs.
# Alternatives are tried left to right; the capture groups are:
#   group 1 / 2 : **bold**  (1 = with asterisks, 2 = inner text)
#   group 3 / 4 : __bold__  (3 = with underscores, 4 = inner text)
#   group 5     : *italic*  inner text (single line, no adjacent "*")
#   group 6     : _italic_  inner text (not next to a word character or "/")
#   group 7     : `code`    inner text (non-empty)
_PPTX_MD_INLINE_RE = re.compile(
r"(\*\*(.+?)\*\*)"
r"|(__(.+?)__)"
r"|(?<!\*)\*([^*\n]+?)\*(?!\*)"
r"|(?<![\w/])_([^_\n]+?)_(?![\w/])"
r"|`([^`]+)`"
)
class RendererPptx(BaseRenderer): class RendererPptx(BaseRenderer):
"""Renderer for PowerPoint (.pptx) files using python-pptx library.""" """Renderer for PowerPoint (.pptx) files using python-pptx library."""
@ -99,56 +108,75 @@ class RendererPptx(BaseRenderer):
for i, slide_data in enumerate(slidesData): for i, slide_data in enumerate(slidesData):
slide_sections = slide_data.get("sections", []) slide_sections = slide_data.get("sections", [])
slide_images = list(slide_data.get("images", [])) # Make copy so we can append slide_images = list(slide_data.get("images", []))
slide_content = slide_data.get('content', '') slide_content = slide_data.get('content', '')
hasSections = slide_sections and len(slide_sections) > 0 hasSections = slide_sections and len(slide_sections) > 0
hasImages = len(slide_images) > 0 hasImages = len(slide_images) > 0
isTitleSlide = slide_data.get("_isTitleSlide", False)
logger.info(f"Slide {i+1}: '{slide_data.get('title', 'No title')}' - sections: {len(slide_sections)}, images: {len(slide_images)}, content: {len(slide_content)} chars") logger.info(f"Slide {i+1}: '{slide_data.get('title', 'No title')}' - sections: {len(slide_sections)}, images: {len(slide_images)}, content: {len(slide_content)} chars, titleSlide={isTitleSlide}")
# Use blank layout for all slides to avoid placeholder interference # Title slide uses the built-in Title Slide layout (index 0)
# Find blank layout (typically index 6, fallback to 5) if isTitleSlide:
titleLayout = prs.slide_layouts[0]
slide = prs.slides.add_slide(titleLayout)
try:
titleShape = slide.shapes.title
titleShape.text = slide_data.get("title", "")
titleStyle = styles.get("title", {})
tf = titleShape.text_frame
if tf.paragraphs:
p = tf.paragraphs[0]
p.font.size = Pt(titleStyle.get("font_size", 36))
p.font.bold = titleStyle.get("bold", True)
tColor = self._getSafeColor(titleStyle.get("color", (31, 78, 121)))
p.font.color.rgb = RGBColor(*tColor)
except Exception as titleErr:
logger.warning(f"Could not style title slide: {titleErr}")
# Clear subtitle placeholder
try:
sub = slide.placeholders[1]
sub.text = ""
except (KeyError, IndexError):
pass
continue
# Content slides: use blank layout
slideLayoutIndex = None slideLayoutIndex = None
for idx in [6, 5]: for idx in [6, 5]:
if idx < len(prs.slide_layouts): if idx < len(prs.slide_layouts):
try: try:
layout = prs.slide_layouts[idx] layout = prs.slide_layouts[idx]
# Check if it's a blank layout (no placeholders)
if len(layout.placeholders) == 0: if len(layout.placeholders) == 0:
slideLayoutIndex = idx slideLayoutIndex = idx
break break
except (AttributeError, IndexError): except (AttributeError, IndexError):
continue continue
# If no blank layout found, use layout with fewest placeholders
if slideLayoutIndex is None: if slideLayoutIndex is None:
min_placeholders = float('inf') minPh = float('inf')
for idx in range(len(prs.slide_layouts)): for idx in range(len(prs.slide_layouts)):
try: try:
layout = prs.slide_layouts[idx] layout = prs.slide_layouts[idx]
placeholder_count = len(layout.placeholders) if hasattr(layout, 'placeholders') else 0 phCount = len(layout.placeholders) if hasattr(layout, 'placeholders') else 0
if placeholder_count < min_placeholders: if phCount < minPh:
min_placeholders = placeholder_count minPh = phCount
slideLayoutIndex = idx slideLayoutIndex = idx
except: except:
continue continue
# Fallback to first layout if still None
if slideLayoutIndex is None: if slideLayoutIndex is None:
slideLayoutIndex = 0 slideLayoutIndex = 0
slide_layout = prs.slide_layouts[slideLayoutIndex] slide_layout = prs.slide_layouts[slideLayoutIndex]
slide = prs.slides.add_slide(slide_layout) slide = prs.slides.add_slide(slide_layout)
# Clear placeholder text instead of removing placeholders (safer approach)
# This avoids corrupting the PPTX file structure
try: try:
for shape in slide.shapes: for shape in slide.shapes:
if hasattr(shape, 'is_placeholder') and shape.is_placeholder: if hasattr(shape, 'is_placeholder') and shape.is_placeholder:
try: try:
if hasattr(shape, 'text_frame'): if hasattr(shape, 'text_frame'):
shape.text_frame.clear() shape.text_frame.clear()
# Set text to empty string to remove "Click to add text"
if len(shape.text_frame.paragraphs) > 0: if len(shape.text_frame.paragraphs) > 0:
shape.text_frame.paragraphs[0].text = "" shape.text_frame.paragraphs[0].text = ""
except: except:
@ -156,7 +184,7 @@ class RendererPptx(BaseRenderer):
except Exception as placeholder_error: except Exception as placeholder_error:
logger.warning(f"Could not clear placeholders: {str(placeholder_error)}") logger.warning(f"Could not clear placeholders: {str(placeholder_error)}")
# Add title as textbox (smaller size for slides) # Add title as textbox
from pptx.util import Inches from pptx.util import Inches
titleBox = slide.shapes.add_textbox(Inches(0.5), Inches(0.2), prs.slide_width - Inches(1), Inches(0.6)) titleBox = slide.shapes.add_textbox(Inches(0.5), Inches(0.2), prs.slide_width - Inches(1), Inches(0.6))
titleFrame = titleBox.text_frame titleFrame = titleBox.text_frame
@ -232,15 +260,14 @@ class RendererPptx(BaseRenderer):
else: else:
p.alignment = PP_ALIGN.LEFT p.alignment = PP_ALIGN.LEFT
# If no slides were created, create a default slide # If no slides were created, create a single slide with the document title
if not slidesData: if not slidesData:
slide_layout = prs.slide_layouts[0] # Title slide layout slide_layout = prs.slide_layouts[0]
slide = prs.slides.add_slide(slide_layout) slide = prs.slides.add_slide(slide_layout)
title_shape = slide.shapes.title title_shape = slide.shapes.title
title_shape.text = title title_shape.text = title
# Apply title styling to default slide
title_style = styles.get("title", {}) title_style = styles.get("title", {})
if title_shape.text_frame.paragraphs[0].font: if title_shape.text_frame.paragraphs[0].font:
title_shape.text_frame.paragraphs[0].font.size = Pt(title_style.get("font_size", 48)) title_shape.text_frame.paragraphs[0].font.size = Pt(title_style.get("font_size", 48))
@ -248,16 +275,12 @@ class RendererPptx(BaseRenderer):
title_color = self._getSafeColor(title_style.get("color", (31, 78, 121))) title_color = self._getSafeColor(title_style.get("color", (31, 78, 121)))
title_shape.text_frame.paragraphs[0].font.color.rgb = RGBColor(*title_color) title_shape.text_frame.paragraphs[0].font.color.rgb = RGBColor(*title_color)
subtitle_shape = slide.placeholders[1] # Clear subtitle placeholder instead of adding filler text
subtitle_shape.text = "Generated by PowerOn AI System" try:
subtitle_shape = slide.placeholders[1]
# Apply subtitle styling subtitle_shape.text = ""
paragraph_style = styles.get("paragraph", {}) except (KeyError, IndexError):
if subtitle_shape.text_frame.paragraphs[0].font: pass
subtitle_shape.text_frame.paragraphs[0].font.size = Pt(paragraph_style.get("font_size", 20))
subtitle_shape.text_frame.paragraphs[0].font.bold = paragraph_style.get("bold", False)
paragraph_color = self._get_safe_color(paragraph_style.get("color", (47, 47, 47)))
subtitle_shape.text_frame.paragraphs[0].font.color.rgb = RGBColor(*paragraph_color)
# Save to buffer # Save to buffer
buffer = io.BytesIO() buffer = io.BytesIO()
@ -625,24 +648,23 @@ JSON ONLY. NO OTHER TEXT."""
sections = self._extractSections(json_content) sections = self._extractSections(json_content)
metadata = self._extractMetadata(json_content) metadata = self._extractMetadata(json_content)
# Use provided title (which comes from documents[].title) as primary source
# Fallback to metadata.title only if title parameter is empty
document_title = title if title else metadata.get("title", "Generated Document") document_title = title if title else metadata.get("title", "Generated Document")
# Create title slide # Title slide (clean — just the document title, no filler text)
slides.append({ slides.append({
"title": document_title, "title": document_title,
"content": "Generated by PowerOn AI System\n\n" + self._formatTimestamp() "content": "",
"_isTitleSlide": True,
}) })
# Process sections into slides based on content and user intent # Content slides split by chapter headings
slides.extend(self._createSlidesFromSections(sections, styles)) contentSlides = self._createSlidesFromSections(sections, styles)
if contentSlides:
# If no content slides were created, create a default content slide slides.extend(contentSlides)
if len(slides) == 1: # Only title slide else:
slides.append({ slides.append({
"title": "Content Overview", "title": "Content Overview",
"content": "No structured content found in the source documents.\n\nPlease check the source documents and try again." "content": ""
}) })
return slides return slides
@ -941,9 +963,8 @@ JSON ONLY. NO OTHER TEXT."""
content = slide_data.get("content", "") content = slide_data.get("content", "")
title = slide_data.get("title", "") title = slide_data.get("title", "")
# Check if it's a title slide (first slide) if not content:
if not content or "Generated by PowerOn AI System" in content: return 0
return 0 # Title slide layout
# Professional layout selection based on content # Professional layout selection based on content
if "|" in content and "-" in content: if "|" in content and "-" in content:
@ -970,67 +991,71 @@ JSON ONLY. NO OTHER TEXT."""
return 1 # Default to title and content layout return 1 # Default to title and content layout
def _createSlidesFromSections(self, sections: List[Dict[str, Any]], styles: Dict[str, Any]) -> List[Dict[str, Any]]: def _createSlidesFromSections(self, sections: List[Dict[str, Any]], styles: Dict[str, Any]) -> List[Dict[str, Any]]:
"""Create slides from sections: each heading level 1 (chapter) creates a new slide, content accumulates until next level 1 heading.""" """Create slides from sections: each top-level heading creates a new slide.
The split level is determined dynamically: if there is exactly one H1 (the
document title), chapters are H2; otherwise chapters are H1.
"""
try: try:
# First pass: discover heading levels to choose the split level
headingLevels: List[int] = []
for section in sections:
if section.get("content_type") == "heading":
for el in section.get("elements", []):
if isinstance(el, dict):
c = el.get("content", {})
if isinstance(c, dict):
headingLevels.append(c.get("level", 1))
h1Count = headingLevels.count(1)
h2Count = headingLevels.count(2)
# If there's at most one H1 but multiple H2s, split on H2
splitLevel = 2 if h1Count <= 1 and h2Count > 1 else 1
slides = [] slides = []
current_slide_sections = [] # Store sections (not formatted text) for proper rendering currentSlideSections = []
current_slide_title = "Content Overview" currentSlideTitle = "Content Overview"
for section in sections: for section in sections:
section_type = section.get("content_type", "paragraph") sectionType = section.get("content_type", "paragraph")
elements = section.get("elements", []) elements = section.get("elements", [])
# Skip sections with no elements (unless they're headings that should create new slides) if not elements and sectionType != "heading":
if not elements and section_type != "heading":
continue continue
if section_type == "heading": if sectionType == "heading":
# Extract heading level level = 1
level = 1 # Default headingText = ""
heading_text = ""
for element in elements: for element in elements:
if isinstance(element, dict): if isinstance(element, dict):
# Extract from nested content structure
content = element.get("content", {}) content = element.get("content", {})
if isinstance(content, dict): if isinstance(content, dict):
heading_text = content.get("text", "") headingText = content.get("text", "")
level = content.get("level", 1) level = content.get("level", 1)
elif isinstance(content, str): elif isinstance(content, str):
heading_text = content headingText = content
level = 1 level = 1
# Only level 1 headings (chapters) create new slides if level <= splitLevel:
if level == 1: if currentSlideSections:
# If we have accumulated content, create a slide
if current_slide_sections:
slides.append({ slides.append({
"title": current_slide_title, "title": currentSlideTitle,
"sections": current_slide_sections.copy(), # Store sections for proper rendering "sections": currentSlideSections.copy(),
"images": [] "images": []
}) })
current_slide_sections = [] currentSlideSections = []
currentSlideTitle = headingText or section.get("id", "Untitled Section")
# Start new slide with heading as title
if heading_text:
current_slide_title = heading_text
else:
# If no heading text found but this is a heading section, use section ID or default
current_slide_title = section.get("id", "Untitled Section")
else: else:
# Level 2+ headings are added as sections to current slide currentSlideSections.append(section)
current_slide_sections.append(section) elif sectionType == "image":
elif section_type == "image": currentSlideSections.append(section)
# Images are added to current slide (will be organized in frames)
current_slide_sections.append(section)
else: else:
# Add section to current slide (will be rendered properly) currentSlideSections.append(section)
current_slide_sections.append(section)
# Add final slide if there's content if currentSlideSections:
if current_slide_sections:
slides.append({ slides.append({
"title": current_slide_title, "title": currentSlideTitle,
"sections": current_slide_sections.copy(), "sections": currentSlideSections.copy(),
"images": [] "images": []
}) })
@ -1225,14 +1250,66 @@ JSON ONLY. NO OTHER TEXT."""
import traceback import traceback
logger.error(f"Traceback: {traceback.format_exc()}") logger.error(f"Traceback: {traceback.format_exc()}")
def _addMarkdownInlineRuns(self, paragraph, text: str, fontSize=None, fontColor=None, fontBold=None) -> None:
    """Rebuild ``paragraph`` as a sequence of explicitly styled runs.

    ``text`` is scanned with ``_PPTX_MD_INLINE_RE``. Stretches between matches
    become ordinary runs; each match becomes a run with bold forced on
    (groups 2/4), an italic run (groups 5/6) or a "Courier New" run (group 7),
    depending on which capture group is non-empty. Whatever the paragraph held
    before is discarded.

    Args:
        paragraph: pptx paragraph that receives the runs.
        text: String to split into runs.
        fontSize: Size applied to every run when truthy.
        fontColor: Value assigned to ``font.color.rgb`` of every run when truthy.
        fontBold: Bold flag for runs that do not force their own; ``None``
            leaves the bold attribute unset.
    """
    from pptx.util import Pt

    # Start from an empty paragraph; all content is re-added as runs below.
    paragraph.text = ""

    def _styledRun(content, forceBold=None, italic=False):
        # Ordinary/bold/italic run carrying the shared size and colour.
        run = paragraph.add_run()
        run.text = content
        if italic:
            run.font.italic = True
        if fontSize:
            run.font.size = fontSize
        if fontColor:
            run.font.color.rgb = fontColor
        resolvedBold = fontBold if forceBold is None else forceBold
        if resolvedBold is not None:
            run.font.bold = resolvedBold

    def _codeRun(content):
        # Monospace run; the size is reduced to 85% (never below 8pt) when the
        # given size exposes ``.pt``, otherwise it is used unchanged.
        run = paragraph.add_run()
        run.text = content
        run.font.name = "Courier New"
        if fontSize:
            if hasattr(fontSize, 'pt'):
                run.font.size = Pt(max(8, int(fontSize.pt * 0.85)))
            else:
                run.font.size = fontSize
        if fontColor:
            run.font.color.rgb = fontColor

    cursor = 0
    for match in _PPTX_MD_INLINE_RE.finditer(text):
        begin = match.start()
        if begin > cursor:
            _styledRun(text[cursor:begin])

        boldText = match.group(2) or match.group(4)
        if boldText:
            _styledRun(boldText, forceBold=True)
        else:
            italicText = match.group(5) or match.group(6)
            if italicText:
                _styledRun(italicText, italic=True)
            else:
                codeText = match.group(7)
                if codeText:
                    _codeRun(codeText)
        cursor = match.end()

    # Trailing remainder (the whole string when nothing matched).
    if cursor < len(text):
        _styledRun(text[cursor:])
def _addTableToSlide(self, slide, element: Dict[str, Any], styles: Dict[str, Any], top: float = None, max_width: float = None) -> None:
"""Add a PowerPoint table to slide.""" """Add a PowerPoint table to slide."""
try: try:
from pptx.util import Inches, Pt from pptx.util import Inches, Pt
from pptx.enum.text import PP_ALIGN from pptx.enum.text import PP_ALIGN
from pptx.dml.color import RGBColor from pptx.dml.color import RGBColor
# Extract from nested content structure
content = element.get("content", {}) content = element.get("content", {})
if not isinstance(content, dict): if not isinstance(content, dict):
return return
@ -1243,11 +1320,9 @@ JSON ONLY. NO OTHER TEXT."""
if not headers: if not headers:
return return
# Calculate table dimensions num_cols = int(len(headers))
num_cols = int(len(headers)) # Ensure integer num_rows = int(len(rows) + 1)
num_rows = int(len(rows) + 1) # +1 for header row, ensure integer
left = Inches(0.5) left = Inches(0.5)
# Get presentation from stored reference or slide
if hasattr(self, '_currentPresentation'): if hasattr(self, '_currentPresentation'):
prs = self._currentPresentation prs = self._currentPresentation
else: else:
@ -1255,7 +1330,15 @@ JSON ONLY. NO OTHER TEXT."""
width = max_width if max_width is not None else (prs.slide_width - Inches(1)) width = max_width if max_width is not None else (prs.slide_width - Inches(1))
row_height = Inches(0.4) row_height = Inches(0.4)
# Create table - ensure all parameters are proper types # Auto-calculate top from existing shapes when not specified
if top is None:
maxBottom = Inches(1.5)
for shape in slide.shapes:
shapeBottom = shape.top + shape.height
if shapeBottom > maxBottom:
maxBottom = shapeBottom
top = maxBottom + Inches(0.15)
table_height = row_height * num_rows table_height = row_height * num_rows
table_shape = slide.shapes.add_table(num_rows, num_cols, left, top, width, table_height) table_shape = slide.shapes.add_table(num_rows, num_cols, left, top, width, table_height)
table = table_shape.table table = table_shape.table
@ -1361,109 +1444,49 @@ JSON ONLY. NO OTHER TEXT."""
logger.warning(f"Error adding table to slide: {str(e)}") logger.warning(f"Error adding table to slide: {str(e)}")
def _addBulletListToSlide(self, slide, element: Dict[str, Any], styles: Dict[str, Any], text_frame, font_size_multiplier: float = 1.0) -> None: def _addBulletListToSlide(self, slide, element: Dict[str, Any], styles: Dict[str, Any], text_frame, font_size_multiplier: float = 1.0) -> None:
"""Add bullet list to slide text frame.""" """Add bullet list to slide text frame with consistent formatting."""
try: try:
from pptx.util import Pt from pptx.util import Pt
from pptx.dml.color import RGBColor from pptx.dml.color import RGBColor
from pptx.enum.text import PP_ALIGN from pptx.enum.text import PP_ALIGN
# Extract from nested content structure
content = element.get("content", {}) content = element.get("content", {})
if not isinstance(content, dict): if not isinstance(content, dict):
return return
items = content.get("items", []) items = content.get("items", [])
if not items: if not items:
return return
list_style = styles.get("bullet_list", {}) listStyle = styles.get("paragraph", {})
base_font_size = list_style.get("font_size", 14) fontSize = Pt(max(10, int(listStyle.get("font_size", 14) * font_size_multiplier)))
calculated_size = max(10, int(base_font_size * font_size_multiplier)) # Minimum 10pt for readability fontColor = RGBColor(*self._getSafeColor(listStyle.get("color", (47, 47, 47))))
# Pre-calculate and cache style objects to avoid repeated parsing for item in items:
font_size_pt = Pt(calculated_size) itemText = item.get("text", "") if isinstance(item, dict) else str(item)
text_color = self._getSafeColor(list_style.get("color", (47, 47, 47))) if not itemText or not itemText.strip():
text_color_rgb = RGBColor(*text_color)
space_before_pt = Pt(2)
space_after_pt = Pt(2)
logger.debug(f"Rendering bullet list with {len(items)} items")
for idx, item in enumerate(items):
try:
# Get text content first
if isinstance(item, dict):
item_text = item.get("text", "")
else:
item_text = str(item)
# Skip empty items
if not item_text or len(item_text.strip()) == 0:
logger.debug(f"Skipping empty bullet item {idx}")
continue
# Create new paragraph for each bullet item
p = text_frame.add_paragraph()
# Set level to 1 for bullet points BEFORE setting text
# In python-pptx, setting level > 0 should automatically enable bullets
p.level = 1
# Set text content
p.text = item_text
# Apply formatting - use cached objects
p.font.size = font_size_pt
p.font.color.rgb = text_color_rgb
p.alignment = PP_ALIGN.LEFT # Left align bullet lists
p.space_before = space_before_pt # Small spacing before
p.space_after = space_after_pt # Small spacing after
# In python-pptx, setting level > 0 should enable bullets automatically
# However, some versions may not support paragraph_format, so we'll use manual bullets as fallback
# Always add manual bullet character to ensure visibility
if not (p.text.startswith('') or p.text.startswith('-') or p.text.startswith('*') or p.text.startswith('')):
p.text = '' + p.text
logger.debug(f"Added manual bullet character to item {idx}")
# Set proper indentation for multiline bullets (hanging indent)
# For multiline bullets: bullet at left margin, text indented, wrapped lines align with text
try:
# Try accessing paragraph_format - it may not exist in all python-pptx versions
if hasattr(p, 'paragraph_format'):
pf = p.paragraph_format
# Left indent: indents the entire paragraph (bullet + text)
pf.left_indent = Pt(18)
# First line indent: negative value creates hanging indent
# This brings the bullet back to the left while keeping text indented
pf.first_line_indent = Pt(-18) # Negative to create hanging indent
logger.debug(f"Set hanging indent for bullet item {idx}")
else:
# Try via _element if paragraph_format not available
try:
from pptx.util import Pt as PtUtil
pPr = p._element.get_or_add_pPr()
# Set left margin (indents entire paragraph)
pPr.left_margin = PtUtil(18)
# Set first line indent (negative for hanging indent)
pPr.first_line_indent = PtUtil(-18)
logger.debug(f"Set hanging indent via XML for bullet item {idx}")
except Exception as xml_error:
logger.debug(f"Could not set hanging indent via XML: {str(xml_error)}")
# Indentation is optional, continue without it
pass
except Exception as indent_error:
logger.debug(f"Could not set indent for item {idx}: {str(indent_error)}")
# Continue without indent - bullets will still show, but multiline won't be properly indented
logger.debug(f"Successfully added bullet item {idx}: '{item_text[:50]}...'")
except Exception as item_error:
logger.error(f"Error adding bullet item {idx}: {str(item_error)}", exc_info=True)
# Continue with next item even if one fails
continue continue
logger.debug(f"Completed rendering bullet list, added {len(text_frame.paragraphs)} paragraphs") p = text_frame.add_paragraph()
p.level = 0
p.alignment = PP_ALIGN.LEFT
p.space_before = Pt(2)
p.space_after = Pt(2)
# Consistent bullet prefix
self._addMarkdownInlineRuns(p, f"{itemText}", fontSize=fontSize, fontColor=fontColor, fontBold=False)
# Subitems
if isinstance(item, dict):
for sub in item.get("subitems", []):
subText = sub.get("text", "") if isinstance(sub, dict) else str(sub)
if not subText:
continue
sp = text_frame.add_paragraph()
sp.level = 0
sp.alignment = PP_ALIGN.LEFT
sp.space_before = Pt(1)
sp.space_after = Pt(1)
self._addMarkdownInlineRuns(sp, f" {subText}", fontSize=fontSize, fontColor=fontColor, fontBold=False)
except Exception as e: except Exception as e:
logger.warning(f"Error adding bullet list to slide: {str(e)}") logger.warning(f"Error adding bullet list to slide: {str(e)}")
@ -1484,25 +1507,22 @@ JSON ONLY. NO OTHER TEXT."""
if text: if text:
p = text_frame.add_paragraph() p = text_frame.add_paragraph()
p.text = text
# Headings should be level 0 (no indentation) regardless of heading level
p.level = 0 p.level = 0
heading_style = styles.get("heading", {}) heading_style = styles.get("heading", {})
# Different font sizes for different heading levels
if level == 1: if level == 1:
base_font_size = heading_style.get("font_size", 28) # Largest for H1 base_font_size = heading_style.get("font_size", 28)
elif level == 2: elif level == 2:
base_font_size = heading_style.get("font_size", 22) # Medium for H2 base_font_size = heading_style.get("font_size", 22)
elif level == 3: elif level == 3:
base_font_size = heading_style.get("font_size", 18) # Smaller for H3 base_font_size = heading_style.get("font_size", 18)
else: else:
base_font_size = heading_style.get("font_size", 16) # Default for H4+ base_font_size = heading_style.get("font_size", 16)
calculated_size = max(12, int(base_font_size * font_size_multiplier)) # Minimum 12pt for headings calculated_size = max(12, int(base_font_size * font_size_multiplier))
p.font.size = Pt(calculated_size) fSize = Pt(calculated_size)
p.font.bold = heading_style.get("bold", True) fColor = RGBColor(*self._getSafeColor(heading_style.get("color", (31, 78, 121))))
p.font.color.rgb = RGBColor(*self._getSafeColor(heading_style.get("color", (31, 78, 121)))) self._addMarkdownInlineRuns(p, text, fontSize=fSize, fontColor=fColor, fontBold=True)
# Add spacing before and after headings # Add spacing before and after headings
p.space_before = Pt(12 if level == 1 else 8) # More space before H1 p.space_before = Pt(12 if level == 1 else 8) # More space before H1
p.space_after = Pt(6) # Space after heading p.space_after = Pt(6) # Space after heading
@ -1528,11 +1548,8 @@ JSON ONLY. NO OTHER TEXT."""
if text: if text:
p = text_frame.add_paragraph() p = text_frame.add_paragraph()
p.text = text
# Explicitly set level to 0 for regular paragraphs (not bullets)
p.level = 0 p.level = 0
# Ensure no bullet formatting
try: try:
if hasattr(p, 'paragraph_format'): if hasattr(p, 'paragraph_format'):
p.paragraph_format.bullet.type = None p.paragraph_format.bullet.type = None
@ -1540,11 +1557,12 @@ JSON ONLY. NO OTHER TEXT."""
pass pass
paragraph_style = styles.get("paragraph", {}) paragraph_style = styles.get("paragraph", {})
base_font_size = paragraph_style.get("font_size", 14) # Smaller default for better readability base_font_size = paragraph_style.get("font_size", 14)
calculated_size = max(10, int(base_font_size * font_size_multiplier)) # Minimum 10pt for readability calculated_size = max(10, int(base_font_size * font_size_multiplier))
p.font.size = Pt(calculated_size) fSize = Pt(calculated_size)
p.font.bold = paragraph_style.get("bold", False) fColor = RGBColor(*self._getSafeColor(paragraph_style.get("color", (47, 47, 47))))
p.font.color.rgb = RGBColor(*self._getSafeColor(paragraph_style.get("color", (47, 47, 47)))) fBold = paragraph_style.get("bold", False)
self._addMarkdownInlineRuns(p, text, fontSize=fSize, fontColor=fColor, fontBold=fBold)
# Add proper spacing # Add proper spacing
p.space_before = Pt(6) # Space before paragraph p.space_before = Pt(6) # Space before paragraph
@ -1604,261 +1622,31 @@ JSON ONLY. NO OTHER TEXT."""
return datetime.now(UTC).strftime("%Y-%m-%d %H:%M:%S UTC") return datetime.now(UTC).strftime("%Y-%m-%d %H:%M:%S UTC")
def _renderSlideContentWithFrames(self, slide, slide_sections: List[Dict[str, Any]], slide_images: List[Dict[str, Any]], styles: Dict[str, Any], prs) -> None: def _renderSlideContentWithFrames(self, slide, slide_sections: List[Dict[str, Any]], slide_images: List[Dict[str, Any]], styles: Dict[str, Any], prs) -> None:
""" """Render all sections sequentially: text/bullets/headings into a shared
Organize slide content into frames for better layout. textbox, tables and images as separate shapes placed below."""
Groups content by type (images, bullet lists, paragraphs, tables) and renders each in appropriately sized frames.
"""
try: try:
from pptx.util import Inches, Pt from pptx.util import Inches, Pt
from pptx.enum.text import PP_ALIGN
from pptx.dml.color import RGBColor
# Extract images from sections first
images_to_render = list(slide_images) if slide_images else []
text_sections = []
table_sections = []
for section in slide_sections:
section_type = section.get("content_type", "paragraph")
elements = section.get("elements", [])
if not elements:
# Skip empty sections
continue
# Extract images from all sections
section_has_images = False
for element in elements:
if isinstance(element, dict) and element.get("type") == "image":
content = element.get("content", {})
base64Data = None
# Handle different content formats
if isinstance(content, dict):
base64Data = content.get("base64Data")
altText = content.get("altText", "Image")
caption = content.get("caption", "")
elif isinstance(content, str):
# If content is a string, it might be base64 data directly
# Check if it looks like base64
if len(content) > 100 and all(c in "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/=" for c in content[:100]):
base64Data = content
altText = "Image"
caption = ""
else:
# Not base64, skip
continue
else:
# Try to get base64Data directly from element
base64Data = element.get("base64Data")
altText = element.get("altText", "Image")
caption = element.get("caption", "")
if base64Data:
images_to_render.append({
"base64Data": base64Data,
"altText": altText,
"caption": caption
})
section_has_images = True
# Skip image-only sections (they're already added to images_to_render)
if section_type == "image" and section_has_images:
continue
# Categorize sections (excluding image elements)
has_table = False
non_image_elements = []
for element in elements:
if isinstance(element, dict):
element_type = element.get("type", "")
# Skip image elements when categorizing
if element_type == "image":
continue
if element_type == "table" or section_type == "table":
has_table = True
non_image_elements.append(element)
# Only add sections that have non-image content
if non_image_elements:
if has_table:
# Create a copy of section without image elements for table rendering
table_section = {
**section,
"elements": non_image_elements
}
table_sections.append(table_section)
else:
# Create a copy of section without image elements for text rendering
text_section = {
**section,
"elements": non_image_elements
}
text_sections.append(text_section)
# Calculate layout dimensions
title_height = Inches(1.5)
available_height = prs.slide_height - title_height - Inches(0.5) # Title + margin
available_width = prs.slide_width - Inches(1) # Margins
margin = Inches(0.5) margin = Inches(0.5)
contentTop = Inches(1.3)
current_y = title_height + Inches(0.3) availableWidth = prs.slide_width - Inches(1)
availableHeight = prs.slide_height - contentTop - Inches(0.3)
# Determine layout strategy based on content types
has_images = len(images_to_render) > 0 # Create a single textbox for all non-table, non-image content
has_tables = len(table_sections) > 0 textbox = slide.shapes.add_textbox(margin, contentTop, availableWidth, availableHeight)
has_text = len(text_sections) > 0 textFrame = textbox.text_frame
textFrame.word_wrap = True
# Layout 1: Images + Text (horizontal split for landscape) textFrame.auto_size = None
if has_images and has_text and not has_tables:
# Horizontal split: images on left, text on right (landscape format)
img_width = available_width * 0.48
text_width = available_width * 0.48
img_left = margin
text_left = margin + img_width + Inches(0.2)
# Render images in left column (full height)
if images_to_render:
img_height = available_height - Inches(0.2)
self._addImagesToSlideInFrame(slide, images_to_render, styles, img_left, current_y, img_width, img_height)
# Render text in right column (full height, adaptive font size)
if text_sections:
text_height = available_height - Inches(0.2)
self._renderTextSectionsInFrame(slide, text_sections, styles, text_left, current_y, text_width, text_height, adaptiveFontSize=True)
# Layout 2: Tables + Text (horizontal split for landscape)
elif has_tables and has_text:
# Horizontal split: tables on left, text on right (landscape format)
table_width = available_width * 0.48
text_width = available_width * 0.48
table_left = margin
text_left = margin + table_width + Inches(0.2)
# Render tables in left column (full height)
table_y = current_y
for table_section in table_sections:
elements = table_section.get("elements", [])
for element in elements:
if isinstance(element, dict) and element.get("type") == "table":
try:
self._addTableToSlide(slide, element, styles, table_y, max_width=table_width)
# Calculate actual table height
content = element.get("content", {})
if isinstance(content, dict):
rows = content.get("rows", [])
num_rows = len(rows) + 1 # +1 for header
actual_height = Inches(0.4) * num_rows
table_y += actual_height + Inches(0.15)
else:
table_y += Inches(2)
except Exception as table_error:
logger.error(f"Error rendering table: {str(table_error)}")
# Continue with next table
break
# Render text in right column (full height, adaptive font size)
if text_sections:
text_height = available_height - Inches(0.2)
self._renderTextSectionsInFrame(slide, text_sections, styles, text_left, current_y, text_width, text_height, adaptiveFontSize=True)
# Layout 3: Images + Tables + Text (horizontal split for landscape)
elif has_images and has_tables and has_text:
# Horizontal split: Images (left), Tables (middle), Text (right)
img_width = available_width * 0.31
table_width = available_width * 0.31
text_width = available_width * 0.31
img_left = margin
table_left = margin + img_width + Inches(0.15)
text_left = margin + img_width + table_width + Inches(0.3)
# Render images in left column (full height)
if images_to_render:
img_height = available_height - Inches(0.2)
self._addImagesToSlideInFrame(slide, images_to_render, styles, img_left, current_y, img_width, img_height)
# Render tables in middle column (full height)
table_y = current_y
for table_section in table_sections:
elements = table_section.get("elements", [])
for element in elements:
if isinstance(element, dict) and element.get("type") == "table":
try:
self._addTableToSlide(slide, element, styles, table_y, max_width=table_width)
content = element.get("content", {})
if isinstance(content, dict):
rows = content.get("rows", [])
num_rows = len(rows) + 1
actual_height = Inches(0.4) * num_rows
table_y += actual_height + Inches(0.15)
else:
table_y += Inches(2)
except Exception as table_error:
logger.error(f"Error rendering table: {str(table_error)}")
break
# Render text in right column (full height, adaptive font size)
if text_sections:
text_height = available_height - Inches(0.2)
self._renderTextSectionsInFrame(slide, text_sections, styles, text_left, current_y, text_width, text_height, adaptiveFontSize=True)
# Layout 4: Images only
elif has_images and not has_text and not has_tables:
img_width = available_width * 0.8
img_height = available_height * 0.8
img_left = (available_width - img_width) / 2 + margin
self._addImagesToSlideInFrame(slide, images_to_render, styles, img_left, current_y, img_width, img_height)
# Layout 5: Text only (default, adaptive font size)
elif has_text and not has_images and not has_tables:
text_height = available_height - Inches(0.2)
self._renderTextSectionsInFrame(slide, text_sections, styles, margin, current_y, available_width, text_height, adaptiveFontSize=True)
# Layout 6: Tables only
elif has_tables and not has_images and not has_text:
table_height = available_height / max(len(table_sections), 1)
table_width = available_width
for table_section in table_sections:
elements = table_section.get("elements", [])
for element in elements:
if isinstance(element, dict) and element.get("type") == "table":
try:
self._addTableToSlide(slide, element, styles, current_y, max_width=table_width)
# Calculate actual table height
content = element.get("content", {})
if isinstance(content, dict):
rows = content.get("rows", [])
num_rows = len(rows) + 1 # +1 for header
actual_height = min(Inches(0.4) * num_rows, table_height)
current_y += actual_height + Inches(0.2)
else:
current_y += table_height + Inches(0.2)
except Exception as table_error:
logger.error(f"Error rendering table: {str(table_error)}")
# Continue with next table
break
except Exception as e:
logger.error(f"Error rendering slide content with frames: {str(e)}")
# Fallback to simple rendering
try:
content_shape = slide.placeholders[1]
text_frame = content_shape.text_frame
text_frame.clear()
except (AttributeError, IndexError):
from pptx.util import Inches
left = Inches(0.5)
top = Inches(1.5)
width = prs.slide_width - Inches(1)
height = prs.slide_height - top - Inches(0.5)
textbox = slide.shapes.add_textbox(left, top, width, height)
text_frame = textbox.text_frame
text_frame.word_wrap = True
# Simple fallback rendering
for section in slide_sections: for section in slide_sections:
self._renderSectionToTextFrame(slide, section, styles, text_frame, font_size_multiplier=1.0) self._renderSectionToTextFrame(slide, section, styles, textFrame, font_size_multiplier=1.0)
# Render standalone images that were passed alongside sections
if slide_images:
self._addImagesToSlideInFrame(slide, slide_images, styles, margin, contentTop, availableWidth, availableHeight)
except Exception as e:
logger.error(f"Error rendering slide content: {str(e)}")
def _renderTextSectionsInFrame(self, slide, text_sections: List[Dict[str, Any]], styles: Dict[str, Any], left: float, top: float, width: float, height: float, adaptiveFontSize: bool = False) -> None: def _renderTextSectionsInFrame(self, slide, text_sections: List[Dict[str, Any]], styles: Dict[str, Any], left: float, top: float, width: float, height: float, adaptiveFontSize: bool = False) -> None:
"""Render text sections (paragraphs, lists, headings) in a text frame.""" """Render text sections (paragraphs, lists, headings) in a text frame."""
@ -1935,6 +1723,14 @@ JSON ONLY. NO OTHER TEXT."""
except Exception as e: except Exception as e:
logger.warning(f"Error rendering text sections in frame: {str(e)}") logger.warning(f"Error rendering text sections in frame: {str(e)}")
@staticmethod
def _isHorizontalRule(element: Dict[str, Any]) -> bool:
    """Return True when ``element`` contains nothing but a markdown horizontal rule.

    The text is taken from ``element["content"]["text"]`` when ``content`` is a
    dict, or from ``content`` itself when it is a string. A rule is three or
    more of the *same* marker (``-``, ``*`` or ``_``), optionally separated by
    spaces, e.g. ``---``, ``* * *`` or ``_____``.

    Args:
        element: Section element as passed to the slide renderers.

    Returns:
        True for a horizontal rule; False for anything else, including
        missing, empty or non-string text (which never raises).
    """
    content = element.get("content", {})
    text = content.get("text", "") if isinstance(content, dict) else content
    if not isinstance(text, str):
        # e.g. {"text": None} or numeric content: nothing to strip, not a rule.
        return False

    markers = text.strip().replace(" ", "")
    if len(markers) < 3:
        return False

    # All remaining characters must be one and the same marker; mixed
    # sequences such as "-*_" are ordinary text and must not be dropped.
    first = markers[0]
    return first in "-*_" and markers == first * len(markers)
def _renderSectionToTextFrame(self, slide, section: Dict[str, Any], styles: Dict[str, Any], text_frame, font_size_multiplier: float = 1.0) -> None: def _renderSectionToTextFrame(self, slide, section: Dict[str, Any], styles: Dict[str, Any], text_frame, font_size_multiplier: float = 1.0) -> None:
"""Render a single section to a text frame.""" """Render a single section to a text frame."""
try: try:
@ -1942,7 +1738,7 @@ JSON ONLY. NO OTHER TEXT."""
from pptx.enum.text import PP_ALIGN from pptx.enum.text import PP_ALIGN
from pptx.dml.color import RGBColor from pptx.dml.color import RGBColor
section_type = section.get("content_type", "paragraph") sectionType = section.get("content_type", "paragraph")
elements = section.get("elements", []) elements = section.get("elements", [])
if not elements: if not elements:
@ -1952,54 +1748,42 @@ JSON ONLY. NO OTHER TEXT."""
if not isinstance(element, dict): if not isinstance(element, dict):
continue continue
element_type = element.get("type", "") elementType = element.get("type", "") or sectionType
if not element_type:
element_type = section_type if elementType == "image":
continue
# Skip images - handled separately
if element_type == "image": # Skip horizontal rules (---, ***, ___)
if elementType == "paragraph" and self._isHorizontalRule(element):
continue continue
if element_type == "bullet_list" or element_type == "list": if elementType == "table":
self._addTableToSlide(slide, element, styles)
elif elementType in ("bullet_list", "list"):
self._addBulletListToSlide(slide, element, styles, text_frame, font_size_multiplier) self._addBulletListToSlide(slide, element, styles, text_frame, font_size_multiplier)
elif element_type == "heading": elif elementType == "heading":
self._addHeadingToSlide(slide, element, styles, text_frame, font_size_multiplier) self._addHeadingToSlide(slide, element, styles, text_frame, font_size_multiplier)
elif element_type == "paragraph": elif elementType == "paragraph":
self._addParagraphToSlide(slide, element, styles, text_frame, font_size_multiplier) self._addParagraphToSlide(slide, element, styles, text_frame, font_size_multiplier)
elif element_type == "code_block" or element_type == "code": elif elementType in ("code_block", "code"):
self._addCodeBlockToSlide(slide, element, styles, text_frame, font_size_multiplier) self._addCodeBlockToSlide(slide, element, styles, text_frame, font_size_multiplier)
elif element_type == "extracted_text": elif elementType == "extracted_text":
content = element.get("content", "") content = element.get("content", "")
source = element.get("source", "")
if content: if content:
paragraph_style = styles.get("paragraph", {})
p = text_frame.add_paragraph() p = text_frame.add_paragraph()
p.text = content pStyle = styles.get("paragraph", {})
base_font_size = paragraph_style.get("font_size", 18) fSize = Pt(max(10, int(pStyle.get("font_size", 14) * font_size_multiplier)))
p.font.size = Pt(int(base_font_size * font_size_multiplier)) fColor = RGBColor(*self._getSafeColor(pStyle.get("color", (47, 47, 47))))
p.font.bold = paragraph_style.get("bold", False) self._addMarkdownInlineRuns(p, content, fontSize=fSize, fontColor=fColor)
p.font.color.rgb = RGBColor(*self._getSafeColor(paragraph_style.get("color", (47, 47, 47))))
p.alignment = PP_ALIGN.LEFT p.alignment = PP_ALIGN.LEFT
if source: elif elementType == "reference":
p.add_run(f" (Source: {source})").font.italic = True
elif element_type == "reference":
label = element.get("label", "Reference") label = element.get("label", "Reference")
p = text_frame.add_paragraph() p = text_frame.add_paragraph()
p.text = f"[Reference: {label}]" p.text = f"[Reference: {label}]"
p.font.italic = True p.font.italic = True
p.alignment = PP_ALIGN.LEFT p.alignment = PP_ALIGN.LEFT
else: else:
# Fallback to paragraph self._addParagraphToSlide(slide, element, styles, text_frame, font_size_multiplier)
content = element.get("content", "")
if isinstance(content, dict):
text = content.get("text", "")
elif isinstance(content, str):
text = content
else:
text = ""
if text:
self._addParagraphToSlide(slide, element, styles, text_frame, font_size_multiplier=1.0)
except Exception as e: except Exception as e:
logger.warning(f"Error rendering section to text frame: {str(e)}") logger.warning(f"Error rendering section to text frame: {str(e)}")

View file

@ -0,0 +1,253 @@
# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
"""
Smoke test: RendererPdf with every JSON section/element shape the pipeline supports.
Canonical section types (datamodelJson.supportedSectionTypes): table, bullet_list, heading,
paragraph, code_block, image.
PDF renderer additionally handles element types: reference, extracted_text (Phase 5D).
"""
from __future__ import annotations
from types import SimpleNamespace
import pytest
from modules.serviceCenter.services.serviceGeneration.renderers.rendererPdf import (
REPORTLAB_AVAILABLE,
RendererPdf,
_normalizePdfMonospaceText,
_prepareCodeBlockPlainText,
)
# Base64 text of a 1×1 transparent PNG; used as the "base64Data" payload of the
# image section in the smoke-test document.
_MIN_PNG_B64 = (
    "iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR42mP8z8BQDwAEhQGAhKmMIQAAAABJRU5ErkJggg=="
)
def _fakeServices():
    """Build a stand-in ``services`` object for constructing ``RendererPdf``.

    The stub exposes ``utils.debugLogToFile`` as a callable that discards its
    arguments, so renderer code calling it never hits a ``None`` attribute.
    """
    def _discard(msg, tag=None):
        return None

    utilsStub = SimpleNamespace(debugLogToFile=_discard)
    return SimpleNamespace(utils=utilsStub)
def _fullDocumentJson() -> dict:
    """Return a single-document payload exercising every section shape.

    The sections cover each ``content_type`` used by the smoke test (heading,
    paragraph, bullet_list in bullet and numbered form, table, code_block,
    image) plus paragraph sections carrying ``reference`` and
    ``extracted_text`` elements.
    """
    def _section(sectionId, contentType, order, element):
        # Every section in this fixture wraps exactly one element.
        return {
            "id": sectionId,
            "content_type": contentType,
            "order": order,
            "elements": [element],
        }

    documentTitle = "PDF Renderer Smoke"

    paragraphText = (
        "Paragraph: **strong**, *emphasis*, __under-like bold__, "
        "_single underscores_, and `var = 1`."
    )
    codeSample = (
        'def hello():\n print("<tag> & ampersand")\n return 42\n'
        "\n# tree (Unicode box drawing must not produce tofu in PDF)\n"
        "Reports/\n\u251c\u2500\u2500 2025/\n\u2502 \u2514\u2500\u2500 file.txt\n"
    )

    sections = [
        _section(
            "sec_h1",
            "heading",
            1,
            {
                "content": {
                    "text": "H1 with **bold** and a very long subtitle line that should wrap cleanly without overlapping",
                    "level": 1,
                }
            },
        ),
        _section(
            "sec_h2",
            "heading",
            2,
            {"content": {"text": "H2 *italic* and `inline code`", "level": 2}},
        ),
        _section("sec_para", "paragraph", 3, {"content": {"text": paragraphText}}),
        _section(
            "sec_bullets",
            "bullet_list",
            4,
            {
                "content": {
                    "items": [
                        "Bullet **one**",
                        {"text": "Bullet two with *italic*"},
                    ],
                    "list_type": "bullet",
                }
            },
        ),
        _section(
            "sec_numbered",
            "bullet_list",
            5,
            {
                "content": {
                    "items": [{"text": "First numbered"}, {"text": "Second **numbered**"}],
                    "list_type": "numbered",
                }
            },
        ),
        _section(
            "sec_table",
            "table",
            6,
            {
                "content": {
                    "headers": ["Col A", "Col B", "Col C"],
                    "rows": [
                        ["Short", "Medium length cell", "**Bold** in cell"],
                        ["R2", "Data", "`code`"],
                    ],
                }
            },
        ),
        _section(
            "sec_code",
            "code_block",
            7,
            {"content": {"language": "python", "code": codeSample}},
        ),
        _section(
            "sec_image",
            "image",
            8,
            {
                "content": {
                    "base64Data": _MIN_PNG_B64,
                    "altText": "Smoke pixel",
                    "caption": "Minimal PNG (1×1)",
                }
            },
        ),
        _section(
            "sec_reference",
            "paragraph",
            9,
            {
                "type": "reference",
                "label": "External spec",
                "documentReference": "urn:smoke:ref",
            },
        ),
        _section(
            "sec_extracted",
            "paragraph",
            10,
            {
                "type": "extracted_text",
                "content": "Extracted **body** with formatting.",
                "source": "fixture/source.md",
            },
        ),
    ]

    metadata = {
        "split_strategy": "single_document",
        "source_documents": [],
        "extraction_method": "smoke_test",
        "title": documentTitle,
        "language": "de",
    }
    document = {
        "id": "doc_smoke",
        "title": documentTitle,
        "filename": "pdf_renderer_smoke.pdf",
        "sections": sections,
    }
    return {"metadata": metadata, "documents": [document]}
@pytest.mark.asyncio
async def test_renderer_pdf_all_json_elements(tmp_path):
    """Render the full smoke document and sanity-check the produced PDF."""
    if not REPORTLAB_AVAILABLE:
        pytest.skip("reportlab is not installed")

    rendered = await RendererPdf(services=_fakeServices()).render(
        extractedContent=_fullDocumentJson(),
        title="PDF_Renderer_Smoke",
        userPrompt=None,
        aiService=None,
    )

    # Exactly one output document, typed and named as a PDF.
    assert len(rendered) == 1
    pdfDoc = rendered[0]
    assert pdfDoc.mimeType == "application/pdf"
    assert pdfDoc.documentData[:4] == b"%PDF"
    assert pdfDoc.filename.endswith(".pdf")

    # Persist under tmp_path and require a non-trivial file size.
    target = tmp_path / "pdf_renderer_smoke.pdf"
    target.write_bytes(pdfDoc.documentData)
    assert target.stat().st_size > 500
def test_prepare_code_block_preserves_indentation_spaces():
    """Leading spaces survive code-block preparation; tab characters do not."""
    # NOTE(review): the literals below show single leading spaces although the
    # last line says "two leading" - confirm the fixture's intended whitespace.
    source = "def x():\n return 1\n two leading on line"
    prepared = _prepareCodeBlockPlainText(source)
    assert " return" in prepared

    tabbed = _prepareCodeBlockPlainText("a\tb")
    assert "\t" not in tabbed
def test_normalize_pdf_monospace_replaces_box_drawing():
    """Box-drawing glyphs are removed by normalisation while plain text stays."""
    sample = "\u2500\u2502\u251c\u2514\u252c\nReports/\n"
    normalized = _normalizePdfMonospaceText(sample)
    for glyph in ("\u2500", "\u2502"):
        assert glyph not in normalized
    assert "Reports/" in normalized
def test_pdf_heading_font_sizes_strictly_decrease():
    """Heading sizes must shrink from H1 to H3 (regression: H3 fell back to H1 styling)."""
    renderer = RendererPdf(services=_fakeServices())
    styleSet = renderer._getDefaultStyleSet()

    # Default style definitions.
    h1Size = styleSet["heading1"]["font_size"]
    h2Size = styleSet["heading2"]["font_size"]
    h3Size = styleSet["heading3"]["font_size"]
    assert h1Size > h2Size > h3Size
    assert renderer._defaultHeadingStyleDef(2)["font_size"] > renderer._defaultHeadingStyleDef(3)["font_size"]

    if not REPORTLAB_AVAILABLE:
        return

    # Concrete heading styles built from the full style set.
    built = [renderer._createHeadingStyle(styleSet, level).fontSize for level in (1, 2, 3)]
    assert built[0] > built[1] > built[2]

    # A style set without "heading3" must still yield an H3 smaller than H2.
    withoutH3 = {key: styleSet[key] for key in ("heading1", "heading2")}
    assert renderer._createHeadingStyle(withoutH3, 3).fontSize < renderer._createHeadingStyle(withoutH3, 2).fontSize
def test_inline_code_angle_brackets_escaped_in_font_span():
    """Inline code such as `.../<Slug>/` is emitted in Courier with its angle brackets escaped."""
    renderer = RendererPdf(services=_fakeServices())
    source = "unter `Eingabe/<Slug>/` speichern"
    xml = renderer._markdownInlineToReportlabXml(source)
    for expected in ('name="Courier"', "&lt;Slug&gt;"):
        assert expected in xml