278 lines
11 KiB
Python
278 lines
11 KiB
Python
# Copyright (c) 2025 Patrick Motsch
|
||
# All rights reserved.
|
||
"""
|
||
Smoke test: RendererPdf with every JSON section/element shape the pipeline supports.
|
||
|
||
Canonical section types (datamodelJson.supportedSectionTypes): table, bullet_list, heading,
|
||
paragraph, code_block, image.
|
||
|
||
PDF renderer additionally handles element types: reference, extracted_text (Phase 5D).
|
||
"""
|
||
|
||
from __future__ import annotations
|
||
|
||
from types import SimpleNamespace
|
||
|
||
import pytest
|
||
|
||
from modules.serviceCenter.services.serviceGeneration.renderers.rendererPdf import (
|
||
REPORTLAB_AVAILABLE,
|
||
RendererPdf,
|
||
_normalizePdfMonospaceText,
|
||
_prepareCodeBlockPlainText,
|
||
)
|
||
|
||
# Base64 text of a 1×1 transparent PNG; referenced by the image section fixture.
_MIN_PNG_B64 = "iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR42mP8z8BQDwAEhQGAhKmMIQAAAABJRU5ErkJggg=="
|
||
|
||
|
||
def _fakeServices():
    """Build a stand-in ``services`` object for constructing a RendererPdf.

    RendererPdf calls ``services.utils.debugLogToFile``, so that attribute
    must be callable rather than ``None``. The stub accepts ``(msg, tag=None)``,
    does nothing and returns ``None``.
    """
    utilsStub = SimpleNamespace(debugLogToFile=lambda msg, tag=None: None)
    return SimpleNamespace(utils=utilsStub)
|
||
|
||
|
||
def _fullDocumentJson() -> dict:
    """Assemble the smoke-test payload: metadata plus one document of ten sections.

    Sections carry ``order`` 1-10: two headings (levels 1 and 2), a paragraph,
    a bullet list, a numbered list, a table, a code block, an image, and two
    paragraph sections whose only element is typed ``reference`` and
    ``extracted_text`` respectively. The text fixtures contain inline markdown
    markers (``**``, ``*``, ``__``, ``_`` and backticks).
    """

    def section(sectionId, contentType, order, *elements):
        # Keys are emitted in the order id, content_type, order, elements.
        return {
            "id": sectionId,
            "content_type": contentType,
            "order": order,
            "elements": list(elements),
        }

    def wrapped(**fields):
        # Most elements nest their fields under a single "content" key.
        return {"content": fields}

    docTitle = "PDF Renderer Smoke"

    paragraphText = (
        "Paragraph: **strong**, *emphasis*, __under-like bold__, "
        "_single underscores_, and `var = 1`."
    )
    codeSample = (
        'def hello():\n print("<tag> & ampersand")\n return 42\n'
        "\n# tree (Unicode box drawing must not produce tofu in PDF)\n"
        "Reports/\n\u251c\u2500\u2500 2025/\n\u2502 \u2514\u2500\u2500 file.txt\n"
    )

    sections = [
        section(
            "sec_h1",
            "heading",
            1,
            wrapped(
                text="H1 with **bold** and a very long subtitle line that should wrap cleanly without overlapping",
                level=1,
            ),
        ),
        section(
            "sec_h2",
            "heading",
            2,
            wrapped(text="H2 *italic* and `inline code`", level=2),
        ),
        section("sec_para", "paragraph", 3, wrapped(text=paragraphText)),
        section(
            "sec_bullets",
            "bullet_list",
            4,
            wrapped(
                items=[
                    "Bullet **one**",
                    {"text": "Bullet two with *italic*"},
                ],
                list_type="bullet",
            ),
        ),
        section(
            "sec_numbered",
            "bullet_list",
            5,
            wrapped(
                items=[{"text": "First numbered"}, {"text": "Second **numbered**"}],
                list_type="numbered",
            ),
        ),
        section(
            "sec_table",
            "table",
            6,
            wrapped(
                headers=["Col A", "Col B", "Col C"],
                rows=[
                    ["Short", "Medium length cell", "**Bold** in cell"],
                    ["R2", "Data", "`code`"],
                ],
            ),
        ),
        section(
            "sec_code",
            "code_block",
            7,
            wrapped(language="python", code=codeSample),
        ),
        section(
            "sec_image",
            "image",
            8,
            wrapped(
                base64Data=_MIN_PNG_B64,
                altText="Smoke pixel",
                caption="Minimal PNG (1×1)",
            ),
        ),
        # The last two elements are typed directly instead of using a "content" dict.
        section(
            "sec_reference",
            "paragraph",
            9,
            {
                "type": "reference",
                "label": "External spec",
                "documentReference": "urn:smoke:ref",
            },
        ),
        section(
            "sec_extracted",
            "paragraph",
            10,
            {
                "type": "extracted_text",
                "content": "Extracted **body** with formatting.",
                "source": "fixture/source.md",
            },
        ),
    ]

    metadata = {
        "split_strategy": "single_document",
        "source_documents": [],
        "extraction_method": "smoke_test",
        "title": docTitle,
        "language": "de",
    }
    document = {
        "id": "doc_smoke",
        "title": docTitle,
        "filename": "pdf_renderer_smoke.pdf",
        "sections": sections,
    }
    return {"metadata": metadata, "documents": [document]}
|
||
|
||
|
||
@pytest.mark.asyncio
async def test_renderer_pdf_all_json_elements(tmp_path):
    """Render the full fixture document and check that one PDF comes back.

    Skipped when reportlab is not installed. The result must be a single
    document with the PDF MIME type, a ``%PDF`` header and a ``.pdf``
    filename; written to ``tmp_path`` it must be larger than 500 bytes.
    """
    if not REPORTLAB_AVAILABLE:
        pytest.skip("reportlab is not installed")

    rendered = await RendererPdf(services=_fakeServices()).render(
        extractedContent=_fullDocumentJson(),
        title="PDF_Renderer_Smoke",
        userPrompt=None,
        aiService=None,
    )
    assert len(rendered) == 1

    pdfDoc = rendered[0]
    assert pdfDoc.mimeType == "application/pdf"
    pdfBytes = pdfDoc.documentData
    assert pdfBytes[:4] == b"%PDF"
    assert pdfDoc.filename.endswith(".pdf")

    # Round-trip through the filesystem so the size check sees what was written.
    target = tmp_path / "pdf_renderer_smoke.pdf"
    target.write_bytes(pdfBytes)
    assert target.stat().st_size > 500
|
||
|
||
|
||
def test_prepare_code_block_preserves_indentation_spaces():
    """Runs of leading spaces must survive code-block preparation; tabs must not.

    The fixture uses a four-space and a two-space indent so that an
    implementation which collapses a run of spaces into one is caught.
    A single-space indent would pass either way.
    """
    raw = "def x():\n    return 1\n  two leading on line"
    prepared = _prepareCodeBlockPlainText(raw)
    # Full-width indents must come through unchanged.
    assert "    return" in prepared
    assert "  two leading" in prepared
    # No literal tab character may remain in the prepared text.
    assert "\t" not in _prepareCodeBlockPlainText("a\tb")
|
||
|
||
|
||
def test_normalize_pdf_monospace_replaces_box_drawing():
    """U+2500 and U+2502 must be absent after normalisation; ordinary text stays."""
    boxSample = "\u2500\u2502\u251c\u2514\u252c\nReports/\n"
    normalized = _normalizePdfMonospaceText(boxSample)
    for glyph in ("\u2500", "\u2502"):
        assert glyph not in normalized
    assert "Reports/" in normalized
|
||
|
||
|
||
def test_pdf_heading_font_sizes_strictly_decrease():
    """H3 must not fall back to H1 styles (previous bug: ## smaller than ###).

    Checks the default style set and the per-level default definitions first.
    When reportlab is available it also checks the built heading styles,
    including a style set that has no ``heading3`` entry.
    """
    pdfRenderer = RendererPdf(services=_fakeServices())
    defaults = pdfRenderer._getDefaultStyleSet()

    defaultSizes = [defaults[f"heading{level}"]["font_size"] for level in (1, 2, 3)]
    assert defaultSizes[0] > defaultSizes[1] > defaultSizes[2]

    level2Size = pdfRenderer._defaultHeadingStyleDef(2)["font_size"]
    level3Size = pdfRenderer._defaultHeadingStyleDef(3)["font_size"]
    assert level2Size > level3Size

    if not REPORTLAB_AVAILABLE:
        return

    builtSizes = [pdfRenderer._createHeadingStyle(defaults, level).fontSize for level in (1, 2, 3)]
    assert builtSizes[0] > builtSizes[1] > builtSizes[2]

    # Style set without "heading3": level 3 must still come out smaller than level 2.
    withoutH3 = {key: defaults[key] for key in ("heading1", "heading2")}
    fallbackH3 = pdfRenderer._createHeadingStyle(withoutH3, 3).fontSize
    explicitH2 = pdfRenderer._createHeadingStyle(withoutH3, 2).fontSize
    assert fallbackH3 < explicitH2
|
||
|
||
|
||
def test_inline_code_angle_brackets_escaped_in_font_span():
    """Paths like `.../<Slug>/` must not break ReportLab XML inside Courier.

    Inline code is expected to come back inside a Courier font span. The
    angle brackets of the path must arrive as XML entities, because a raw
    ``<Slug>`` inside paragraph markup would be read as a tag.
    """
    renderer = RendererPdf(services=_fakeServices())
    xml = renderer._markdownInlineToReportlabXml("unter `Eingabe/<Slug>/` speichern")
    assert 'name="Courier"' in xml
    # The tag-like raw text must not reach the paragraph markup ...
    assert "<Slug>" not in xml
    # ... and must be present in escaped form instead (exactly one level of escaping).
    assert "&lt;Slug" in xml
|
||
|
||
|
||
def test_emoji_codepoints_wrapped_in_emoji_font_span():
    """Emoji must be emitted inside <font name="NotoEmoji">…</font> when that font
    is available, so ReportLab switches to the Noto Emoji TTF instead of drawing
    missing-glyph squares. Skipped when reportlab is not installed."""
    if not REPORTLAB_AVAILABLE:
        pytest.skip("reportlab is not installed")

    pdfRenderer = RendererPdf(services=_fakeServices())
    statusXml = pdfRenderer._markdownInlineToReportlabXml("Status: \U0001F600 done \U0001F389")

    # Two acceptable outcomes: the emoji font initialised and the codepoints are
    # wrapped, or it did not and the text is passed through. Either way the
    # emoji itself must still be in the output.
    from modules.serviceCenter.services.serviceGeneration.renderers._pdfFontFallback import (
        _initialize as _emojiInit,
    )

    if not _emojiInit():
        assert "\U0001F600" in statusXml
    else:
        assert 'name="NotoEmoji"' in statusXml
        assert "\U0001F600" in statusXml
        assert "\U0001F389" in statusXml

    # Bold text containing an emoji keeps its <b>...</b> pair, and carries the
    # emoji font tag as well when the font is available.
    boldXml = pdfRenderer._markdownInlineToReportlabXml("**OK \U00002705**")
    assert "<b>" in boldXml
    assert "</b>" in boldXml
    if _emojiInit():
        assert 'name="NotoEmoji"' in boldXml
|