gateway/tests/unit/services/test_renderer_pdf_smoke.py
2026-03-22 11:09:48 +01:00

253 lines
9.7 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
"""
Smoke test: RendererPdf with every JSON section/element shape the pipeline supports.
Canonical section types (datamodelJson.supportedSectionTypes): table, bullet_list, heading,
paragraph, code_block, image.
PDF renderer additionally handles element types: reference, extracted_text (Phase 5D).
"""
from __future__ import annotations
from types import SimpleNamespace
import pytest
from modules.serviceCenter.services.serviceGeneration.renderers.rendererPdf import (
REPORTLAB_AVAILABLE,
RendererPdf,
_normalizePdfMonospaceText,
_prepareCodeBlockPlainText,
)
# Base64 text of a 1×1 transparent PNG; serves as the image section's payload.
_MIN_PNG_B64 = "iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR42mP8z8BQDwAEhQGAhKmMIQAAAABJRU5ErkJggg=="
def _fakeServices():
    """Build a minimal stand-in for the services object handed to RendererPdf.

    RendererPdf calls ``services.utils.debugLogToFile``; a do-nothing callable
    is supplied there so that attribute is never None.
    """

    def _discard(msg, tag=None):
        # Deliberately ignore every log call.
        return None

    utilsStub = SimpleNamespace(debugLogToFile=_discard)
    return SimpleNamespace(utils=utilsStub)
def _fullDocumentJson() -> dict:
    """Return the smoke-test payload: one document holding ten ordered sections.

    The sections cover the heading, paragraph, bullet_list (bullet and
    numbered), table, code_block and image content types, followed by two
    paragraph sections whose elements are of type ``reference`` and
    ``extracted_text``.
    """

    def section(sectionId, contentType, position, elements):
        # Every section shares the same four keys.
        return {
            "id": sectionId,
            "content_type": contentType,
            "order": position,
            "elements": elements,
        }

    def contentElement(content):
        # Most elements simply wrap their payload under "content".
        return {"content": content}

    docTitle = "PDF Renderer Smoke"

    paragraphText = (
        "Paragraph: **strong**, *emphasis*, __under-like bold__, "
        "_single underscores_, and `var = 1`."
    )

    # NOTE(review): indentation inside this sample appears as single spaces;
    # confirm that matches the intended fixture.
    codeSample = (
        'def hello():\n print("<tag> & ampersand")\n return 42\n'
        "\n# tree (Unicode box drawing must not produce tofu in PDF)\n"
        "Reports/\n\u251c\u2500\u2500 2025/\n\u2502 \u2514\u2500\u2500 file.txt\n"
    )

    sections = [
        section(
            "sec_h1",
            "heading",
            1,
            [
                contentElement(
                    {
                        "text": "H1 with **bold** and a very long subtitle line that should wrap cleanly without overlapping",
                        "level": 1,
                    }
                )
            ],
        ),
        section(
            "sec_h2",
            "heading",
            2,
            [contentElement({"text": "H2 *italic* and `inline code`", "level": 2})],
        ),
        section("sec_para", "paragraph", 3, [contentElement({"text": paragraphText})]),
        section(
            "sec_bullets",
            "bullet_list",
            4,
            [
                contentElement(
                    {
                        # Items may be bare strings or {"text": ...} dicts.
                        "items": [
                            "Bullet **one**",
                            {"text": "Bullet two with *italic*"},
                        ],
                        "list_type": "bullet",
                    }
                )
            ],
        ),
        section(
            "sec_numbered",
            "bullet_list",
            5,
            [
                contentElement(
                    {
                        "items": [
                            {"text": "First numbered"},
                            {"text": "Second **numbered**"},
                        ],
                        "list_type": "numbered",
                    }
                )
            ],
        ),
        section(
            "sec_table",
            "table",
            6,
            [
                contentElement(
                    {
                        "headers": ["Col A", "Col B", "Col C"],
                        "rows": [
                            ["Short", "Medium length cell", "**Bold** in cell"],
                            ["R2", "Data", "`code`"],
                        ],
                    }
                )
            ],
        ),
        section(
            "sec_code",
            "code_block",
            7,
            [contentElement({"language": "python", "code": codeSample})],
        ),
        section(
            "sec_image",
            "image",
            8,
            [
                contentElement(
                    {
                        "base64Data": _MIN_PNG_B64,
                        "altText": "Smoke pixel",
                        "caption": "Minimal PNG (1×1)",
                    }
                )
            ],
        ),
        section(
            "sec_reference",
            "paragraph",
            9,
            [
                # Reference elements carry no "content" key.
                {
                    "type": "reference",
                    "label": "External spec",
                    "documentReference": "urn:smoke:ref",
                }
            ],
        ),
        section(
            "sec_extracted",
            "paragraph",
            10,
            [
                # Here "content" is a plain string rather than a dict.
                {
                    "type": "extracted_text",
                    "content": "Extracted **body** with formatting.",
                    "source": "fixture/source.md",
                }
            ],
        ),
    ]

    metadata = {
        "split_strategy": "single_document",
        "source_documents": [],
        "extraction_method": "smoke_test",
        "title": docTitle,
        "language": "de",
    }
    document = {
        "id": "doc_smoke",
        "title": docTitle,
        "filename": "pdf_renderer_smoke.pdf",
        "sections": sections,
    }
    return {"metadata": metadata, "documents": [document]}
@pytest.mark.asyncio
async def test_renderer_pdf_all_json_elements(tmp_path):
    """Render the full fixture document and sanity-check the single PDF produced."""
    if not REPORTLAB_AVAILABLE:
        pytest.skip("reportlab is not installed")

    pdfRenderer = RendererPdf(services=_fakeServices())
    rendered = await pdfRenderer.render(
        extractedContent=_fullDocumentJson(),
        title="PDF_Renderer_Smoke",
        userPrompt=None,
        aiService=None,
    )
    assert len(rendered) == 1

    pdfDoc = rendered[0]
    assert pdfDoc.mimeType == "application/pdf"
    # The payload must begin with the PDF magic bytes.
    assert pdfDoc.documentData[:4] == b"%PDF"
    assert pdfDoc.filename.endswith(".pdf")

    # Write the bytes into pytest's temp dir and check the on-disk size.
    target = tmp_path / "pdf_renderer_smoke.pdf"
    target.write_bytes(pdfDoc.documentData)
    assert target.stat().st_size > 500
def test_prepare_code_block_preserves_indentation_spaces():
    """A leading space before ``return`` survives preparation; tab characters do not."""
    # NOTE(review): the fixture shows single-space indentation although the test
    # name and the "two leading" text suggest wider indents — confirm the
    # whitespace in these literals was not collapsed.
    source = "def x():\n return 1\n two leading on line"
    prepared = _prepareCodeBlockPlainText(source)
    assert " return" in prepared

    tabbed = _prepareCodeBlockPlainText("a\tb")
    assert "\t" not in tabbed
def test_normalize_pdf_monospace_replaces_box_drawing():
    """U+2500 and U+2502 must be absent after normalization; plain text stays."""
    sample = "\u2500\u2502\u251c\u2514\u252c\nReports/\n"
    normalized = _normalizePdfMonospaceText(sample)

    # Only the horizontal and vertical line glyphs are asserted on.
    for glyph in ("\u2500", "\u2502"):
        assert glyph not in normalized
    assert "Reports/" in normalized
def test_pdf_heading_font_sizes_strictly_decrease():
    """Heading font sizes must shrink from level 1 to 3.

    Guards against H3 falling back to H1 styles (previous bug: ## smaller than ###).
    """
    renderer = RendererPdf(services=_fakeServices())
    styleSet = renderer._getDefaultStyleSet()

    # Raw style definitions: level 1 > level 2 > level 3.
    assert (
        styleSet["heading1"]["font_size"]
        > styleSet["heading2"]["font_size"]
        > styleSet["heading3"]["font_size"]
    )

    level2Default = renderer._defaultHeadingStyleDef(2)["font_size"]
    level3Default = renderer._defaultHeadingStyleDef(3)["font_size"]
    assert level2Default > level3Default

    if not REPORTLAB_AVAILABLE:
        return

    # Styles built from the complete style set.
    size1 = renderer._createHeadingStyle(styleSet, 1).fontSize
    size2 = renderer._createHeadingStyle(styleSet, 2).fontSize
    size3 = renderer._createHeadingStyle(styleSet, 3).fontSize
    assert size1 > size2 > size3

    # With no "heading3" entry supplied, level 3 must still come out smaller than level 2.
    withoutH3 = {"heading1": styleSet["heading1"], "heading2": styleSet["heading2"]}
    fallback3 = renderer._createHeadingStyle(withoutH3, 3).fontSize
    fallback2 = renderer._createHeadingStyle(withoutH3, 2).fontSize
    assert fallback3 < fallback2
def test_inline_code_angle_brackets_escaped_in_font_span():
    """Angle brackets inside inline code are emitted as entities.

    Paths like `.../<Slug>/` must not break ReportLab XML inside Courier.
    """
    markdownText = "unter `Eingabe/<Slug>/` speichern"
    renderer = RendererPdf(services=_fakeServices())
    markup = renderer._markdownInlineToReportlabXml(markdownText)

    # Inline code uses the Courier face, with "<" and ">" escaped.
    assert 'name="Courier"' in markup
    assert "&lt;Slug&gt;" in markup