# gateway/tests/unit/services/test_renderer_pdf_smoke.py

# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
"""
Smoke test: RendererPdf with every JSON section/element shape the pipeline supports.

Canonical section types (datamodelJson.supportedSectionTypes): table, bullet_list, heading,
paragraph, code_block, image.

PDF renderer additionally handles element types: reference, extracted_text (Phase 5D).
"""

from __future__ import annotations

from types import SimpleNamespace

import pytest

from modules.serviceCenter.services.serviceGeneration.renderers.rendererPdf import (
    REPORTLAB_AVAILABLE,
    RendererPdf,
    _normalizePdfMonospaceText,
    _prepareCodeBlockPlainText,
)

# Base64-encoded 1×1 transparent PNG; used as the payload of the image section
# in the smoke-test document so the fixture needs no file on disk.
_MIN_PNG_B64 = "iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR42mP8z8BQDwAEhQGAhKmMIQAAAABJRU5ErkJggg=="


def _fakeServices():
    """Return a minimal stand-in for the services object given to RendererPdf.

    Only ``services.utils.debugLogToFile`` is provided; it accepts a message
    and an optional tag and does nothing, so the renderer's debug logging
    calls succeed without a real service container.
    """

    def _discardLog(msg, tag=None):
        # Intentionally empty: log output is irrelevant for these tests.
        return None

    utilsStub = SimpleNamespace(debugLogToFile=_discardLog)
    return SimpleNamespace(utils=utilsStub)


def _fullDocumentJson() -> dict:
    """Build the smoke-test payload: one document exercising every section shape.

    The document contains ten sections, ordered 1-10: two headings, a
    paragraph, a bullet list, a numbered list, a table, a code block, an
    image, and two paragraph sections whose single element is of type
    ``reference`` and ``extracted_text`` respectively.
    """
    documentTitle = "PDF Renderer Smoke"

    def _section(sectionId: str, contentType: str, order: int, element: dict) -> dict:
        # Every section in this fixture carries exactly one element.
        return {
            "id": sectionId,
            "content_type": contentType,
            "order": order,
            "elements": [element],
        }

    paragraphText = (
        "Paragraph: **strong**, *emphasis*, __under-like bold__, "
        "_single underscores_, and `var = 1`."
    )
    # Mixes XML-special characters with Unicode box-drawing glyphs.
    codeSample = (
        'def hello():\n    print("<tag> & ampersand")\n    return 42\n'
        "\n# tree (Unicode box drawing must not produce tofu in PDF)\n"
        "Reports/\n\u251c\u2500\u2500 2025/\n\u2502   \u2514\u2500\u2500 file.txt\n"
    )

    sections = [
        _section(
            "sec_h1",
            "heading",
            1,
            {
                "content": {
                    "text": "H1 with **bold** and a very long subtitle line that should wrap cleanly without overlapping",
                    "level": 1,
                }
            },
        ),
        _section(
            "sec_h2",
            "heading",
            2,
            {"content": {"text": "H2 *italic* and `inline code`", "level": 2}},
        ),
        _section("sec_para", "paragraph", 3, {"content": {"text": paragraphText}}),
        _section(
            "sec_bullets",
            "bullet_list",
            4,
            {
                "content": {
                    # Items may be plain strings or {"text": ...} objects.
                    "items": [
                        "Bullet **one**",
                        {"text": "Bullet two with *italic*"},
                    ],
                    "list_type": "bullet",
                }
            },
        ),
        _section(
            "sec_numbered",
            "bullet_list",
            5,
            {
                "content": {
                    "items": [{"text": "First numbered"}, {"text": "Second **numbered**"}],
                    "list_type": "numbered",
                }
            },
        ),
        _section(
            "sec_table",
            "table",
            6,
            {
                "content": {
                    "headers": ["Col A", "Col B", "Col C"],
                    "rows": [
                        ["Short", "Medium length cell", "**Bold** in cell"],
                        ["R2", "Data", "`code`"],
                    ],
                }
            },
        ),
        _section(
            "sec_code",
            "code_block",
            7,
            {"content": {"language": "python", "code": codeSample}},
        ),
        _section(
            "sec_image",
            "image",
            8,
            {
                "content": {
                    "base64Data": _MIN_PNG_B64,
                    "altText": "Smoke pixel",
                    "caption": "Minimal PNG (1×1)",
                }
            },
        ),
        _section(
            "sec_reference",
            "paragraph",
            9,
            {
                "type": "reference",
                "label": "External spec",
                "documentReference": "urn:smoke:ref",
            },
        ),
        _section(
            "sec_extracted",
            "paragraph",
            10,
            {
                "type": "extracted_text",
                "content": "Extracted **body** with formatting.",
                "source": "fixture/source.md",
            },
        ),
    ]

    metadata = {
        "split_strategy": "single_document",
        "source_documents": [],
        "extraction_method": "smoke_test",
        "title": documentTitle,
        "language": "de",
    }
    document = {
        "id": "doc_smoke",
        "title": documentTitle,
        "filename": "pdf_renderer_smoke.pdf",
        "sections": sections,
    }
    return {"metadata": metadata, "documents": [document]}


@pytest.mark.asyncio
async def test_renderer_pdf_all_json_elements(tmp_path):
    """Render the full fixture document and sanity-check the resulting PDF.

    Skipped when reportlab is not installed. Expects exactly one output
    document with PDF mime type, a ``%PDF`` header, a ``.pdf`` filename, and
    more than 500 bytes once written to ``tmp_path``.
    """
    if not REPORTLAB_AVAILABLE:
        pytest.skip("reportlab is not installed")

    pdfRenderer = RendererPdf(services=_fakeServices())
    renderedDocs = await pdfRenderer.render(
        extractedContent=_fullDocumentJson(),
        title="PDF_Renderer_Smoke",
        userPrompt=None,
        aiService=None,
    )

    assert len(renderedDocs) == 1
    pdfDoc = renderedDocs[0]
    assert pdfDoc.mimeType == "application/pdf"
    pdfBytes = pdfDoc.documentData
    assert pdfBytes[:4] == b"%PDF"
    assert pdfDoc.filename.endswith(".pdf")

    # Persist the bytes so the size check runs against a real file on disk.
    writtenFile = tmp_path / "pdf_renderer_smoke.pdf"
    writtenFile.write_bytes(pdfBytes)
    assert writtenFile.stat().st_size > 500


def test_prepare_code_block_preserves_indentation_spaces():
    """Four-space indentation must survive preparation; tab characters must not."""
    indentedSource = "def x():\n    return 1\n  two leading on line"
    preparedIndented = _prepareCodeBlockPlainText(indentedSource)
    assert "    return" in preparedIndented

    preparedTabbed = _prepareCodeBlockPlainText("a\tb")
    assert "\t" not in preparedTabbed


def test_normalize_pdf_monospace_replaces_box_drawing():
    """Horizontal/vertical box-drawing glyphs are removed; ordinary text is kept."""
    boxDrawingSample = "\u2500\u2502\u251c\u2514\u252c\nReports/\n"
    normalized = _normalizePdfMonospaceText(boxDrawingSample)
    for glyph in ("\u2500", "\u2502"):
        assert glyph not in normalized
    assert "Reports/" in normalized


def test_pdf_heading_font_sizes_strictly_decrease():
    """Heading font sizes must shrink from level 1 to level 3.

    Regression guard: H3 must not fall back to H1 styles (previous bug:
    ## rendered smaller than ###). Checked on the default style set, on the
    per-level default definitions, and - when reportlab is available - on the
    created heading styles, including a style set that has no ``heading3``.
    """
    renderer = RendererPdf(services=_fakeServices())
    styleSet = renderer._getDefaultStyleSet()

    h1Size = styleSet["heading1"]["font_size"]
    h2Size = styleSet["heading2"]["font_size"]
    h3Size = styleSet["heading3"]["font_size"]
    assert h1Size > h2Size > h3Size

    defaultH2 = renderer._defaultHeadingStyleDef(2)["font_size"]
    defaultH3 = renderer._defaultHeadingStyleDef(3)["font_size"]
    assert defaultH2 > defaultH3

    if not REPORTLAB_AVAILABLE:
        return

    createdH1 = renderer._createHeadingStyle(styleSet, 1).fontSize
    createdH2 = renderer._createHeadingStyle(styleSet, 2).fontSize
    createdH3 = renderer._createHeadingStyle(styleSet, 3).fontSize
    assert createdH1 > createdH2 > createdH3

    # Style set without an explicit heading3 entry.
    withoutH3 = {key: styleSet[key] for key in ("heading1", "heading2")}
    fallbackH3 = renderer._createHeadingStyle(withoutH3, 3).fontSize
    fallbackH2 = renderer._createHeadingStyle(withoutH3, 2).fontSize
    assert fallbackH3 < fallbackH2


def test_inline_code_angle_brackets_escaped_in_font_span():
    """Inline code such as `.../<Slug>/` must come out in Courier with ``<``/``>`` escaped."""
    markdownText = "unter `Eingabe/<Slug>/` speichern"
    rendered = RendererPdf(services=_fakeServices())._markdownInlineToReportlabXml(markdownText)
    for expectedFragment in ('name="Courier"', "&lt;Slug&gt;"):
        assert expectedFragment in rendered


def test_emoji_codepoints_wrapped_in_emoji_font_span():
    """Emoji must be wrapped in a ``NotoEmoji`` font span when that font is usable.

    Skipped without reportlab. If the emoji font fallback reports success, the
    XML must reference ``name="NotoEmoji"`` and still contain the emoji; if
    not, the emoji must at least pass through so the text stays readable.
    Bold markup around an emoji must still yield ``<b>...</b>``.
    """
    if not REPORTLAB_AVAILABLE:
        pytest.skip("reportlab is not installed")

    grinningFace = "\U0001F600"
    partyPopper = "\U0001F389"
    checkMark = "\U00002705"

    renderer = RendererPdf(services=_fakeServices())
    plainXml = renderer._markdownInlineToReportlabXml(f"Status: {grinningFace} done {partyPopper}")

    # Two acceptable outcomes: font registered (emoji wrapped) or font
    # missing (emoji passed through unchanged).
    from modules.serviceCenter.services.serviceGeneration.renderers._pdfFontFallback import (
        _initialize as _initEmojiFont,
    )

    if _initEmojiFont():
        assert 'name="NotoEmoji"' in plainXml
        assert grinningFace in plainXml
        assert partyPopper in plainXml
    else:
        assert grinningFace in plainXml

    # Bold + emoji: the bold tags must be present, plus the emoji font span
    # when the font is available.
    boldXml = renderer._markdownInlineToReportlabXml(f"**OK {checkMark}**")
    assert "<b>" in boldXml
    assert "</b>" in boldXml
    if _initEmojiFont():
        assert 'name="NotoEmoji"' in boldXml