# gateway/tests/unit/datamodels/test_udm_bridge.py
# 2026-04-16 23:13:05 +02:00
#
# 69 lines
# 2 KiB
# Python

# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
from modules.datamodels.datamodelExtraction import ContentExtracted, ContentPart
from modules.datamodels.datamodelUdm import _contentPartsToUdm, _udmToContentParts
def test_bridge_pdf_like_pages():
    """Bridge a PDF-like part tree (one container, two text pages) into a UDM.

    The resulting UDM must report the "pdf" source type and expose exactly two
    children, each with role "page", whose first child carries the raw text of
    the corresponding input part.
    """
    rootId = "root-id"
    container = ContentPart(
        id=rootId,
        parentId=None,
        label="pdf",
        typeGroup="container",
        mimeType="application/pdf",
        data="",
        metadata={},
    )
    # (id, label, text, pageIndex) for every text part hanging off the container.
    pageSpecs = (
        ("t1", "page_1", "A", 0),
        ("t2", "page_2", "B", 1),
    )
    pageParts = [
        ContentPart(
            id=partId,
            parentId=rootId,
            label=label,
            typeGroup="text",
            mimeType="text/plain",
            data=text,
            metadata={"pageIndex": pageIndex},
        )
        for partId, label, text, pageIndex in pageSpecs
    ]
    extracted = ContentExtracted(id="ext1", parts=[container, *pageParts])

    udm = _contentPartsToUdm(extracted, "pdf", "a.pdf")

    assert udm.sourceType == "pdf"
    assert len(udm.children) == 2
    for node in udm.children:
        assert node.role == "page"
    # Safe to unpack: the length was pinned to 2 above.
    firstPage, secondPage = udm.children
    assert firstPage.children[0].raw == "A"
    assert secondPage.children[0].raw == "B"
def test_udm_to_parts_roundtrip_preserves_ids():
    """Convert a single text part to a UDM and back, checking the text survives.

    NOTE(review): despite the test name, no assertion here inspects part ids;
    only the number of returned parts and the text payload are verified.
    """
    onlyPart = ContentPart(
        id="p1",
        parentId=None,
        label="x",
        typeGroup="text",
        mimeType="text/plain",
        data="hi",
        metadata={"pageIndex": 0},
    )
    source = ContentExtracted(id="e1", parts=[onlyPart])
    udm = _contentPartsToUdm(source, "unknown", "f.txt")

    restored = _udmToContentParts(udm)

    # One part went in, at least two are expected back
    # (presumably structural parts are added on the way back; confirm against the bridge).
    assert len(restored.parts) >= 2
    recoveredTexts = [
        part.data for part in restored.parts if part.typeGroup == "text"
    ]
    assert any(data == "hi" for data in recoveredTexts)