69 lines
2 KiB
Python
69 lines
2 KiB
Python
# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
|
|
from modules.datamodels.datamodelExtraction import ContentExtracted, ContentPart
|
|
from modules.datamodels.datamodelUdm import contentPartsToUdm, _udmToContentParts
|
|
|
|
|
|
def test_bridge_pdf_like_pages():
    """Convert a PDF-like part tree into a UDM and check its page nodes.

    Builds one container part plus two text parts (pageIndex 0 and 1) that
    reference the container as parent, runs them through
    ``contentPartsToUdm`` and asserts that the result has source type
    "pdf", exactly two children with role "page", and that the first child
    of each page carries the original text in ``raw``.
    """
    containerId = "root-id"
    container = ContentPart(
        id=containerId,
        parentId=None,
        label="pdf",
        typeGroup="container",
        mimeType="application/pdf",
        data="",
        metadata={},
    )

    # (id, label, text, pageIndex) for each text part hanging off the container.
    pageSpecs = (
        ("t1", "page_1", "A", 0),
        ("t2", "page_2", "B", 1),
    )
    pageParts = [
        ContentPart(
            id=partId,
            parentId=containerId,
            label=partLabel,
            typeGroup="text",
            mimeType="text/plain",
            data=text,
            metadata={"pageIndex": pageIndex},
        )
        for partId, partLabel, text, pageIndex in pageSpecs
    ]

    source = ContentExtracted(id="ext1", parts=[container, *pageParts])
    document = contentPartsToUdm(source, "pdf", "a.pdf")

    assert document.sourceType == "pdf"

    pageNodes = document.children
    assert len(pageNodes) == 2
    assert all(node.role == "page" for node in pageNodes)

    firstPage = pageNodes[0]
    secondPage = pageNodes[1]
    assert firstPage.children[0].raw == "A"
    assert secondPage.children[0].raw == "B"
|
|
|
|
|
|
def test_udm_to_parts_roundtrip_preserves_ids():
    """Round-trip a single text part through the UDM and back to parts.

    Converts one text part with ``contentPartsToUdm`` and feeds the result
    to ``_udmToContentParts``. Asserts that at least two parts come back and
    that some part of type group "text" still carries the payload "hi".

    NOTE(review): despite the test name, no part ids are asserted here --
    confirm whether an id check was intended.
    """
    onlyPart = ContentPart(
        id="p1",
        parentId=None,
        label="x",
        typeGroup="text",
        mimeType="text/plain",
        data="hi",
        metadata={"pageIndex": 0},
    )
    source = ContentExtracted(id="e1", parts=[onlyPart])

    document = contentPartsToUdm(source, "unknown", "f.txt")
    roundTripped = _udmToContentParts(document)

    assert len(roundTripped.parts) >= 2

    # Only text parts are inspected for the payload; other parts are skipped.
    assert any(
        part.data == "hi"
        for part in roundTripped.parts
        if part.typeGroup == "text"
    )
|