# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
"""Tests for converting extracted content parts to a UDM document and back."""

from modules.datamodels.datamodelExtraction import ContentExtracted, ContentPart
from modules.datamodels.datamodelUdm import contentPartsToUdm, _udmToContentParts


def test_bridge_pdf_like_pages():
    """Convert a container plus two page-indexed text parts and check the pages.

    The resulting document must report ``sourceType == "pdf"`` and expose
    exactly two children with role ``"page"``, whose first child carries the
    raw text of the corresponding input part.
    """
    rootId = "root-id"
    container = ContentPart(
        id=rootId,
        parentId=None,
        label="pdf",
        typeGroup="container",
        mimeType="application/pdf",
        data="",
        metadata={},
    )

    # One row per text part: (id, label, data, pageIndex).
    pageRows = (
        ("t1", "page_1", "A", 0),
        ("t2", "page_2", "B", 1),
    )
    pageParts = [
        ContentPart(
            id=partId,
            parentId=rootId,
            label=label,
            typeGroup="text",
            mimeType="text/plain",
            data=text,
            metadata={"pageIndex": pageIndex},
        )
        for partId, label, text, pageIndex in pageRows
    ]

    extracted = ContentExtracted(id="ext1", parts=[container, *pageParts])
    udm = contentPartsToUdm(extracted, "pdf", "a.pdf")

    assert udm.sourceType == "pdf"
    assert len(udm.children) == 2
    for pageNode in udm.children:
        assert pageNode.role == "page"
    # Length was asserted above, so zip pairs each page with its expected text.
    for pageNode, expectedRaw in zip(udm.children, ("A", "B")):
        assert pageNode.children[0].raw == expectedRaw


def test_udm_to_parts_roundtrip_preserves_ids():
    """Round-trip a single text part through the UDM and back to parts.

    Checks that the converted-back result holds at least two parts and that
    one of its text parts still carries the original data ``"hi"``.

    NOTE(review): despite the test name, no part ids are asserted here --
    confirm whether an id check was intended.
    """
    sourcePart = ContentPart(
        id="p1",
        parentId=None,
        label="x",
        typeGroup="text",
        mimeType="text/plain",
        data="hi",
        metadata={"pageIndex": 0},
    )
    extracted = ContentExtracted(id="e1", parts=[sourcePart])
    udm = contentPartsToUdm(extracted, "unknown", "f.txt")

    restored = _udmToContentParts(udm)

    assert len(restored.parts) >= 2
    textParts = [part for part in restored.parts if part.typeGroup == "text"]
    assert any(part.data == "hi" for part in textParts)