# Copyright (c) 2025 Patrick Motsch # All rights reserved. """Unified Document Model (UDM) — hierarchical document tree and ContentPart bridge.""" from __future__ import annotations import uuid from typing import Any, Dict, List, Literal, Optional, Tuple, Union from pydantic import BaseModel, Field from modules.datamodels.datamodelExtraction import ContentExtracted, ContentPart class UdmMetadata(BaseModel): title: Optional[str] = None author: Optional[str] = None createdAt: Optional[float] = Field(default=None, json_schema_extra={"frontend_type": "timestamp"}) modifiedAt: Optional[float] = Field(default=None, json_schema_extra={"frontend_type": "timestamp"}) sourcePath: str = "" tags: List[str] = Field(default_factory=list) custom: Dict[str, Any] = Field(default_factory=dict) class UdmBoundingBox(BaseModel): x: float = 0.0 y: float = 0.0 width: float = 0.0 height: float = 0.0 unit: Literal["px", "pt", "mm"] = "pt" class UdmPosition(BaseModel): index: int = 0 page: Optional[int] = None row: Optional[int] = None col: Optional[int] = None bbox: Optional[UdmBoundingBox] = None class UdmContentBlock(BaseModel): id: str contentType: Literal["text", "image", "table", "code", "media", "link", "formula"] raw: str = "" fileRef: Optional[str] = None mimeType: Optional[str] = None language: Optional[str] = None attributes: Dict[str, Any] = Field(default_factory=dict) position: UdmPosition = Field(default_factory=lambda: UdmPosition(index=0)) metadata: UdmMetadata = Field(default_factory=UdmMetadata) class UdmStructuralNode(BaseModel): id: str role: Literal["page", "section", "slide", "sheet"] index: int label: Optional[str] = None metadata: UdmMetadata = Field(default_factory=UdmMetadata) children: List[UdmContentBlock] = Field(default_factory=list) class UdmDocument(BaseModel): id: str role: Literal["document"] = "document" sourceType: Literal["pdf", "docx", "pptx", "xlsx", "html", "binary", "unknown"] = "unknown" sourcePath: str = "" metadata: UdmMetadata = Field(default_factory=UdmMetadata) children: List[UdmStructuralNode] = Field(default_factory=list) class UdmArchive(BaseModel): id: str role: Literal["archive"] = "archive" sourceType: Literal["zip", "tar", "gz", "unknown"] = "unknown" sourcePath: str = "" metadata: UdmMetadata = Field(default_factory=UdmMetadata) children: List[Union[UdmArchive, UdmDocument]] = Field(default_factory=list) def _newId() -> str: return str(uuid.uuid4()) def _mapTypeGroupToContentType(typeGroup: str) -> Literal["text", "image", "table", "code", "media", "link", "formula"]: if typeGroup == "image": return "image" if typeGroup == "table": return "table" if typeGroup in ("code",): return "code" if typeGroup in ("binary", "audiostream", "videostream"): return "media" if typeGroup in ("structure", "text", "container"): return "text" return "text" def _contentPartToBlock(part: ContentPart, blockIndex: int) -> UdmContentBlock: meta = part.metadata or {} ctx = meta.get("contextRef") or {} if not isinstance(ctx, dict): ctx = {} page = meta.get("pageIndex") if page is None: page = ctx.get("pageIndex") slide = meta.get("slide_number") if slide is None: slide = ctx.get("slideIndex") pos = UdmPosition( index=blockIndex, page=int(page) + 1 if isinstance(page, int) else None, ) extraAttr: Dict[str, Any] = {} if isinstance(slide, int): extraAttr["slideIndex"] = slide return UdmContentBlock( id=part.id, contentType=_mapTypeGroupToContentType(part.typeGroup), raw=part.data or "", mimeType=part.mimeType or None, attributes={ "typeGroup": part.typeGroup, "label": part.label, "parentId": part.parentId, **({"contextRef": ctx} if ctx else {}), **extraAttr, }, position=pos, metadata=UdmMetadata( sourcePath=meta.get("containerPath", "") or "", custom={k: v for k, v in meta.items() if k not in ("contextRef",)}, ), ) def _groupKeyForPart(part: ContentPart) -> Tuple[str, int, str]: """Return (role, structural_index, label) for grouping parts into structural nodes.""" meta = part.metadata or {} ctx = meta.get("contextRef") or {} if not isinstance(ctx, dict): ctx = {} if "pageIndex" in meta or "pageIndex" in ctx: pi = meta.get("pageIndex", ctx.get("pageIndex", 0)) try: idx = int(pi) except (TypeError, ValueError): idx = 0 return ("page", idx, f"page_{idx + 1}") if meta.get("slide_number") is not None: try: idx = int(meta["slide_number"]) - 1 except (TypeError, ValueError): idx = 0 return ("slide", max(0, idx), f"slide_{idx + 1}") if ctx.get("slideIndex") is not None: try: idx = int(ctx.get("slideIndex", 0)) except (TypeError, ValueError): idx = 0 return ("slide", max(0, idx), f"slide_{idx + 1}") if meta.get("sheet") or ctx.get("sheetName"): name = str(meta.get("sheet") or ctx.get("sheetName") or "sheet") return ("sheet", abs(hash(name)) % (10**9), name) if ctx.get("sectionId") or meta.get("sectionId"): sid = str(ctx.get("sectionId") or meta.get("sectionId") or "section") return ("section", abs(hash(sid)) % (10**9), sid) if part.typeGroup == "container": return ("section", 0, "root") return ("section", 0, "body") _VALID_DOC_SOURCES = frozenset({"pdf", "docx", "pptx", "xlsx", "html", "binary", "unknown"}) def contentPartsToUdm(extracted: ContentExtracted, sourceType: str, sourcePath: str) -> UdmDocument: """Convert flat ContentPart list into a UdmDocument using structural heuristics.""" parts = list(extracted.parts or []) st: Literal["pdf", "docx", "pptx", "xlsx", "html", "binary", "unknown"] = ( sourceType if sourceType in _VALID_DOC_SOURCES else "unknown" # type: ignore[assignment] ) doc = UdmDocument( id=extracted.id or _newId(), sourceType=st, sourcePath=sourcePath, metadata=UdmMetadata(sourcePath=sourcePath), ) if not parts: return doc skipIds = set() rootIds = set() for p in parts: if p.typeGroup == "container" and p.parentId is None: rootIds.add(p.id) skipIds.add(p.id) contentParts = [p for p in parts if p.id not in skipIds and p.typeGroup != "container"] if not contentParts: for p in parts: if p.id not in skipIds: contentParts.append(p) if not contentParts: return doc groups: Dict[Tuple[str, int, str], List[ContentPart]] = {} for p in contentParts: key = _groupKeyForPart(p) groups.setdefault(key, []).append(p) sortedKeys = sorted(groups.keys(), key=lambda k: (k[0], k[1], k[2])) for gi, key in enumerate(sortedKeys): role, structIdx, label = key plist = groups[key] node = UdmStructuralNode( id=_newId(), role=role if role in ("page", "section", "slide", "sheet") else "section", index=gi if role == "section" else structIdx, label=label, metadata=UdmMetadata(sourcePath=sourcePath), ) for bi, part in enumerate(plist): node.children.append(_contentPartToBlock(part, bi)) doc.children.append(node) return doc def _udmToContentParts(document: UdmDocument) -> ContentExtracted: """Flatten UdmDocument back to ContentExtracted for backward compatibility.""" rootId = _newId() parts: List[ContentPart] = [ ContentPart( id=rootId, parentId=None, label=document.sourceType or "document", typeGroup="container", mimeType="application/octet-stream", data="", metadata={"udmRoot": True, "sourcePath": document.sourcePath}, ) ] for sn in document.children: for block in sn.children: meta = dict(block.metadata.custom) if block.metadata else {} meta.setdefault("structuralRole", sn.role) meta.setdefault("structuralIndex", sn.index) parts.append( ContentPart( id=block.id, parentId=rootId, label=block.attributes.get("label", sn.label or ""), typeGroup=str(block.attributes.get("typeGroup", "text")), mimeType=block.mimeType or "text/plain", data=block.raw, metadata=meta, ) ) return ContentExtracted(id=document.id, parts=parts) def _stripUdmRaw(udm: UdmDocument) -> UdmDocument: """Return a deep copy with all content block `raw` cleared (structure-only preview).""" clone = udm.model_copy(deep=True) for sn in clone.children: for block in sn.children: block.raw = "" return clone def _stripUdmForReferences(udm: UdmDocument) -> UdmDocument: """Clear inline payloads; keep `fileRef` when already set in attributes/metadata.""" clone = udm.model_copy(deep=True) for sn in clone.children: for block in sn.children: block.raw = "" if not block.fileRef: ref = block.attributes.get("fileRef") if block.metadata and block.metadata.custom: ref = ref or block.metadata.custom.get("fileRef") if isinstance(ref, str) and ref: block.fileRef = ref return clone def applyUdmOutputDetail(udm: UdmDocument, detail: str) -> UdmDocument: if detail == "structure": return _stripUdmRaw(udm) if detail == "references": return _stripUdmForReferences(udm) return udm def mimeToUdmSourceType(mimeType: str, fileName: str) -> Literal["pdf", "docx", "pptx", "xlsx", "html", "binary", "unknown"]: m = (mimeType or "").lower() fn = (fileName or "").lower() if m == "application/pdf" or fn.endswith(".pdf"): return "pdf" if "wordprocessingml" in m or fn.endswith(".docx"): return "docx" if "presentationml" in m or fn.endswith((".pptx", ".ppt")): return "pptx" if "spreadsheetml" in m or fn.endswith((".xlsx", ".xlsm")): return "xlsx" if m == "text/html" or fn.endswith((".html", ".htm")): return "html" if m == "application/octet-stream" or not m: return "binary" return "unknown"