gateway/modules/datamodels/datamodelUdm.py
2026-04-26 08:31:35 +02:00

316 lines
10 KiB
Python

# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
"""Unified Document Model (UDM) — hierarchical document tree and ContentPart bridge."""
from __future__ import annotations
import uuid
from typing import Any, Dict, List, Literal, Optional, Tuple, Union
from pydantic import BaseModel, Field
from modules.datamodels.datamodelExtraction import ContentExtracted, ContentPart
class UdmMetadata(BaseModel):
    # Descriptive metadata record used as the `metadata` field of the UDM
    # node models in this module. Every field has a default, so
    # `UdmMetadata()` is a valid empty record.

    # Optional descriptive fields; None when not supplied.
    title: Optional[str] = None
    author: Optional[str] = None

    # Timestamps are carried as plain strings; no format is enforced here.
    createdAt: Optional[str] = None
    modifiedAt: Optional[str] = None

    # Path of the originating source; empty string when not supplied.
    sourcePath: str = ""

    # Containers use default_factory so each instance gets its own object.
    tags: List[str] = Field(default_factory=list)
    custom: Dict[str, Any] = Field(default_factory=dict)
class UdmBoundingBox(BaseModel):
    # Rectangle described by an origin (x, y) and an extent (width, height).
    # All four numbers default to 0.0.

    # Origin of the rectangle.
    x: float = 0.0
    y: float = 0.0

    # Extent of the rectangle.
    width: float = 0.0
    height: float = 0.0

    # Measurement unit shared by all four numbers; one of "px", "pt" or "mm".
    unit: Literal["px", "pt", "mm"] = "pt"
class UdmPosition(BaseModel):
    # Location of a content block. Only `index` always carries a value; the
    # remaining coordinates stay None unless a producer fills them in.

    # Ordinal of the block; defaults to 0.
    index: int = 0

    # Page number; _contentPartToBlock stores `pageIndex + 1` here.
    page: Optional[int] = None

    # Optional grid coordinates.
    row: Optional[int] = None
    col: Optional[int] = None

    # Optional bounding rectangle.
    bbox: Optional[UdmBoundingBox] = None
class UdmContentBlock(BaseModel):
    # Leaf node of the UDM tree: one piece of content plus its descriptors.

    # Required identifier and content category.
    id: str
    contentType: Literal["text", "image", "table", "code", "media", "link", "formula"]

    # Inline payload; empty string when no payload is carried.
    raw: str = ""

    # Optional reference to an external payload.
    fileRef: Optional[str] = None

    # Optional descriptors of the payload.
    mimeType: Optional[str] = None
    language: Optional[str] = None

    # Free-form attributes; a fresh dict is created per instance.
    attributes: Dict[str, Any] = Field(default_factory=dict)

    # Defaults to a position with index 0 (UdmPosition's own default).
    position: UdmPosition = Field(default_factory=UdmPosition)

    # Defaults to an empty metadata record.
    metadata: UdmMetadata = Field(default_factory=UdmMetadata)
class UdmStructuralNode(BaseModel):
    # Mid-level node of the UDM tree: a page, section, slide or sheet that
    # owns a list of content blocks.

    # Required identity of the node.
    id: str
    role: Literal["page", "section", "slide", "sheet"]
    index: int

    # Optional human-readable label.
    label: Optional[str] = None

    # Defaults to an empty metadata record.
    metadata: UdmMetadata = Field(default_factory=UdmMetadata)

    # Content blocks owned by this node; a fresh list per instance.
    children: List[UdmContentBlock] = Field(default_factory=list)
class UdmDocument(BaseModel):
    # Root node for a single document; its children are structural nodes.

    # Required identifier.
    id: str

    # Fixed tag identifying this node kind.
    role: Literal["document"] = "document"

    # Kind of source the document came from; "unknown" when not specified.
    sourceType: Literal["pdf", "docx", "pptx", "xlsx", "html", "binary", "unknown"] = "unknown"

    # Path of the originating source; empty string when not supplied.
    sourcePath: str = ""

    # Defaults to an empty metadata record.
    metadata: UdmMetadata = Field(default_factory=UdmMetadata)

    # Structural nodes of the document; a fresh list per instance.
    children: List[UdmStructuralNode] = Field(default_factory=list)
class UdmArchive(BaseModel):
    # Root node for an archive; children may be nested archives or documents.

    # Required identifier.
    id: str

    # Fixed tag identifying this node kind.
    role: Literal["archive"] = "archive"

    # Archive format; "unknown" when not specified.
    sourceType: Literal["zip", "tar", "gz", "unknown"] = "unknown"

    # Path of the originating source; empty string when not supplied.
    sourcePath: str = ""

    # Defaults to an empty metadata record.
    metadata: UdmMetadata = Field(default_factory=UdmMetadata)

    # Self-referencing member list: nested archives and/or documents.
    children: List[Union[UdmArchive, UdmDocument]] = Field(default_factory=list)
def _newId() -> str:
    """Return a newly generated random (version 4) UUID in string form."""
    freshUuid = uuid.uuid4()
    return str(freshUuid)
def _mapTypeGroupToContentType(typeGroup: str) -> Literal["text", "image", "table", "code", "media", "link", "formula"]:
    """Translate a ContentPart type group into a UDM content type.

    "image", "table" and "code" map to themselves; "binary", "audiostream"
    and "videostream" map to "media"; every other value (including
    "structure", "text" and "container") maps to "text".
    """
    # Ordered (type groups, content type) table; the first match wins.
    lookupTable = (
        (("image",), "image"),
        (("table",), "table"),
        (("code",), "code"),
        (("binary", "audiostream", "videostream"), "media"),
    )
    for typeGroups, contentType in lookupTable:
        if typeGroup in typeGroups:
            return contentType
    # Anything not listed above is treated as plain text.
    return "text"
def _contentPartToBlock(part: ContentPart, blockIndex: int) -> UdmContentBlock:
    """Convert one flat ContentPart into a UdmContentBlock.

    Args:
        part: Source part. Its ``id``, ``typeGroup``, ``data``, ``mimeType``,
            ``label``, ``parentId`` and ``metadata`` are read.
        blockIndex: Ordinal of the block inside its structural node; stored
            as ``position.index``.

    Returns:
        A block whose ``position.page`` is ``pageIndex + 1`` when an integer
        page index is found (in the part metadata, else in its
        ``contextRef``), and whose ``attributes["slideIndex"]`` is a 0-based
        slide index when an integer slide reference is found.
    """
    meta = part.metadata or {}
    ctx = meta.get("contextRef") or {}
    if not isinstance(ctx, dict):
        ctx = {}

    # The part's own metadata wins; contextRef is only the fallback.
    page = meta.get("pageIndex")
    if page is None:
        page = ctx.get("pageIndex")

    # `slide_number` is 1-based while `contextRef.slideIndex` is 0-based
    # (_groupKeyForPart treats them the same way). Normalise to 0-based so
    # the "slideIndex" attribute means the same thing for both sources.
    slideNumber = meta.get("slide_number")
    if slideNumber is None:
        slideIndex = ctx.get("slideIndex")
    elif isinstance(slideNumber, int):
        slideIndex = max(0, slideNumber - 1)
    else:
        slideIndex = None

    pos = UdmPosition(
        index=blockIndex,
        page=int(page) + 1 if isinstance(page, int) else None,
    )

    extraAttr: Dict[str, Any] = {}
    if isinstance(slideIndex, int):
        extraAttr["slideIndex"] = slideIndex

    return UdmContentBlock(
        id=part.id,
        contentType=_mapTypeGroupToContentType(part.typeGroup),
        raw=part.data or "",
        mimeType=part.mimeType or None,
        attributes={
            # Original part fields are kept so the block can be flattened
            # back into a ContentPart later.
            "typeGroup": part.typeGroup,
            "label": part.label,
            "parentId": part.parentId,
            **({"contextRef": ctx} if ctx else {}),
            **extraAttr,
        },
        position=pos,
        metadata=UdmMetadata(
            sourcePath=meta.get("containerPath", "") or "",
            # contextRef already lives in `attributes`; do not duplicate it.
            custom={k: v for k, v in meta.items() if k not in ("contextRef",)},
        ),
    )
def _stableGroupIndex(text: str) -> int:
    """Map *text* to an integer in [0, 10**9) that is identical across runs."""
    import zlib  # local import keeps this block self-contained

    return zlib.crc32(text.encode("utf-8", "replace")) % (10**9)


def _groupKeyForPart(part: ContentPart) -> Tuple[str, int, str]:
    """Return (role, structural_index, label) for grouping parts into structural nodes.

    The first matching rule wins:

    1. ``pageIndex`` (part metadata, else ``contextRef``) -> ``"page"``.
    2. ``slide_number`` (1-based, part metadata) -> ``"slide"``.
    3. ``slideIndex`` (0-based, ``contextRef``) -> ``"slide"``.
    4. ``sheet`` / ``sheetName`` -> ``"sheet"``, index derived from the name.
    5. ``sectionId`` -> ``"section"``, index derived from the id.
    6. Otherwise ``("section", 0, "root")`` for containers and
       ``("section", 0, "body")`` for everything else.

    Name-derived indices use CRC32 rather than the built-in ``hash()``:
    string hashes are salted per process, which made the index (and the node
    order derived from it) differ between runs.
    """
    meta = part.metadata or {}
    ctx = meta.get("contextRef") or {}
    if not isinstance(ctx, dict):
        ctx = {}

    if "pageIndex" in meta or "pageIndex" in ctx:
        # An explicit None in the part metadata must not hide the contextRef
        # value (same precedence as _contentPartToBlock).
        pi = meta.get("pageIndex")
        if pi is None:
            pi = ctx.get("pageIndex")
        try:
            idx = int(pi)
        except (TypeError, ValueError):
            idx = 0
        return ("page", idx, f"page_{idx + 1}")

    if meta.get("slide_number") is not None:
        try:
            idx = int(meta["slide_number"]) - 1  # 1-based -> 0-based
        except (TypeError, ValueError):
            idx = 0
        return ("slide", max(0, idx), f"slide_{idx + 1}")

    if ctx.get("slideIndex") is not None:
        try:
            idx = int(ctx.get("slideIndex", 0))
        except (TypeError, ValueError):
            idx = 0
        return ("slide", max(0, idx), f"slide_{idx + 1}")

    if meta.get("sheet") or ctx.get("sheetName"):
        name = str(meta.get("sheet") or ctx.get("sheetName") or "sheet")
        return ("sheet", _stableGroupIndex(name), name)

    if ctx.get("sectionId") or meta.get("sectionId"):
        sid = str(ctx.get("sectionId") or meta.get("sectionId") or "section")
        return ("section", _stableGroupIndex(sid), sid)

    if part.typeGroup == "container":
        return ("section", 0, "root")
    return ("section", 0, "body")
# Source-type strings accepted for UdmDocument.sourceType; contentPartsToUdm
# replaces any other value with "unknown".
_VALID_DOC_SOURCES = frozenset(
    ("pdf", "docx", "pptx", "xlsx", "html", "binary", "unknown")
)
def contentPartsToUdm(extracted: ContentExtracted, sourceType: str, sourcePath: str) -> UdmDocument:
    """Convert flat ContentPart list into a UdmDocument using structural heuristics.

    Args:
        extracted: Extraction result; ``extracted.id`` becomes the document id
            (a new UUID is generated when it is falsy) and ``extracted.parts``
            supplies the content.
        sourceType: Desired source type; values outside ``_VALID_DOC_SOURCES``
            are replaced with ``"unknown"``.
        sourcePath: Stored on the document, its metadata and the metadata of
            every structural node.

    Returns:
        A document whose structural nodes follow the sorted grouping keys
        from ``_groupKeyForPart``; blocks keep their input order within each
        node. A document without children is returned when there is nothing
        to convert.
    """
    parts = list(extracted.parts or [])
    st: Literal["pdf", "docx", "pptx", "xlsx", "html", "binary", "unknown"] = (
        sourceType if sourceType in _VALID_DOC_SOURCES else "unknown"  # type: ignore[assignment]
    )
    doc = UdmDocument(
        id=extracted.id or _newId(),
        sourceType=st,
        sourcePath=sourcePath,
        metadata=UdmMetadata(sourcePath=sourcePath),
    )
    if not parts:
        return doc

    # Top-level containers are wrappers only and never become blocks.
    skipIds = {p.id for p in parts if p.typeGroup == "container" and p.parentId is None}

    contentParts = [p for p in parts if p.id not in skipIds and p.typeGroup != "container"]
    if not contentParts:
        # Only containers exist: fall back to the nested (non-root) ones so
        # the document is not left empty.
        contentParts = [p for p in parts if p.id not in skipIds]
    if not contentParts:
        return doc

    groups: Dict[Tuple[str, int, str], List[ContentPart]] = {}
    for p in contentParts:
        groups.setdefault(_groupKeyForPart(p), []).append(p)

    # Keys are (role, index, label) tuples, so plain tuple ordering applies.
    for gi, key in enumerate(sorted(groups)):
        role, structIdx, label = key
        node = UdmStructuralNode(
            id=_newId(),
            role=role if role in ("page", "section", "slide", "sheet") else "section",
            # Sections take their position in the sorted key list as index;
            # every other role keeps the index carried by the grouping key.
            index=gi if role == "section" else structIdx,
            label=label,
            metadata=UdmMetadata(sourcePath=sourcePath),
        )
        node.children.extend(
            _contentPartToBlock(part, bi) for bi, part in enumerate(groups[key])
        )
        doc.children.append(node)
    return doc
def _udmToContentParts(document: UdmDocument) -> ContentExtracted:
    """Flatten UdmDocument back to ContentExtracted for backward compatibility.

    The result starts with one synthetic root container part, followed by one
    part per content block in document order. Every block part is parented to
    that root and records its structural node's role and index in its
    metadata, unless those keys are already present.
    """
    rootId = _newId()
    rootPart = ContentPart(
        id=rootId,
        parentId=None,
        label=document.sourceType or "document",
        typeGroup="container",
        mimeType="application/octet-stream",
        data="",
        metadata={"udmRoot": True, "sourcePath": document.sourcePath},
    )
    flattened: List[ContentPart] = [rootPart]

    for node in document.children:
        for blk in node.children:
            # Start from a copy of the block's custom metadata so the UDM
            # tree itself is never mutated.
            partMeta = dict(blk.metadata.custom) if blk.metadata else {}
            if "structuralRole" not in partMeta:
                partMeta["structuralRole"] = node.role
            if "structuralIndex" not in partMeta:
                partMeta["structuralIndex"] = node.index

            attrs = blk.attributes
            flattened.append(
                ContentPart(
                    id=blk.id,
                    parentId=rootId,
                    label=attrs.get("label", node.label or ""),
                    typeGroup=str(attrs.get("typeGroup", "text")),
                    mimeType=blk.mimeType or "text/plain",
                    data=blk.raw,
                    metadata=partMeta,
                )
            )

    return ContentExtracted(id=document.id, parts=flattened)
def _stripUdmRaw(udm: UdmDocument) -> UdmDocument:
    """Return a deep copy with all content block `raw` cleared (structure-only preview).

    The document passed in is left untouched.
    """
    stripped = udm.model_copy(deep=True)
    everyBlock = (blk for node in stripped.children for blk in node.children)
    for blk in everyBlock:
        blk.raw = ""
    return stripped
def _stripUdmForReferences(udm: UdmDocument) -> UdmDocument:
    """Clear inline payloads; keep `fileRef` when already set in attributes/metadata.

    Works on a deep copy. For a block without a `fileRef`, a non-empty string
    found under the "fileRef" key of its attributes (preferred) or of its
    custom metadata is promoted to `fileRef`.
    """
    stripped = udm.model_copy(deep=True)
    for node in stripped.children:
        for blk in node.children:
            blk.raw = ""
            if blk.fileRef:
                # An existing reference is never overwritten.
                continue
            candidate = blk.attributes.get("fileRef")
            custom = blk.metadata.custom if blk.metadata else None
            if custom and not candidate:
                candidate = custom.get("fileRef")
            if candidate and isinstance(candidate, str):
                blk.fileRef = candidate
    return stripped
def applyUdmOutputDetail(udm: UdmDocument, detail: str) -> UdmDocument:
    """Reduce a document to the requested level of output detail.

    "structure" yields a copy with every block payload cleared, "references"
    yields a copy with payloads cleared and file references filled in where
    available; any other value returns the same `udm` object unchanged.
    """
    if detail not in ("structure", "references"):
        return udm
    reducer = _stripUdmRaw if detail == "structure" else _stripUdmForReferences
    return reducer(udm)
def mimeToUdmSourceType(mimeType: str, fileName: str) -> Literal["pdf", "docx", "pptx", "xlsx", "html", "binary", "unknown"]:
    """Derive the UDM source type from a MIME type and/or a file name.

    Args:
        mimeType: Media type, optionally with parameters (for example
            ``"text/html; charset=utf-8"``); ``None`` or empty is allowed.
        fileName: File name whose extension is used as a second signal;
            ``None`` or empty is allowed.

    Returns:
        The first matching source type, checked in the order pdf, docx, pptx,
        xlsx, html. ``"binary"`` is returned for ``application/octet-stream``
        or a missing MIME type, ``"unknown"`` for anything else.
    """
    # Media-type parameters and surrounding whitespace are not part of the
    # type itself; without stripping them "text/html; charset=utf-8" would
    # fail the exact comparisons below and come out as "unknown".
    m = (mimeType or "").split(";", 1)[0].strip().lower()
    fn = (fileName or "").lower()

    if m == "application/pdf" or fn.endswith(".pdf"):
        return "pdf"
    if "wordprocessingml" in m or fn.endswith(".docx"):
        return "docx"
    if "presentationml" in m or fn.endswith((".pptx", ".ppt")):
        return "pptx"
    if "spreadsheetml" in m or fn.endswith((".xlsx", ".xlsm")):
        return "xlsx"
    if m == "text/html" or fn.endswith((".html", ".htm")):
        return "html"
    if m == "application/octet-stream" or not m:
        return "binary"
    return "unknown"