# gateway/modules/datamodels/datamodelUdm.py

# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
"""Unified Document Model (UDM) — hierarchical document tree and ContentPart bridge."""
from __future__ import annotations

import uuid
from typing import Any, Dict, List, Literal, Optional, Tuple, Union

from pydantic import BaseModel, Field

from modules.datamodels.datamodelExtraction import ContentExtracted, ContentPart


class UdmMetadata(BaseModel):
    """Descriptive metadata shared by UDM archives, documents, structural nodes and content blocks.

    Every field has a default, so ``UdmMetadata()`` is a valid empty instance.
    """

    # Optional human-readable title and author of the described item.
    title: Optional[str] = None
    author: Optional[str] = None
    # Creation / modification timestamps kept as plain strings; no format is enforced here.
    createdAt: Optional[str] = None
    modifiedAt: Optional[str] = None
    # Location the item was read from; defaults to an empty string.
    sourcePath: str = ""
    # Free-form labels.
    tags: List[str] = Field(default_factory=list)
    # Arbitrary extra key/value data. `_contentPartToBlock` stores the source
    # part's metadata here (everything except the "contextRef" entry).
    custom: Dict[str, Any] = Field(default_factory=dict)


class UdmBoundingBox(BaseModel):
    """Rectangle described by an anchor point (x, y) and a size (width, height)."""

    # Anchor coordinates. NOTE(review): which corner they denote is not defined in this file — confirm.
    x: float = 0.0
    y: float = 0.0
    # Extent of the rectangle along each axis.
    width: float = 0.0
    height: float = 0.0
    # Measurement unit for all four values; points ("pt") unless stated otherwise.
    unit: Literal["px", "pt", "mm"] = "pt"


class UdmPosition(BaseModel):
    """Location of a content block inside its document."""

    # Ordinal of the block; `contentPartsToUdm` numbers blocks from 0 within each structural node.
    index: int = 0
    # Page number. `_contentPartToBlock` stores the part's integer pageIndex + 1 here.
    page: Optional[int] = None
    # Optional grid coordinates; not populated anywhere in this module.
    row: Optional[int] = None
    col: Optional[int] = None
    # Optional geometric extent; not populated anywhere in this module.
    bbox: Optional[UdmBoundingBox] = None


class UdmContentBlock(BaseModel):
    """Leaf of the UDM tree: one piece of content (text, image, table, ...)."""

    # Identifier of the block; the ContentPart bridge reuses the part's id.
    id: str
    # Kind of content carried by this block.
    contentType: Literal["text", "image", "table", "code", "media", "link", "formula"]
    # Inline payload as a string; cleared by the "structure" / "references" output details.
    raw: str = ""
    # Reference to an externally stored payload. `_stripUdmForReferences` fills it from a
    # "fileRef" entry in `attributes` or `metadata.custom` when it is not already set.
    fileRef: Optional[str] = None
    mimeType: Optional[str] = None
    # NOTE(review): not populated in this module; presumably the code or natural language — confirm.
    language: Optional[str] = None
    # Free-form attributes. The ContentPart bridge stores "typeGroup", "label", "parentId"
    # and, when available, "contextRef" / "slideIndex" here.
    attributes: Dict[str, Any] = Field(default_factory=dict)
    # Defaults to a position with index 0 and no page/row/col/bbox.
    position: UdmPosition = Field(default_factory=lambda: UdmPosition(index=0))
    metadata: UdmMetadata = Field(default_factory=UdmMetadata)


class UdmStructuralNode(BaseModel):
    """Intermediate level of the UDM tree (page, section, slide or sheet) holding content blocks."""

    id: str
    # Kind of structural unit this node represents.
    role: Literal["page", "section", "slide", "sheet"]
    # Structural index of the node; how it is derived depends on the role (see `contentPartsToUdm`).
    index: int
    # Optional display label, e.g. "page_1" or a sheet name as produced by `_groupKeyForPart`.
    label: Optional[str] = None
    metadata: UdmMetadata = Field(default_factory=UdmMetadata)
    # Content blocks that belong to this node.
    children: List[UdmContentBlock] = Field(default_factory=list)


class UdmDocument(BaseModel):
    """Root of a single document: metadata plus a list of structural nodes."""

    id: str
    # Constant discriminator; always "document".
    role: Literal["document"] = "document"
    # Source format of the document; "unknown" when it was not recognised.
    sourceType: Literal["pdf", "docx", "pptx", "xlsx", "html", "binary", "unknown"] = "unknown"
    sourcePath: str = ""
    metadata: UdmMetadata = Field(default_factory=UdmMetadata)
    # Structural nodes (pages, sections, slides, sheets) of the document.
    children: List[UdmStructuralNode] = Field(default_factory=list)


class UdmArchive(BaseModel):
    """Container node for archive files; may hold documents and nested archives."""

    id: str
    # Constant discriminator; always "archive".
    role: Literal["archive"] = "archive"
    # Archive format; "unknown" when it was not recognised.
    sourceType: Literal["zip", "tar", "gz", "unknown"] = "unknown"
    sourcePath: str = ""
    metadata: UdmMetadata = Field(default_factory=UdmMetadata)
    # Recursive: each child is either another archive or a document.
    children: List[Union[UdmArchive, UdmDocument]] = Field(default_factory=list)


def _newId() -> str:
    """Return a freshly generated random UUID (version 4) in its canonical string form."""
    freshUuid = uuid.uuid4()
    return str(freshUuid)


def _mapTypeGroupToContentType(typeGroup: str) -> Literal["text", "image", "table", "code", "media", "link", "formula"]:
    """Translate an extraction ``typeGroup`` into a UDM ``contentType``.

    "image", "table" and "code" map to themselves; "binary", "audiostream" and
    "videostream" collapse to "media". Every other value — including
    "structure", "text", "container" and anything unrecognised — becomes "text".
    """
    nonTextGroups: Dict[str, Literal["image", "table", "code", "media"]] = {
        "image": "image",
        "table": "table",
        "code": "code",
        "binary": "media",
        "audiostream": "media",
        "videostream": "media",
    }
    return nonTextGroups.get(typeGroup, "text")


def _contentPartToBlock(part: ContentPart, blockIndex: int) -> UdmContentBlock:
    """Convert one flat ``ContentPart`` into a ``UdmContentBlock``.

    Args:
        part: Source part. Its ``metadata`` may carry ``pageIndex`` /
            ``slide_number`` directly or ``pageIndex`` / ``slideIndex`` inside
            a ``contextRef`` dict; a direct, non-``None`` value takes precedence.
        blockIndex: Ordinal of the block inside its structural node; stored as
            ``position.index``.

    Returns:
        A block that reuses the part's id, with:

        * ``position.page`` set to ``pageIndex + 1`` when the page index is an
          ``int`` (otherwise ``None``);
        * ``attributes`` holding ``typeGroup``, ``label``, ``parentId`` and, when
          available, ``contextRef`` and a 0-based ``slideIndex``;
        * ``metadata.custom`` holding the part's metadata without ``contextRef``.
    """
    meta = part.metadata or {}
    ctx = meta.get("contextRef") or {}
    if not isinstance(ctx, dict):
        ctx = {}

    page = meta.get("pageIndex")
    if page is None:
        page = ctx.get("pageIndex")

    # `slide_number` is 1-based while `contextRef.slideIndex` is 0-based (that is
    # how `_groupKeyForPart` reads them). Normalise to 0-based so the attribute
    # always agrees with the index of the slide node the block is grouped under,
    # instead of silently mixing both conventions under one key.
    slideIndex: Optional[int] = None
    slideNumber = meta.get("slide_number")
    if slideNumber is not None:
        if isinstance(slideNumber, int):
            slideIndex = max(0, slideNumber - 1)
    else:
        ctxSlide = ctx.get("slideIndex")
        if isinstance(ctxSlide, int):
            slideIndex = ctxSlide

    pos = UdmPosition(
        index=blockIndex,
        page=int(page) + 1 if isinstance(page, int) else None,
    )
    extraAttr: Dict[str, Any] = {}
    if slideIndex is not None:
        extraAttr["slideIndex"] = slideIndex
    return UdmContentBlock(
        id=part.id,
        contentType=_mapTypeGroupToContentType(part.typeGroup),
        raw=part.data or "",
        mimeType=part.mimeType or None,
        attributes={
            "typeGroup": part.typeGroup,
            "label": part.label,
            "parentId": part.parentId,
            **({"contextRef": ctx} if ctx else {}),
            **extraAttr,
        },
        position=pos,
        metadata=UdmMetadata(
            sourcePath=meta.get("containerPath", "") or "",
            # contextRef already lives in `attributes`; do not duplicate it here.
            custom={k: v for k, v in meta.items() if k not in ("contextRef",)},
        ),
    )


def _groupKeyForPart(part: ContentPart) -> Tuple[str, int, str]:
    """Return ``(role, structural_index, label)`` for grouping parts into structural nodes.

    The first matching rule wins:

    1. ``pageIndex`` (metadata, else ``contextRef``) -> ``("page", idx, "page_<idx+1>")``.
    2. ``slide_number`` (1-based, metadata) -> ``("slide", idx, "slide_<idx+1>")``
       with ``idx = slide_number - 1``.
    3. ``contextRef.slideIndex`` (0-based) -> ``("slide", idx, "slide_<idx+1>")``.
    4. ``sheet`` / ``contextRef.sheetName`` -> ``("sheet", stable_hash, name)``.
    5. ``contextRef.sectionId`` / ``sectionId`` -> ``("section", stable_hash, id)``.
    6. Container parts -> ``("section", 0, "root")``; anything else
       -> ``("section", 0, "body")``.

    Values that cannot be converted to ``int`` fall back to index 0. Slide
    indices are clamped to be non-negative, and the label is built from the
    clamped value so index and label always agree. A ``None`` value is treated
    like a missing key.

    Args:
        part: Object exposing ``metadata`` (dict or ``None``) and ``typeGroup``.

    Returns:
        The grouping key. For sheets and named sections the integer is a CRC32
        of the name reduced modulo 10**9; unlike the built-in ``hash`` it is
        identical in every process, but it carries no ordering meaning.
    """
    import zlib  # function-scope import: only needed for the stable name hash

    meta = part.metadata or {}
    ctx = meta.get("contextRef") or {}
    if not isinstance(ctx, dict):
        ctx = {}

    def _toInt(value: Any, default: int = 0) -> int:
        """Best-effort int conversion used for all index-like metadata."""
        try:
            return int(value)
        except (TypeError, ValueError):
            return default

    def _stableIndex(name: str) -> int:
        """Process-independent replacement for ``abs(hash(name))``."""
        return zlib.crc32(name.encode("utf-8", "replace")) % (10**9)

    # Same precedence as `_contentPartToBlock`: direct metadata first, then
    # contextRef, with an explicit None treated as "not provided".
    page = meta.get("pageIndex")
    if page is None:
        page = ctx.get("pageIndex")
    if page is not None:
        idx = _toInt(page)
        return ("page", idx, f"page_{idx + 1}")

    slideNumber = meta.get("slide_number")
    if slideNumber is not None:
        # Unparseable numbers behave like slide 1 (index 0).
        idx = max(0, _toInt(slideNumber, 1) - 1)
        return ("slide", idx, f"slide_{idx + 1}")
    if ctx.get("slideIndex") is not None:
        idx = max(0, _toInt(ctx.get("slideIndex")))
        return ("slide", idx, f"slide_{idx + 1}")

    if meta.get("sheet") or ctx.get("sheetName"):
        name = str(meta.get("sheet") or ctx.get("sheetName") or "sheet")
        return ("sheet", _stableIndex(name), name)

    if ctx.get("sectionId") or meta.get("sectionId"):
        sid = str(ctx.get("sectionId") or meta.get("sectionId") or "section")
        return ("section", _stableIndex(sid), sid)

    if part.typeGroup == "container":
        return ("section", 0, "root")

    return ("section", 0, "body")


# Source types accepted by `UdmDocument.sourceType`; `contentPartsToUdm` maps any
# other value to "unknown" before building the document.
_VALID_DOC_SOURCES = frozenset({"pdf", "docx", "pptx", "xlsx", "html", "binary", "unknown"})


def contentPartsToUdm(extracted: ContentExtracted, sourceType: str, sourcePath: str) -> UdmDocument:
    """Convert a flat ContentPart list into a ``UdmDocument`` using structural heuristics.

    Parts are grouped by the key from ``_groupKeyForPart`` and every group
    becomes one ``UdmStructuralNode`` whose children are the converted parts,
    numbered from 0 in their original order.

    Node order in the result: groups are ordered by role name first; within a
    role, pages and slides follow their numeric index, whereas sheets and
    sections keep the order in which they first appear in ``extracted.parts``
    (their key index is only a name hash and carries no ordering meaning).

    Node ``index``: pages and slides use the index from the grouping key,
    sheets use their 0-based ordinal among the sheets, and sections use the
    node's position in the overall node list.

    Args:
        extracted: Extraction result; ``parts`` may be empty or ``None``.
        sourceType: Source format; values outside ``_VALID_DOC_SOURCES`` are
            stored as ``"unknown"``.
        sourcePath: Path recorded on the document and on every structural node.

    Returns:
        The document tree. It has no children when there are no parts, or when
        the only parts are top-level containers.
    """
    parts = list(extracted.parts or [])
    st: Literal["pdf", "docx", "pptx", "xlsx", "html", "binary", "unknown"] = (
        sourceType if sourceType in _VALID_DOC_SOURCES else "unknown"  # type: ignore[assignment]
    )
    doc = UdmDocument(
        id=extracted.id or _newId(),
        sourceType=st,
        sourcePath=sourcePath,
        metadata=UdmMetadata(sourcePath=sourcePath),
    )

    if not parts:
        return doc

    # Containers are structural wrappers, not content.
    contentParts = [p for p in parts if p.typeGroup != "container"]
    if not contentParts:
        # Only containers are present: fall back to the nested ones and still
        # skip top-level roots (containers without a parent).
        contentParts = [p for p in parts if p.parentId is not None]

    if not contentParts:
        return doc

    # dicts preserve insertion order, so `groups` remembers first appearance.
    groups: Dict[Tuple[str, int, str], List[ContentPart]] = {}
    for p in contentParts:
        groups.setdefault(_groupKeyForPart(p), []).append(p)
    firstSeen = {key: order for order, key in enumerate(groups)}

    def _sortKey(key: Tuple[str, int, str]) -> Tuple[str, int, str]:
        role, structIdx, label = key
        if role in ("page", "slide"):
            return (role, structIdx, label)
        # Sheet/section indices are name hashes: sorting by them would shuffle
        # the nodes, so keep document order instead.
        return (role, firstSeen[key], label)

    sheetOrdinal = 0
    for gi, key in enumerate(sorted(groups, key=_sortKey)):
        role, structIdx, label = key
        if role == "section":
            nodeIndex = gi
        elif role == "sheet":
            nodeIndex = sheetOrdinal
            sheetOrdinal += 1
        else:
            nodeIndex = structIdx
        node = UdmStructuralNode(
            id=_newId(),
            role=role if role in ("page", "section", "slide", "sheet") else "section",
            index=nodeIndex,
            label=label,
            metadata=UdmMetadata(sourcePath=sourcePath),
        )
        node.children.extend(
            _contentPartToBlock(part, bi) for bi, part in enumerate(groups[key])
        )
        doc.children.append(node)

    return doc


def _udmToContentParts(document: UdmDocument) -> ContentExtracted:
    """Flatten a ``UdmDocument`` into a ``ContentExtracted`` for legacy consumers.

    The result starts with a synthetic root container (new id, labelled with
    the document's source type) followed by one part per content block, in
    tree order. Every block part is parented to that root and gets
    ``structuralRole`` / ``structuralIndex`` added to its metadata unless those
    keys already exist in the block's custom metadata.
    """
    rootId = _newId()

    def _flattenBlock(node: UdmStructuralNode, blk: UdmContentBlock) -> ContentPart:
        # Copy so the block's own custom metadata is never mutated.
        partMeta = dict(blk.metadata.custom) if blk.metadata else {}
        partMeta.setdefault("structuralRole", node.role)
        partMeta.setdefault("structuralIndex", node.index)
        attrs = blk.attributes
        return ContentPart(
            id=blk.id,
            parentId=rootId,
            label=attrs.get("label", node.label or ""),
            typeGroup=str(attrs.get("typeGroup", "text")),
            mimeType=blk.mimeType or "text/plain",
            data=blk.raw,
            metadata=partMeta,
        )

    rootPart = ContentPart(
        id=rootId,
        parentId=None,
        label=document.sourceType or "document",
        typeGroup="container",
        mimeType="application/octet-stream",
        data="",
        metadata={"udmRoot": True, "sourcePath": document.sourcePath},
    )
    flattened: List[ContentPart] = [rootPart]
    flattened.extend(
        _flattenBlock(node, blk)
        for node in document.children
        for blk in node.children
    )
    return ContentExtracted(id=document.id, parts=flattened)


def _stripUdmRaw(udm: UdmDocument) -> UdmDocument:
    """Produce a structure-only preview: a deep copy in which every block's `raw` is empty.

    The document passed in is not modified.
    """
    stripped = udm.model_copy(deep=True)
    allBlocks = (blk for node in stripped.children for blk in node.children)
    for blk in allBlocks:
        blk.raw = ""
    return stripped


def _stripUdmForReferences(udm: UdmDocument) -> UdmDocument:
    """Return a deep copy whose blocks carry references instead of inline payloads.

    Every block's `raw` is emptied. A block without a `fileRef` adopts the
    "fileRef" entry of its `attributes`; if that entry is missing or falsy, the
    "fileRef" entry of `metadata.custom` is used instead. The value is only
    applied when it is a non-empty string. The input document is not modified.
    """
    stripped = udm.model_copy(deep=True)
    allBlocks = (blk for node in stripped.children for blk in node.children)
    for blk in allBlocks:
        blk.raw = ""
        if blk.fileRef:
            continue  # an existing reference is kept as is
        candidate = blk.attributes.get("fileRef")
        custom = blk.metadata.custom if blk.metadata else None
        if custom and not candidate:
            candidate = custom.get("fileRef")
        if isinstance(candidate, str) and candidate:
            blk.fileRef = candidate
    return stripped


def applyUdmOutputDetail(udm: UdmDocument, detail: str) -> UdmDocument:
    """Reduce a document to the requested level of detail.

    "structure" returns a copy with all block payloads cleared, "references"
    returns a copy with payloads cleared and file references filled in where
    available. Any other value returns the given document itself, unchanged
    and uncopied.
    """
    if detail not in ("structure", "references"):
        return udm
    reducer = _stripUdmRaw if detail == "structure" else _stripUdmForReferences
    return reducer(udm)


def mimeToUdmSourceType(mimeType: str, fileName: str) -> Literal["pdf", "docx", "pptx", "xlsx", "html", "binary", "unknown"]:
    """Derive the UDM source type from a MIME type and/or a file name.

    Matching is case-insensitive. MIME parameters (``"text/html; charset=utf-8"``)
    and surrounding whitespace are ignored, so header-style values are
    recognised as well. Either hint is sufficient; checks run in the order
    pdf, docx, pptx, xlsx, html.

    Args:
        mimeType: MIME type, optionally with parameters; may be empty or ``None``.
        fileName: File name or path used for extension matching; may be empty
            or ``None``.

    Returns:
        The matching source type. When nothing matches, ``"binary"`` is returned
        for an empty MIME type or ``application/octet-stream`` and ``"unknown"``
        for every other MIME type.
    """
    # Keep only the "type/subtype" part: exact comparisons below would otherwise
    # fail on values such as "text/html; charset=utf-8".
    m = (mimeType or "").split(";", 1)[0].strip().lower()
    fn = (fileName or "").strip().lower()
    if m == "application/pdf" or fn.endswith(".pdf"):
        return "pdf"
    if "wordprocessingml" in m or fn.endswith(".docx"):
        return "docx"
    if "presentationml" in m or fn.endswith((".pptx", ".ppt")):
        return "pptx"
    if "spreadsheetml" in m or fn.endswith((".xlsx", ".xlsm")):
        return "xlsx"
    if m == "text/html" or fn.endswith((".html", ".htm")):
        return "html"
    if m == "application/octet-stream" or not m:
        return "binary"
    return "unknown"