# gateway/modules/datamodels/datamodelContent.py

# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
"""Content Object data models for the container and content extraction pipeline.

Physical layer: Container hierarchy (ZIP, Folder, File)
Logical layer: Scalar content objects (text, image, videostream, audiostream, other)

The entire extraction pipeline up to ContentObjects runs without AI.
"""

from typing import Dict, Any, List, Optional
from pydantic import BaseModel, Field
import uuid


class ContainerLimitError(Exception):
    """Signal that container extraction went past a safety limit.

    Per the original description, the limits in question are size, depth,
    and file count. The class adds nothing to ``Exception``; it exists so
    callers can catch this condition by type.
    """


class ContentContextRef(BaseModel):
    """Reference to the origin context within a container/file."""

    # containerPath is the only required field; every other field has a
    # default, so a reference can be built from the path alone.
    containerPath: str = Field(
        ...,
        description="e.g. 'archiv.zip/folder-a/report.pdf'",
    )
    location: str = Field(
        "",
        description="e.g. 'page:5/region:bottomLeft'",
    )
    label: Optional[str] = Field(
        None,
        description="e.g. 'Abbildung 3: Uebersicht'",
    )

    # Format-specific locators. Each defaults to None; the descriptions name
    # the document formats they are meant for.
    pageIndex: Optional[int] = Field(
        None,
        description="Page number (PDF, DOCX)",
    )
    sectionId: Optional[str] = Field(
        None,
        description="Section/Heading ID",
    )
    sheetName: Optional[str] = Field(
        None,
        description="Sheet name (XLSX)",
    )
    slideIndex: Optional[int] = Field(
        None,
        description="Slide number (PPTX)",
    )


class ContentObject(BaseModel):
    """Scalar content object extracted from a file. No AI involved."""

    # Random UUID4 string generated per instance unless the caller supplies one.
    id: str = Field(default_factory=lambda: str(uuid.uuid4()))
    fileId: str = Field(description="FK to the physical file")
    contentType: str = Field(description="text, image, videostream, audiostream, other")
    data: str = Field(default="", description="Content data (text, base64, URL)")
    # ContentContextRef.containerPath is a required field, so using the bare
    # class as default_factory fails validation whenever the default is
    # actually needed. Build the fallback reference with an empty path so
    # that omitting contextRef works as the default implies.
    contextRef: ContentContextRef = Field(
        default_factory=lambda: ContentContextRef(containerPath="")
    )
    # default_factory gives every instance its own dict (no shared state).
    metadata: Dict[str, Any] = Field(default_factory=dict)
    sequence: int = Field(default=0, description="Order within the context")


class ContentObjectSummary(BaseModel):
    """Compact description of a content object for the FileContentIndex."""

    id: str = Field(description="Content object ID")
    contentType: str = Field(description="text, image, videostream, audiostream, other")
    # ContentContextRef.containerPath is a required field, so using the bare
    # class as default_factory fails validation whenever the default is
    # actually needed. Build the fallback reference with an empty path so
    # that omitting contextRef works as the default implies.
    contextRef: ContentContextRef = Field(
        default_factory=lambda: ContentContextRef(containerPath="")
    )
    # Type-specific measurements; each stays None unless provided.
    charCount: Optional[int] = Field(default=None, description="Only for text")
    dimensions: Optional[str] = Field(default=None, description="Only for image/video (e.g. '1920x1080')")
    duration: Optional[float] = Field(default=None, description="Only for audio/video (seconds)")


class FileEntry(BaseModel):
    """A file extracted from a container (ZIP, TAR, Folder)."""

    # All four fields are required; none declares a default.
    path: str = Field(
        ...,
        description="Relative path within the container",
    )
    data: bytes = Field(
        ...,
        description="File content bytes",
    )
    mimeType: str = Field(
        ...,
        description="Detected MIME type",
    )
    # NOTE(review): nothing here checks that size equals len(data) — confirm
    # the producer keeps them in sync.
    size: int = Field(
        ...,
        description="File size in bytes",
    )