# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
"""Content Object data models for the container and content extraction pipeline.

Physical layer: Container hierarchy (ZIP, Folder, File)
Logical layer: Scalar content objects (text, image, videostream, audiostream, other)

The entire extraction pipeline up to ContentObjects runs without AI.
"""
|
|
|
|
from typing import Dict, Any, List, Optional
|
|
from pydantic import BaseModel, Field
|
|
import uuid
|
|
|
|
|
|
class ContainerLimitError(Exception):
    """Signal that container extraction went past a configured safety limit.

    The limits in question are size, depth, and file count. The class adds
    nothing on top of ``Exception``; it exists so callers can catch this
    condition specifically.
    """
|
|
|
|
|
|
class ContentContextRef(BaseModel):
    """Reference to the origin context within a container/file."""

    # Required: the only field without a default value.
    containerPath: str = Field(
        ...,
        description="e.g. 'archiv.zip/folder-a/report.pdf'",
    )

    # Free-form position inside the file; empty string when not given.
    location: str = Field(
        "",
        description="e.g. 'page:5/region:bottomLeft'",
    )

    # Everything below is optional and defaults to None.
    label: Optional[str] = Field(
        None,
        description="e.g. 'Abbildung 3: Uebersicht'",
    )
    pageIndex: Optional[int] = Field(
        None,
        description="Page number (PDF, DOCX)",
    )
    sectionId: Optional[str] = Field(
        None,
        description="Section/Heading ID",
    )
    sheetName: Optional[str] = Field(
        None,
        description="Sheet name (XLSX)",
    )
    slideIndex: Optional[int] = Field(
        None,
        description="Slide number (PPTX)",
    )
|
|
|
|
|
|
class ContentObject(BaseModel):
    """Scalar content object extracted from a file. No AI involved."""

    # A fresh UUID4 string is generated per instance when no id is passed.
    id: str = Field(default_factory=lambda: str(uuid.uuid4()))

    # Required fields (no default).
    fileId: str = Field(description="FK to the physical file")
    contentType: str = Field(description="text, image, videostream, audiostream, other")

    data: str = Field(default="", description="Content data (text, base64, URL)")

    # ContentContextRef.containerPath is a required field, so using the bare
    # class as the factory raised a validation error whenever contextRef was
    # omitted. The factory now supplies an empty containerPath so the default
    # can actually be constructed.
    contextRef: ContentContextRef = Field(
        default_factory=lambda: ContentContextRef(containerPath="")
    )

    # default_factory=dict gives every instance its own dictionary.
    metadata: Dict[str, Any] = Field(default_factory=dict)

    sequence: int = Field(default=0, description="Order within the context")
|
|
|
|
|
|
class ContentObjectSummary(BaseModel):
    """Compact description of a content object for the FileContentIndex."""

    # Required fields (no default).
    id: str = Field(description="Content object ID")
    contentType: str = Field(description="text, image, videostream, audiostream, other")

    # ContentContextRef.containerPath is a required field, so using the bare
    # class as the factory raised a validation error whenever contextRef was
    # omitted. The factory now supplies an empty containerPath so the default
    # can actually be constructed.
    contextRef: ContentContextRef = Field(
        default_factory=lambda: ContentContextRef(containerPath="")
    )

    # Type-specific details; each defaults to None when not provided.
    charCount: Optional[int] = Field(default=None, description="Only for text")
    dimensions: Optional[str] = Field(default=None, description="Only for image/video (e.g. '1920x1080')")
    duration: Optional[float] = Field(default=None, description="Only for audio/video (seconds)")
|
|
|
|
|
|
class FileEntry(BaseModel):
    """A file extracted from a container (ZIP, TAR, Folder)."""

    # All four fields are required; none declares a default.
    path: str = Field(
        ...,
        description="Relative path within the container",
    )
    data: bytes = Field(
        ...,
        description="File content bytes",
    )
    mimeType: str = Field(
        ...,
        description="Detected MIME type",
    )
    size: int = Field(
        ...,
        description="File size in bytes",
    )
|