platform-core/modules/datamodels/datamodelContent.py
ValueOn AG 4a60086c80
Some checks failed
Deploy Plattform-Core (Int) / test (push) Failing after 15s
Deploy Plattform-Core (Int) / deploy (push) Has been skipped
cp adapted to 2026 poweron
2026-06-09 09:53:31 +02:00

61 lines
2.9 KiB
Python

# Copyright (c) 2026 PowerOn AG
# All rights reserved.
"""Content Object data models for the container and content extraction pipeline.
Physical layer: Container hierarchy (ZIP, Folder, File)
Logical layer: Scalar content objects (text, image, videostream, audiostream, other)
The entire extraction pipeline up to ContentObjects runs without AI.
"""
from typing import Dict, Any, List, Optional
from pydantic import BaseModel, Field
import uuid
class ContainerLimitError(Exception):
    """Signal that container extraction went past a safety limit.

    The limits covered are size, nesting depth and file count.
    """
class ContentContextRef(BaseModel):
    """Pointer to where a piece of content originated inside a container/file.

    Only ``containerPath`` is mandatory. The remaining locators are optional;
    the document format each one applies to is noted in its description.
    """

    # Required: the ellipsis marks the field as having no default.
    containerPath: str = Field(
        ...,
        description="e.g. 'archiv.zip/folder-a/report.pdf'",
    )
    # Defaults to an empty string rather than None.
    location: str = Field(
        "",
        description="e.g. 'page:5/region:bottomLeft'",
    )
    label: Optional[str] = Field(
        None,
        description="e.g. 'Abbildung 3: Uebersicht'",
    )
    pageIndex: Optional[int] = Field(
        None,
        description="Page number (PDF, DOCX)",
    )
    sectionId: Optional[str] = Field(
        None,
        description="Section/Heading ID",
    )
    sheetName: Optional[str] = Field(
        None,
        description="Sheet name (XLSX)",
    )
    slideIndex: Optional[int] = Field(
        None,
        description="Slide number (PPTX)",
    )
class ContentObject(BaseModel):
    """Scalar content object extracted from a file. No AI involved.

    Attributes:
        id: Identifier of the object; a random UUID4 string when not supplied.
        fileId: Foreign key to the physical file (required).
        contentType: Kind of content: text, image, videostream, audiostream
            or other (required; stored as a free-form string).
        data: Content payload (text, base64 or URL); empty string by default.
        contextRef: Origin of the content within its container/file. When
            omitted, a reference with an empty ``containerPath`` is created.
        metadata: Arbitrary additional key/value data; empty dict by default.
        sequence: Order of the object within its context; 0 by default.
    """

    id: str = Field(default_factory=lambda: str(uuid.uuid4()))
    fileId: str = Field(
        description="FK to the physical file",
        json_schema_extra={"fk_target": {"db": "poweron_management", "table": "FileItem", "labelField": "fileName"}},
    )
    contentType: str = Field(description="text, image, videostream, audiostream, other")
    data: str = Field(default="", description="Content data (text, base64, URL)")
    # ContentContextRef.containerPath is required, so the bare class cannot be
    # used as a zero-argument default factory (it would always fail
    # validation). Build an explicit empty-path reference instead.
    contextRef: ContentContextRef = Field(
        default_factory=lambda: ContentContextRef(containerPath=""),
    )
    metadata: Dict[str, Any] = Field(default_factory=dict)
    sequence: int = Field(default=0, description="Order within the context")
class ContentObjectSummary(BaseModel):
    """Compact description of a content object for the FileContentIndex.

    Attributes:
        id: Content object ID (required).
        contentType: Kind of content: text, image, videostream, audiostream
            or other (required; stored as a free-form string).
        contextRef: Origin of the content within its container/file. When
            omitted, a reference with an empty ``containerPath`` is created.
        charCount: Character count; only set for text.
        dimensions: Dimensions such as '1920x1080'; only set for image/video.
        duration: Duration in seconds; only set for audio/video.
    """

    id: str = Field(description="Content object ID")
    contentType: str = Field(description="text, image, videostream, audiostream, other")
    # ContentContextRef.containerPath is required, so the bare class cannot be
    # used as a zero-argument default factory (it would always fail
    # validation). Build an explicit empty-path reference instead.
    contextRef: ContentContextRef = Field(
        default_factory=lambda: ContentContextRef(containerPath=""),
    )
    charCount: Optional[int] = Field(default=None, description="Only for text")
    dimensions: Optional[str] = Field(default=None, description="Only for image/video (e.g. '1920x1080')")
    duration: Optional[float] = Field(default=None, description="Only for audio/video (seconds)")
class FileEntry(BaseModel):
    """One file pulled out of a container (ZIP, TAR, Folder).

    Every field is required; none of them declares a default.
    """

    path: str = Field(
        ...,
        description="Relative path within the container",
    )
    data: bytes = Field(
        ...,
        description="File content bytes",
    )
    mimeType: str = Field(
        ...,
        description="Detected MIME type",
    )
    size: int = Field(
        ...,
        description="File size in bytes",
    )