gateway/modules/datamodels/datamodelDocument.py
2025-12-15 21:55:26 +01:00

111 lines
5 KiB
Python

# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
from typing import Any, Dict, List, Optional, Literal, Union
from pydantic import BaseModel, Field
from datetime import datetime
class DocumentMetadata(BaseModel):
    """Metadata for the entire document."""

    # Only ``title`` is required; every other field carries a default.
    # The class docstring and the Field descriptions are emitted into the
    # model's JSON schema, so treat them as runtime strings.
    title: str = Field(
        ...,
        description="Document title",
    )
    author: Optional[str] = Field(
        None,
        description="Document author",
    )
    # NOTE(review): datetime.now is invoked without a tz argument, so the
    # default is a naive local timestamp - confirm consumers expect that.
    createdAt: datetime = Field(
        default_factory=datetime.now,
        description="Creation timestamp",
    )
    sourceDocuments: List[str] = Field(
        default_factory=list,
        description="Source document IDs",
    )
    extractionMethod: str = Field(
        "ai_extraction",
        description="Method used for extraction",
    )
    version: str = Field(
        "1.0",
        description="Document version",
    )
class TableData(BaseModel):
    """Structured table data."""

    # ``headers`` and ``rows`` are required. Nothing in this model checks
    # that each row has as many cells as there are headers.
    headers: List[str] = Field(
        ...,
        description="Table column headers",
    )
    rows: List[List[str]] = Field(
        ...,
        description="Table data rows",
    )
    caption: Optional[str] = Field(
        None,
        description="Table caption",
    )
    # Free-form key/value bag; a fresh dict is created per instance.
    metadata: Dict[str, Any] = Field(
        default_factory=dict,
        description="Table metadata",
    )
class ListItem(BaseModel):
    """Individual list item with optional sub-items."""

    text: str = Field(
        ...,
        description="List item text",
    )
    # Self-referential: the element type is spelled as a string because the
    # class name is not bound yet while its own body is being evaluated.
    subitems: Optional[List["ListItem"]] = Field(
        None,
        description="Nested sub-items",
    )
    # Free-form key/value bag; a fresh dict is created per instance.
    metadata: Dict[str, Any] = Field(
        default_factory=dict,
        description="Item metadata",
    )
class BulletList(BaseModel):
    """Bulleted or numbered list."""

    items: List[ListItem] = Field(
        ...,
        description="List items",
    )
    # Restricted to the three literals below; "bullet" is used when omitted.
    listType: Literal["bullet", "numbered", "checklist"] = Field(
        "bullet",
        description="List type",
    )
    # Free-form key/value bag; a fresh dict is created per instance.
    metadata: Dict[str, Any] = Field(
        default_factory=dict,
        description="List metadata",
    )
class Paragraph(BaseModel):
    """Text paragraph with optional formatting."""

    text: str = Field(
        ...,
        description="Paragraph text",
    )
    # Unlike ``metadata``, this defaults to None rather than an empty dict.
    formatting: Optional[Dict[str, Any]] = Field(
        None,
        description="Text formatting (bold, italic, etc.)",
    )
    # Free-form key/value bag; a fresh dict is created per instance.
    metadata: Dict[str, Any] = Field(
        default_factory=dict,
        description="Paragraph metadata",
    )
class Heading(BaseModel):
    """Document heading."""

    text: str = Field(
        ...,
        description="Heading text",
    )
    # Required; validation rejects values outside the inclusive range 1..6.
    level: int = Field(
        ...,
        ge=1,
        le=6,
        description="Heading level (1-6)",
    )
    # Free-form key/value bag; a fresh dict is created per instance.
    metadata: Dict[str, Any] = Field(
        default_factory=dict,
        description="Heading metadata",
    )
class CodeBlock(BaseModel):
    """Code block with syntax highlighting."""

    code: str = Field(
        ...,
        description="Code content",
    )
    # Any string is accepted here; the model does not restrict the value.
    language: Optional[str] = Field(
        None,
        description="Programming language",
    )
    # Free-form key/value bag; a fresh dict is created per instance.
    metadata: Dict[str, Any] = Field(
        default_factory=dict,
        description="Code block metadata",
    )
class Image(BaseModel):
    """Image with metadata."""

    # Typed as a plain ``str``; the model itself does not verify that the
    # content is valid Base64.
    data: str = Field(
        ...,
        description="Base64 encoded image data",
    )
    altText: Optional[str] = Field(
        None,
        description="Alternative text",
    )
    caption: Optional[str] = Field(
        None,
        description="Image caption",
    )
    # Free-form key/value bag; a fresh dict is created per instance.
    metadata: Dict[str, Any] = Field(
        default_factory=dict,
        description="Image metadata",
    )
class DocumentSection(BaseModel):
    """A section of the document containing one or more content elements."""

    id: str = Field(
        ...,
        description="Unique section identifier",
    )
    title: Optional[str] = Field(
        None,
        description="Section title",
    )
    # Required; must be one of the seven literals listed.
    contentType: Literal[
        "table", "list", "paragraph", "heading", "code", "image", "mixed"
    ] = Field(
        ...,
        description="Primary content type",
    )
    # NOTE(review): the union has no discriminator, and several members share
    # field names (e.g. Paragraph and Heading both have ``text``). Which model
    # a raw dict validates as is decided by pydantic's union resolution -
    # confirm this is acceptable for inputs parsed from JSON.
    elements: List[
        Union[TableData, BulletList, Paragraph, Heading, CodeBlock, Image]
    ] = Field(
        ...,
        description="Content elements in this section",
    )
    order: int = Field(
        ...,
        description="Section order in document",
    )
    # Free-form key/value bag; a fresh dict is created per instance.
    metadata: Dict[str, Any] = Field(
        default_factory=dict,
        description="Section metadata",
    )
class StructuredDocument(BaseModel):
    """Complete structured document in JSON format."""

    metadata: DocumentMetadata = Field(
        ...,
        description="Document metadata",
    )
    sections: List[DocumentSection] = Field(
        ...,
        description="Document sections",
    )
    summary: Optional[str] = Field(
        None,
        description="Document summary",
    )
    tags: List[str] = Field(
        default_factory=list,
        description="Document tags",
    )

    def getSectionsByType(self, contentType: str) -> List[DocumentSection]:
        """Return the sections whose ``contentType`` equals the given value.

        Results keep the order in which the sections appear in
        ``self.sections``. An empty list is returned when nothing matches.
        """
        return [
            sec
            for sec in self.sections
            if sec.contentType == contentType
        ]

    def getAllTables(self) -> List[TableData]:
        """Collect every ``TableData`` element across all sections.

        Sections are visited in list order, and elements within each section
        in list order. The section's declared ``contentType`` is not
        consulted; only the element's runtime type is checked.
        """
        return [
            item
            for sec in self.sections
            for item in sec.elements
            if isinstance(item, TableData)
        ]

    def getAllLists(self) -> List[BulletList]:
        """Collect every ``BulletList`` element across all sections.

        Sections are visited in list order, and elements within each section
        in list order. The section's declared ``contentType`` is not
        consulted; only the element's runtime type is checked.
        """
        return [
            item
            for sec in self.sections
            for item in sec.elements
            if isinstance(item, BulletList)
        ]
# Update forward references: ListItem.subitems is annotated with the string
# 'ListItem', so ask pydantic to rebuild the model now that the class name is
# bound at module level and the self-reference can be resolved.
ListItem.model_rebuild()