# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
"""Pydantic models for structured documents and their rendered output.

The content models (tables, lists, paragraphs, headings, code blocks,
images) are grouped into ``DocumentSection`` objects, which together with
``DocumentMetadata`` form a ``StructuredDocument``.  ``RenderedDocument``
carries the bytes produced by a renderer.
"""

from typing import Any, Dict, List, Optional, Literal, Union
from pydantic import BaseModel, ConfigDict, Field
from datetime import datetime


class DocumentMetadata(BaseModel):
    """Metadata for the entire document."""

    title: str = Field(description="Document title")
    author: Optional[str] = Field(default=None, description="Document author")
    # NOTE(review): datetime.now is called without a tz argument, so the
    # default is a naive local-time timestamp.
    createdAt: datetime = Field(default_factory=datetime.now, description="Creation timestamp")
    sourceDocuments: List[str] = Field(default_factory=list, description="Source document IDs")
    extractionMethod: str = Field(default="ai_extraction", description="Method used for extraction")
    version: str = Field(default="1.0", description="Document version")
    documentType: Optional[str] = Field(default=None, description="Type of document (e.g., 'report', 'invoice', 'analysis')")
    styles: Optional[Dict[str, Any]] = Field(default=None, description="Document styling configuration")


class TableData(BaseModel):
    """Structured table data: column headers plus rows of string cells."""

    headers: List[str] = Field(description="Table column headers")
    rows: List[List[str]] = Field(description="Table data rows")
    caption: Optional[str] = Field(default=None, description="Table caption")
    metadata: Dict[str, Any] = Field(default_factory=dict, description="Table metadata")


class ListItem(BaseModel):
    """Individual list item with optional, recursively nested sub-items."""

    text: str = Field(description="List item text")
    # Self-reference is written as a string; it is resolved by the
    # ListItem.model_rebuild() call at the bottom of the module.
    subitems: Optional[List['ListItem']] = Field(default=None, description="Nested sub-items")
    metadata: Dict[str, Any] = Field(default_factory=dict, description="Item metadata")


class BulletList(BaseModel):
    """Bulleted, numbered, or checklist-style list."""

    items: List[ListItem] = Field(description="List items")
    listType: Literal["bullet", "numbered", "checklist"] = Field(default="bullet", description="List type")
    metadata: Dict[str, Any] = Field(default_factory=dict, description="List metadata")


class Paragraph(BaseModel):
    """Text paragraph with optional formatting."""

    text: str = Field(description="Paragraph text")
    formatting: Optional[Dict[str, Any]] = Field(default=None, description="Text formatting (bold, italic, etc.)")
    metadata: Dict[str, Any] = Field(default_factory=dict, description="Paragraph metadata")


class Heading(BaseModel):
    """Document heading; ``level`` is validated to lie in 1..6."""

    text: str = Field(description="Heading text")
    level: int = Field(ge=1, le=6, description="Heading level (1-6)")
    metadata: Dict[str, Any] = Field(default_factory=dict, description="Heading metadata")


class CodeBlock(BaseModel):
    """Code block with an optional language tag."""

    code: str = Field(description="Code content")
    language: Optional[str] = Field(default=None, description="Programming language")
    metadata: Dict[str, Any] = Field(default_factory=dict, description="Code block metadata")


class Image(BaseModel):
    """Image carried as a string, with optional alt text and caption."""

    data: str = Field(description="Base64 encoded image data")
    altText: Optional[str] = Field(default=None, description="Alternative text")
    caption: Optional[str] = Field(default=None, description="Image caption")
    metadata: Dict[str, Any] = Field(default_factory=dict, description="Image metadata")


class DocumentSection(BaseModel):
    """A section of the document containing one or more content elements."""

    id: str = Field(description="Unique section identifier")
    title: Optional[str] = Field(default=None, description="Section title")
    contentType: Literal["table", "list", "paragraph", "heading", "code", "image", "mixed"] = Field(description="Primary content type")
    elements: List[Union[TableData, BulletList, Paragraph, Heading, CodeBlock, Image]] = Field(description="Content elements in this section")
    order: int = Field(description="Section order in document")
    metadata: Dict[str, Any] = Field(default_factory=dict, description="Section metadata")


class StructuredDocument(BaseModel):
    """Complete structured document in JSON format."""

    metadata: DocumentMetadata = Field(description="Document metadata")
    sections: List[DocumentSection] = Field(description="Document sections")
    summary: Optional[str] = Field(default=None, description="Document summary")
    tags: List[str] = Field(default_factory=list, description="Document tags")

    def getSectionsByType(self, contentType: str) -> List[DocumentSection]:
        """Get all sections of a specific content type.

        Args:
            contentType: Value compared for equality against each
                section's ``contentType`` field.

        Returns:
            The matching sections, in their order within ``self.sections``.
        """
        return [section for section in self.sections if section.contentType == contentType]

    def getAllTables(self) -> List[TableData]:
        """Get all table data from the document.

        Returns:
            Every ``TableData`` element across all sections, in section
            order and then element order.  The section's declared
            ``contentType`` is not consulted; only element types are.
        """
        return [
            element
            for section in self.sections
            for element in section.elements
            if isinstance(element, TableData)
        ]

    def getAllLists(self) -> List[BulletList]:
        """Get all lists from the document.

        Returns:
            Every ``BulletList`` element across all sections, in section
            order and then element order.  The section's declared
            ``contentType`` is not consulted; only element types are.
        """
        return [
            element
            for section in self.sections
            for element in section.elements
            if isinstance(element, BulletList)
        ]


class RenderedDocument(BaseModel):
    """A single rendered document from a renderer."""

    # Pydantic v2 configuration; replaces the deprecated nested
    # ``class Config``.  The encoder mapping itself is unchanged.
    # NOTE(review): bytes are decoded as UTF-8 with errors='replace', so
    # byte sequences that are not valid UTF-8 are substituted and cannot
    # be recovered from the JSON output.
    model_config = ConfigDict(
        json_encoders={
            bytes: lambda v: v.decode('utf-8', errors='replace') if isinstance(v, bytes) else v
        }
    )

    documentData: bytes = Field(description="Document content as bytes")
    mimeType: str = Field(description="MIME type of the document (e.g., 'text/html', 'application/pdf')")
    filename: str = Field(description="Filename for the document (e.g., 'report.html', 'image.png')")
    documentType: Optional[str] = Field(default=None, description="Type of document (e.g., 'report', 'invoice', 'analysis')")
    metadata: Optional[Dict[str, Any]] = Field(default=None, description="Document metadata (title, author, etc.)")


# Update forward references
ListItem.model_rebuild()