123 lines
5.5 KiB
Python
123 lines
5.5 KiB
Python
# Copyright (c) 2025 Patrick Motsch
|
|
# All rights reserved.
|
|
from typing import Any, Dict, List, Optional, Literal, Union
|
|
from pydantic import BaseModel, Field
|
|
from datetime import datetime
|
|
|
|
|
|
class DocumentMetadata(BaseModel):
    """Metadata for the entire document."""

    # The only required field; everything below has a default.
    title: str = Field(description="Document title")
    author: Optional[str] = Field(
        default=None,
        description="Document author",
    )
    # default_factory defers the datetime.now call to instantiation, so each
    # instance gets its own timestamp. The value is naive (no tzinfo).
    createdAt: datetime = Field(
        default_factory=datetime.now,
        description="Creation timestamp",
    )
    # default_factory=list gives every instance a fresh, unshared list.
    sourceDocuments: List[str] = Field(
        default_factory=list,
        description="Source document IDs",
    )
    extractionMethod: str = Field(
        default="ai_extraction",
        description="Method used for extraction",
    )
    version: str = Field(
        default="1.0",
        description="Document version",
    )
|
|
|
|
|
|
class TableData(BaseModel):
    """Structured table data."""

    # headers and rows are both required. Nothing here checks that each row
    # has as many cells as there are headers.
    headers: List[str] = Field(description="Table column headers")
    rows: List[List[str]] = Field(description="Table data rows")
    caption: Optional[str] = Field(
        default=None,
        description="Table caption",
    )
    # Free-form dict; a new empty dict is created per instance.
    metadata: Dict[str, Any] = Field(
        default_factory=dict,
        description="Table metadata",
    )
|
|
|
|
|
|
class ListItem(BaseModel):
    """Individual list item with optional sub-items."""

    text: str = Field(description="List item text")
    # Self-referencing type: the class name is not bound yet while its own
    # body is being evaluated, so it is written as a string forward reference.
    subitems: Optional[List["ListItem"]] = Field(
        default=None,
        description="Nested sub-items",
    )
    # Free-form dict; a new empty dict is created per instance.
    metadata: Dict[str, Any] = Field(
        default_factory=dict,
        description="Item metadata",
    )
|
|
|
|
|
|
class BulletList(BaseModel):
    """Bulleted or numbered list."""

    items: List[ListItem] = Field(description="List items")
    # Restricted to the three literals below; "bullet" when omitted.
    listType: Literal["bullet", "numbered", "checklist"] = Field(
        default="bullet",
        description="List type",
    )
    # Free-form dict; a new empty dict is created per instance.
    metadata: Dict[str, Any] = Field(
        default_factory=dict,
        description="List metadata",
    )
|
|
|
|
|
|
class Paragraph(BaseModel):
    """Text paragraph with optional formatting."""

    text: str = Field(description="Paragraph text")
    # Untyped formatting options; None (the default) when not supplied.
    formatting: Optional[Dict[str, Any]] = Field(
        default=None,
        description="Text formatting (bold, italic, etc.)",
    )
    # Free-form dict; a new empty dict is created per instance.
    metadata: Dict[str, Any] = Field(
        default_factory=dict,
        description="Paragraph metadata",
    )
|
|
|
|
|
|
class Heading(BaseModel):
    """Document heading."""

    text: str = Field(description="Heading text")
    # Required; validation rejects values outside the inclusive range 1..6.
    level: int = Field(
        ge=1,
        le=6,
        description="Heading level (1-6)",
    )
    # Free-form dict; a new empty dict is created per instance.
    metadata: Dict[str, Any] = Field(
        default_factory=dict,
        description="Heading metadata",
    )
|
|
|
|
|
|
class CodeBlock(BaseModel):
    """Code block with syntax highlighting."""

    code: str = Field(description="Code content")
    # Plain optional string; no set of allowed language names is enforced.
    language: Optional[str] = Field(
        default=None,
        description="Programming language",
    )
    # Free-form dict; a new empty dict is created per instance.
    metadata: Dict[str, Any] = Field(
        default_factory=dict,
        description="Code block metadata",
    )
|
|
|
|
|
|
class Image(BaseModel):
    """Image with metadata."""

    # Declared as a plain str; the model does not verify that the value is
    # actually valid Base64.
    data: str = Field(description="Base64 encoded image data")
    altText: Optional[str] = Field(
        default=None,
        description="Alternative text",
    )
    caption: Optional[str] = Field(
        default=None,
        description="Image caption",
    )
    # Free-form dict; a new empty dict is created per instance.
    metadata: Dict[str, Any] = Field(
        default_factory=dict,
        description="Image metadata",
    )
|
|
|
|
|
|
class DocumentSection(BaseModel):
    """A section of the document containing one or more content elements."""

    id: str = Field(description="Unique section identifier")
    title: Optional[str] = Field(
        default=None,
        description="Section title",
    )
    # Required label for the section. Nothing here cross-checks it against
    # the concrete types stored in `elements`.
    contentType: Literal[
        "table", "list", "paragraph", "heading", "code", "image", "mixed"
    ] = Field(description="Primary content type")
    # NOTE(review): the union members carry no discriminator field, so
    # plain-dict input is matched by field shape alone (Paragraph and Heading
    # both declare `text`) — confirm the resolved type is the intended one.
    elements: List[
        Union[TableData, BulletList, Paragraph, Heading, CodeBlock, Image]
    ] = Field(description="Content elements in this section")
    order: int = Field(description="Section order in document")
    # Free-form dict; a new empty dict is created per instance.
    metadata: Dict[str, Any] = Field(
        default_factory=dict,
        description="Section metadata",
    )
|
|
|
|
|
|
class StructuredDocument(BaseModel):
    """Complete structured document in JSON format."""

    metadata: DocumentMetadata = Field(description="Document metadata")
    sections: List[DocumentSection] = Field(description="Document sections")
    summary: Optional[str] = Field(
        default=None,
        description="Document summary",
    )
    # default_factory=list gives every instance a fresh, unshared list.
    tags: List[str] = Field(
        default_factory=list,
        description="Document tags",
    )

    def getSectionsByType(self, contentType: str) -> List[DocumentSection]:
        """Return the sections whose ``contentType`` equals *contentType*.

        Sections come back in the order they are stored in ``self.sections``;
        the list is empty when nothing matches.
        """
        matching = []
        for candidate in self.sections:
            if candidate.contentType == contentType:
                matching.append(candidate)
        return matching

    def getAllTables(self) -> List[TableData]:
        """Collect every ``TableData`` element across all sections.

        Elements are selected by their own type, independent of the owning
        section's ``contentType``, and keep section-then-element order.
        """
        return [
            item
            for part in self.sections
            for item in part.elements
            if isinstance(item, TableData)
        ]

    def getAllLists(self) -> List[BulletList]:
        """Collect every ``BulletList`` element across all sections.

        Elements are selected by their own type, independent of the owning
        section's ``contentType``, and keep section-then-element order.
        """
        return [
            item
            for part in self.sections
            for item in part.elements
            if isinstance(item, BulletList)
        ]
|
|
|
|
|
|
|
|
class RenderedDocument(BaseModel):
    """A single rendered document from a renderer."""

    # Pydantic v2 reads configuration from `model_config`; the v1-style nested
    # `class Config` is deprecated there and warns at class creation. The
    # encoder itself is unchanged: when dumping to JSON, bytes are decoded as
    # UTF-8 and undecodable bytes are replaced rather than raising.
    # NOTE(review): `json_encoders` is itself a legacy option in pydantic v2;
    # it is kept so the serialized output stays the same.
    model_config = {
        "json_encoders": {
            bytes: lambda v: v.decode('utf-8', errors='replace') if isinstance(v, bytes) else v
        }
    }

    documentData: bytes = Field(description="Document content as bytes")
    mimeType: str = Field(
        description="MIME type of the document (e.g., 'text/html', 'application/pdf')"
    )
    filename: str = Field(
        description="Filename for the document (e.g., 'report.html', 'image.png')"
    )
|
|
|
|
|
|
# Resolve the self-referencing 'ListItem' string annotation on
# ListItem.subitems now that the class is fully defined.
ListItem.model_rebuild()
|