130 lines
6 KiB
Python
130 lines
6 KiB
Python
from typing import Any, Dict, List, Optional, Literal, Union
|
|
from pydantic import BaseModel, Field
|
|
from datetime import datetime
|
|
|
|
|
|
class DocumentMetadata(BaseModel):
    """Document-level metadata attached to a structured document.

    Only ``title`` is required; every other field falls back to a default.
    """

    title: str = Field(..., description="Document title")
    author: Optional[str] = Field(None, description="Document author")
    # ``datetime.now`` is called without a tz argument, so the default is a
    # naive timestamp computed each time an instance is created.
    created_at: datetime = Field(
        default_factory=datetime.now,
        description="Creation timestamp",
    )
    source_documents: List[str] = Field(
        default_factory=list,
        description="Source document IDs",
    )
    extraction_method: str = Field(
        "ai_extraction",
        description="Method used for extraction",
    )
    version: str = Field("1.0", description="Document version")
|
|
|
|
|
|
class TableData(BaseModel):
    """A table expressed as column headers plus rows of string cells.

    No validator in this class checks that a row has the same number of
    cells as ``headers``.
    """

    headers: List[str] = Field(..., description="Table column headers")
    rows: List[List[str]] = Field(..., description="Table data rows")
    caption: Optional[str] = Field(None, description="Table caption")
    metadata: Dict[str, Any] = Field(
        default_factory=dict,
        description="Table metadata",
    )
|
|
|
|
|
|
class ListItem(BaseModel):
    """One entry of a list, optionally carrying nested entries of the same type."""

    text: str = Field(..., description="List item text")
    # Self-reference is written as a string because the class name is not yet
    # bound while the class body is being evaluated.
    subitems: Optional[List["ListItem"]] = Field(
        None,
        description="Nested sub-items",
    )
    metadata: Dict[str, Any] = Field(
        default_factory=dict,
        description="Item metadata",
    )
|
|
|
|
|
|
class BulletList(BaseModel):
    """A list of ``ListItem`` entries rendered as bullets, numbers, or a checklist.

    ``list_type`` defaults to ``"bullet"``.
    """

    items: List[ListItem] = Field(..., description="List items")
    list_type: Literal["bullet", "numbered", "checklist"] = Field(
        "bullet",
        description="List type",
    )
    metadata: Dict[str, Any] = Field(
        default_factory=dict,
        description="List metadata",
    )
|
|
|
|
|
|
class Paragraph(BaseModel):
    """A block of paragraph text with an optional free-form formatting mapping."""

    text: str = Field(..., description="Paragraph text")
    formatting: Optional[Dict[str, Any]] = Field(
        None,
        description="Text formatting (bold, italic, etc.)",
    )
    metadata: Dict[str, Any] = Field(
        default_factory=dict,
        description="Paragraph metadata",
    )
|
|
|
|
|
|
class Heading(BaseModel):
    """A heading line and its nesting level.

    ``level`` is required and constrained to the inclusive range 1-6.
    """

    text: str = Field(..., description="Heading text")
    level: int = Field(..., ge=1, le=6, description="Heading level (1-6)")
    metadata: Dict[str, Any] = Field(
        default_factory=dict,
        description="Heading metadata",
    )
|
|
|
|
|
|
class CodeBlock(BaseModel):
    """A snippet of source code and, optionally, the language it is written in."""

    code: str = Field(..., description="Code content")
    language: Optional[str] = Field(None, description="Programming language")
    metadata: Dict[str, Any] = Field(
        default_factory=dict,
        description="Code block metadata",
    )
|
|
|
|
|
|
class Image(BaseModel):
    """An embedded image carried as a string payload with optional alt text and caption.

    ``data`` is described as base64-encoded; this class only types it as
    ``str`` and does not itself check the encoding.
    """

    data: str = Field(..., description="Base64 encoded image data")
    alt_text: Optional[str] = Field(None, description="Alternative text")
    caption: Optional[str] = Field(None, description="Image caption")
    metadata: Dict[str, Any] = Field(
        default_factory=dict,
        description="Image metadata",
    )
|
|
|
|
|
|
class DocumentSection(BaseModel):
    """A section of the document containing one or more content elements.

    Fields:
        id: Unique section identifier (required).
        title: Optional section title.
        content_type: Primary content type of the section; one of the literal
            values listed in the annotation.
        elements: The content elements, each one of the element models.
        order: Position of the section within the document.
        metadata: Free-form section metadata; defaults to an empty dict.
    """

    id: str = Field(description="Unique section identifier")
    title: Optional[str] = Field(default=None, description="Section title")
    content_type: Literal[
        "table", "list", "paragraph", "heading", "code", "image", "mixed"
    ] = Field(description="Primary content type")
    # Heading is listed BEFORE Paragraph on purpose. Paragraph's only required
    # field ("text") is also required by Heading, so when union members are
    # tried left to right (the Pydantic v1 default) a heading payload would
    # validate as a Paragraph first and silently lose its "level". With Heading
    # first, a paragraph payload still fails Heading (no "level") and falls
    # through to Paragraph, so both kinds round-trip correctly.
    elements: List[
        Union[TableData, BulletList, Heading, Paragraph, CodeBlock, Image]
    ] = Field(description="Content elements in this section")
    order: int = Field(description="Section order in document")
    metadata: Dict[str, Any] = Field(
        default_factory=dict,
        description="Section metadata",
    )
|
|
|
|
|
|
class StructuredDocument(BaseModel):
    """Top-level structured document: metadata, ordered sections, summary and tags.

    The helper methods below only read ``sections``; they do not modify the
    document.
    """

    metadata: DocumentMetadata = Field(..., description="Document metadata")
    sections: List[DocumentSection] = Field(..., description="Document sections")
    summary: Optional[str] = Field(None, description="Document summary")
    tags: List[str] = Field(default_factory=list, description="Document tags")

    def get_sections_by_type(self, content_type: str) -> List[DocumentSection]:
        """Return the sections whose ``content_type`` equals ``content_type``.

        Sections are returned in the order they appear in ``sections``.
        """
        return [sec for sec in self.sections if sec.content_type == content_type]

    def get_all_tables(self) -> List[TableData]:
        """Collect every ``TableData`` element across all sections, in order."""
        found: List[TableData] = []
        for sec in self.sections:
            found.extend(item for item in sec.elements if isinstance(item, TableData))
        return found

    def get_all_lists(self) -> List[BulletList]:
        """Collect every ``BulletList`` element across all sections, in order."""
        found: List[BulletList] = []
        for sec in self.sections:
            found.extend(item for item in sec.elements if isinstance(item, BulletList))
        return found
|
|
|
|
|
|
class JsonChunkResult(BaseModel):
    """Outcome of processing one chunk: the section produced and how long it took."""

    chunk_id: str = Field(..., description="Chunk identifier")
    document_section: DocumentSection = Field(
        ...,
        description="Structured content from this chunk",
    )
    processing_time: float = Field(..., description="Processing time in seconds")
    metadata: Dict[str, Any] = Field(
        default_factory=dict,
        description="Chunk processing metadata",
    )
|
|
|
|
|
|
class JsonMergeResult(BaseModel):
    """Outcome of merging chunk results into a single ``StructuredDocument``."""

    merged_document: StructuredDocument = Field(
        ...,
        description="Merged structured document",
    )
    merge_strategy: str = Field(..., description="Strategy used for merging")
    chunks_processed: int = Field(..., description="Number of chunks processed")
    merge_time: float = Field(..., description="Time taken to merge chunks")
    metadata: Dict[str, Any] = Field(
        default_factory=dict,
        description="Merge process metadata",
    )
|
|
|
|
|
|
# Resolve ListItem's self-referential annotation (compatible with Pydantic v1 and v2).
# The v2 API is feature-detected instead of wrapping the call in
# ``try/except AttributeError``: that way an AttributeError raised *inside*
# the rebuild surfaces as a real error rather than being mistaken for
# "this is Pydantic v1" and silently retried through the legacy path.
if hasattr(ListItem, "model_rebuild"):
    # Pydantic v2
    ListItem.model_rebuild()
else:
    # Pydantic v1
    ListItem.update_forward_refs()
|