from typing import Any, Dict, List, Optional, Literal

from pydantic import BaseModel, Field


class ContentPart(BaseModel):
    """A single extracted piece of content together with its descriptive fields."""

    id: str = Field(description="Unique content part identifier")
    parentId: Optional[str] = Field(default=None, description="Optional parent content part id")
    label: str = Field(description="Human readable label of the part")
    typeGroup: str = Field(description="Logical type group: text, table, structure, binary, ...")
    mimeType: str = Field(description="MIME type of the part payload")
    data: str = Field(default="", description="Primary data payload, often extracted text")
    metadata: Dict[str, Any] = Field(default_factory=dict, description="Arbitrary metadata for the part")


class ContentExtracted(BaseModel):
    """Result of an extraction: an id plus the list of extracted parts."""

    id: str = Field(description="Extraction id or source document id")
    parts: List[ContentPart] = Field(default_factory=list, description="List of extracted parts")
    summary: Optional[Dict[str, Any]] = Field(default=None, description="Optional extraction summary")


class ChunkResult(BaseModel):
    """Preserves the relationship between a chunk and its AI result."""

    originalChunk: ContentPart
    aiResult: str
    chunkIndex: int
    documentId: str
    # Defaults to 0.0; the time unit is not specified in this module.
    processingTime: float = 0.0
    # default_factory gives every instance its own dict.
    metadata: Dict[str, Any] = Field(default_factory=dict)


class PartResult(BaseModel):
    """Preserves the relationship between a content part and its AI result."""

    originalPart: ContentPart
    aiResult: str
    partIndex: int
    documentId: str
    # Defaults to 0.0; the time unit is not specified in this module.
    processingTime: float = 0.0
    # default_factory gives every instance its own dict.
    metadata: Dict[str, Any] = Field(default_factory=dict)


class MergeStrategy(BaseModel):
    """Strategy configuration for merging content parts and AI results.

    Every field has a default, so ``MergeStrategy()`` is a valid configuration.
    The two helper methods only read this configuration; they do not perform
    any merging themselves.
    """

    # Grouping configuration
    groupBy: str = Field(
        default="typeGroup",
        description="Field to group parts by (typeGroup, parentId, label, etc.)"
    )

    # Ordering configuration
    orderBy: str = Field(
        default="id",
        description="Field to order parts within groups (id, order, pageIndex, etc.)"
    )

    # Merge behavior
    mergeType: Literal["concatenate", "hierarchical", "intelligent"] = Field(
        default="concatenate",
        description="How to merge content within groups"
    )

    # Size limits
    maxSize: Optional[int] = Field(
        default=None,
        description="Maximum size for merged content in bytes"
    )

    # Type-specific merge settings
    textMerge: Optional[Dict[str, Any]] = Field(
        default=None,
        description="Text-specific merge settings (separator, formatting, etc.)"
    )
    tableMerge: Optional[Dict[str, Any]] = Field(
        default=None,
        description="Table-specific merge settings (header handling, etc.)"
    )
    structureMerge: Optional[Dict[str, Any]] = Field(
        default=None,
        description="Structure-specific merge settings (hierarchy, etc.)"
    )

    # AI result merging
    aiResultMerge: Optional[Dict[str, Any]] = Field(
        default=None,
        description="AI result merging settings (prompt, context, etc.)"
    )

    # Chunk handling
    preserveChunks: bool = Field(
        default=False,
        description="Whether to preserve individual chunks or merge them"
    )
    chunkSeparator: str = Field(
        default="\n\n---\n\n",
        description="Separator between chunks when merging"
    )

    # Metadata handling
    preserveMetadata: bool = Field(
        default=True,
        description="Whether to preserve metadata from original parts"
    )
    metadataFields: Optional[List[str]] = Field(
        default=None,
        description="Specific metadata fields to preserve (None = all)"
    )

    # Error handling
    onError: Literal["skip", "include", "fail"] = Field(
        default="skip",
        description="How to handle errors during merging"
    )

    # Validation
    validateContent: bool = Field(
        default=True,
        description="Whether to validate content before merging"
    )

    def getTypeSpecificSettings(self, typeGroup: str) -> Dict[str, Any]:
        """Get type-specific merge settings for a content type.

        Args:
            typeGroup: Content type group; "text", "table" and "structure"
                have dedicated settings fields.

        Returns:
            The configured settings dict for ``typeGroup``, or an empty dict
            when the group is not one of the three above or its settings are
            unset or empty.
        """
        settingsByGroup = {
            "text": self.textMerge,
            "table": self.tableMerge,
            "structure": self.structureMerge,
        }
        # `or {}` covers unknown groups as well as None/empty settings.
        return settingsByGroup.get(typeGroup) or {}

    def shouldPreserveChunk(self, chunk: Dict[str, Any]) -> bool:
        """Determine if a chunk should be preserved based on strategy.

        Args:
            chunk: Chunk as a dict; its optional "metadata" entry is checked
                for a truthy "error" value.

        Returns:
            False when ``preserveChunks`` is off, or when ``onError`` is
            "skip" and the chunk metadata carries a truthy "error";
            True otherwise.
        """
        if not self.preserveChunks:
            return False

        # Check if chunk has error metadata. The "metadata" key may be present
        # with a None value; dict.get's default only covers a missing key, so
        # coalesce explicitly instead of calling .get on None.
        chunkMetadata = chunk.get("metadata") or {}
        if self.onError == "skip" and chunkMetadata.get("error"):
            return False

        return True