# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
from typing import Any, Dict, List, Optional, Literal

from pydantic import BaseModel, Field


class ContentPart(BaseModel):
    """A single content part, optionally linked to a parent part via ``parentId``."""

    id: str = Field(
        description="Unique content part identifier",
    )
    parentId: Optional[str] = Field(
        default=None,
        description="Optional parent content part id",
    )
    label: str = Field(
        description="Human readable label of the part",
    )
    typeGroup: str = Field(
        description="Logical type group: text, table, structure, binary, ...",
    )
    mimeType: str = Field(
        description="MIME type of the part payload",
    )
    data: str = Field(
        default="",
        description="Primary data payload, often extracted text",
    )
    metadata: Dict[str, Any] = Field(
        default_factory=dict,
        description="Arbitrary metadata for the part",
    )


class ContentExtracted(BaseModel):
    """Container for the parts extracted from one source, plus optional extras."""

    id: str = Field(
        description="Extraction id or source document id",
    )
    parts: List[ContentPart] = Field(
        default_factory=list,
        description="List of extracted parts",
    )
    summary: Optional[Dict[str, Any]] = Field(
        default=None,
        description="Optional extraction summary",
    )
    udm: Optional[Any] = Field(
        default=None,
        description="Optional UdmDocument (when outputFormat is udm or both)",
    )


class ChunkResult(BaseModel):
    """Keeps a chunk together with the AI result produced for it."""

    originalChunk: ContentPart
    aiResult: str
    chunkIndex: int
    documentId: str
    processingTime: float = 0.0
    metadata: Dict[str, Any] = Field(default_factory=dict)


class PartResult(BaseModel):
    """Keeps a content part together with the AI result produced for it."""

    originalPart: ContentPart
    aiResult: str
    partIndex: int
    documentId: str
    processingTime: float = 0.0
    metadata: Dict[str, Any] = Field(default_factory=dict)


class MergeStrategy(BaseModel):
    """Configuration describing how content parts and AI results are merged."""

    # Grouping, ordering and overall merge mode
    groupBy: str = Field(
        default="typeGroup",
        description="Field to group parts by (typeGroup, parentId, label, etc.)",
    )
    orderBy: str = Field(
        default="id",
        description="Field to order parts within groups (id, order, pageIndex, etc.)",
    )
    mergeType: Literal["concatenate", "hierarchical", "intelligent"] = Field(
        default="concatenate",
        description="How to merge content within groups",
    )
    maxSize: Optional[int] = Field(
        default=None,
        description="Maximum size for merged content in bytes",
    )

    # Per-type merge settings (free-form dictionaries)
    textMerge: Optional[Dict[str, Any]] = Field(
        default=None,
        description="Text-specific merge settings (separator, formatting, etc.)",
    )
    tableMerge: Optional[Dict[str, Any]] = Field(
        default=None,
        description="Table-specific merge settings (header handling, etc.)",
    )
    structureMerge: Optional[Dict[str, Any]] = Field(
        default=None,
        description="Structure-specific merge settings (hierarchy, etc.)",
    )
    aiResultMerge: Optional[Dict[str, Any]] = Field(
        default=None,
        description="AI result merging settings (prompt, context, etc.)",
    )

    # Chunk handling
    preserveChunks: bool = Field(
        default=False,
        description="Whether to preserve individual chunks or merge them",
    )
    chunkSeparator: str = Field(
        default="\n\n---\n\n",
        description="Separator between chunks when merging",
    )

    # Metadata handling
    preserveMetadata: bool = Field(
        default=True,
        description="Whether to preserve metadata from original parts",
    )
    metadataFields: Optional[List[str]] = Field(
        default=None,
        description="Specific metadata fields to preserve (None = all)",
    )

    # Error handling and validation
    onError: Literal["skip", "include", "fail"] = Field(
        default="skip",
        description="How to handle errors during merging",
    )
    validateContent: bool = Field(
        default=True,
        description="Whether to validate content before merging",
    )

    # Intelligent (token-aware) merging
    useIntelligentMerging: bool = Field(
        default=False,
        description="Whether to use intelligent token-aware merging",
    )
    prompt: Optional[str] = Field(
        default=None,
        description="Prompt for intelligent merging",
    )
    capabilities: Optional[Dict[str, Any]] = Field(
        default=None,
        description="Model capabilities for intelligent merging",
    )


class DocumentIntent(BaseModel):
    """Intent analysis for a single document."""

    # NOTE: the field descriptions below are runtime strings (they end up in
    # the model schema) and are therefore intentionally left in German.
    documentId: str = Field(
        description="ID des Dokuments",
    )
    intents: List[str] = Field(
        description="Liste von Intents: ['extract', 'render', 'reference'] - mehrere möglich",
    )
    extractionPrompt: Optional[str] = Field(
        default=None,
        description="Spezifischer Prompt für Extraktion (z.B. 'Extract text from images for legends')",
    )
    reasoning: str = Field(
        description="Erklärung für Debugging/Transparenz: Warum wurde dieser Intent gewählt?",
    )


class ExtractionOptions(BaseModel):
    """Options controlling document extraction and subsequent processing."""

    # Core extraction parameters
    prompt: str = Field(
        default="",
        description="Extraction prompt for AI processing",
    )
    processDocumentsIndividually: bool = Field(
        default=True,
        description="Process each document separately",
    )
    outputFormat: Literal["parts", "udm", "both"] = Field(
        default="parts",
        description="Return flat parts only, UDM tree only, or both (parts always populated; udm when udm or both)",
    )
    outputDetail: Literal["full", "structure", "references"] = Field(
        default="full",
        description="Extraction detail: full inline data, skeleton without raw payloads, or file references only",
    )
    lazyContainer: bool = Field(
        default=False,
        description="For archives: emit file entries with metadata only (no nested extraction)",
    )

    # Image processing parameters
    imageMaxPixels: int = Field(
        default=1024 * 1024,
        ge=1,
        description="Maximum pixels for image processing",
    )
    imageQuality: int = Field(
        default=85,
        ge=1,
        le=100,
        description="Image quality (1-100)",
    )

    # Merging strategy
    mergeStrategy: MergeStrategy = Field(
        default_factory=MergeStrategy,
        description="Strategy for merging extraction results",
    )

    # Optional chunking parameters (for backward compatibility)
    chunkAllowed: Optional[bool] = Field(
        default=None,
        description="Whether chunking is allowed",
    )
    maxSize: Optional[int] = Field(
        default=None,
        description="Maximum size for processing",
    )
    textChunkSize: Optional[int] = Field(
        default=None,
        description="Size for text chunks",
    )
    imageChunkSize: Optional[int] = Field(
        default=None,
        description="Size for image chunks",
    )

    # Additional processing options
    enableParallelProcessing: bool = Field(
        default=True,
        description="Enable parallel processing of chunks",
    )
    maxConcurrentChunks: int = Field(
        default=5,
        ge=1,
        le=20,
        description="Maximum number of chunks to process concurrently",
    )