# Copyright (c) 2025 Patrick Motsch
# All rights reserved.

from typing import Any, Dict, List, Optional, Literal

from pydantic import BaseModel, Field


class ContentPart(BaseModel):
    """One extracted piece of a document, addressable by id and optionally
    nested under a parent part."""

    # Identity and hierarchy
    id: str = Field(description="Unique content part identifier")
    parentId: Optional[str] = Field(
        default=None,
        description="Optional parent content part id",
    )

    # Classification
    label: str = Field(description="Human readable label of the part")
    typeGroup: str = Field(
        description="Logical type group: text, table, structure, binary, ..."
    )
    mimeType: str = Field(description="MIME type of the part payload")

    # Payload
    data: str = Field(
        default="",
        description="Primary data payload, often extracted text",
    )
    metadata: Dict[str, Any] = Field(
        default_factory=dict,
        description="Arbitrary metadata for the part",
    )


class ContentExtracted(BaseModel):
    """Container for the parts extracted from a single source document."""

    id: str = Field(description="Extraction id or source document id")
    parts: List[ContentPart] = Field(
        default_factory=list,
        description="List of extracted parts",
    )
    summary: Optional[Dict[str, Any]] = Field(
        default=None,
        description="Optional extraction summary",
    )


class ChunkResult(BaseModel):
    """Preserves the relationship between a chunk and its AI result."""

    # Consistency fix: every other model in this file documents its fields via
    # Field(description=...); these descriptions also surface in the JSON schema.
    originalChunk: ContentPart = Field(description="The original chunk that was processed")
    aiResult: str = Field(description="AI result produced for this chunk")
    chunkIndex: int = Field(description="Index of the chunk within its document")
    documentId: str = Field(description="Id of the document the chunk belongs to")
    processingTime: float = Field(default=0.0, description="Processing time for this chunk")
    metadata: Dict[str, Any] = Field(default_factory=dict, description="Arbitrary metadata for the result")


class PartResult(BaseModel):
    """Preserves the relationship between a content part and its AI result."""

    # Consistency fix: every other model in this file documents its fields via
    # Field(description=...); these descriptions also surface in the JSON schema.
    originalPart: ContentPart = Field(description="The original content part that was processed")
    aiResult: str = Field(description="AI result produced for this part")
    partIndex: int = Field(description="Index of the part within its document")
    documentId: str = Field(description="Id of the document the part belongs to")
    processingTime: float = Field(default=0.0, description="Processing time for this part")
    metadata: Dict[str, Any] = Field(default_factory=dict, description="Arbitrary metadata for the result")


class MergeStrategy(BaseModel):
    """Strategy configuration for merging content parts and AI results."""

    # --- Grouping and ordering ---
    groupBy: str = Field(
        default="typeGroup",
        description="Field to group parts by (typeGroup, parentId, label, etc.)",
    )
    orderBy: str = Field(
        default="id",
        description="Field to order parts within groups (id, order, pageIndex, etc.)",
    )
    mergeType: Literal["concatenate", "hierarchical", "intelligent"] = Field(
        default="concatenate",
        description="How to merge content within groups",
    )
    maxSize: Optional[int] = Field(
        default=None,
        description="Maximum size for merged content in bytes",
    )

    # --- Per-content-type merge settings ---
    textMerge: Optional[Dict[str, Any]] = Field(
        default=None,
        description="Text-specific merge settings (separator, formatting, etc.)",
    )
    tableMerge: Optional[Dict[str, Any]] = Field(
        default=None,
        description="Table-specific merge settings (header handling, etc.)",
    )
    structureMerge: Optional[Dict[str, Any]] = Field(
        default=None,
        description="Structure-specific merge settings (hierarchy, etc.)",
    )
    aiResultMerge: Optional[Dict[str, Any]] = Field(
        default=None,
        description="AI result merging settings (prompt, context, etc.)",
    )

    # --- Chunk handling ---
    preserveChunks: bool = Field(
        default=False,
        description="Whether to preserve individual chunks or merge them",
    )
    chunkSeparator: str = Field(
        default="\n\n---\n\n",
        description="Separator between chunks when merging",
    )

    # --- Metadata handling ---
    preserveMetadata: bool = Field(
        default=True,
        description="Whether to preserve metadata from original parts",
    )
    metadataFields: Optional[List[str]] = Field(
        default=None,
        description="Specific metadata fields to preserve (None = all)",
    )

    # --- Error handling and validation ---
    onError: Literal["skip", "include", "fail"] = Field(
        default="skip",
        description="How to handle errors during merging",
    )
    validateContent: bool = Field(
        default=True,
        description="Whether to validate content before merging",
    )

    # --- Intelligent (token-aware) merging ---
    useIntelligentMerging: bool = Field(
        default=False,
        description="Whether to use intelligent token-aware merging",
    )
    prompt: Optional[str] = Field(
        default=None,
        description="Prompt for intelligent merging",
    )
    capabilities: Optional[Dict[str, Any]] = Field(
        default=None,
        description="Model capabilities for intelligent merging",
    )


class DocumentIntent(BaseModel):
    """Intent analysis for a single document.

    Consistency fix: docstring and field descriptions were in German while the
    rest of the file documents in English; descriptions surface in the
    generated JSON schema, so they are translated here.
    """

    documentId: str = Field(description="Id of the document")
    intents: List[str] = Field(
        description="List of intents: ['extract', 'render', 'reference'] - multiple possible"
    )
    extractionPrompt: Optional[str] = Field(
        default=None,
        description="Specific prompt for extraction (e.g. 'Extract text from images for legends')",
    )
    reasoning: str = Field(
        description="Explanation for debugging/transparency: why was this intent chosen?"
    )


class ExtractionOptions(BaseModel):
    """Options for document extraction and processing with clear data structures."""

    # --- Core extraction parameters ---
    prompt: str = Field(description="Extraction prompt for AI processing")
    processDocumentsIndividually: bool = Field(
        default=True,
        description="Process each document separately",
    )

    # --- Image processing parameters ---
    imageMaxPixels: int = Field(
        default=1024 * 1024,
        ge=1,
        description="Maximum pixels for image processing",
    )
    imageQuality: int = Field(
        default=85,
        ge=1,
        le=100,
        description="Image quality (1-100)",
    )

    # --- Merging strategy ---
    mergeStrategy: MergeStrategy = Field(
        description="Strategy for merging extraction results"
    )

    # --- Optional chunking parameters (kept for backward compatibility) ---
    chunkAllowed: Optional[bool] = Field(
        default=None,
        description="Whether chunking is allowed",
    )
    maxSize: Optional[int] = Field(
        default=None,
        description="Maximum size for processing",
    )
    textChunkSize: Optional[int] = Field(
        default=None,
        description="Size for text chunks",
    )
    imageChunkSize: Optional[int] = Field(
        default=None,
        description="Size for image chunks",
    )

    # --- Additional processing options ---
    enableParallelProcessing: bool = Field(
        default=True,
        description="Enable parallel processing of chunks",
    )
    maxConcurrentChunks: int = Field(
        default=5,
        ge=1,
        le=20,
        description="Maximum number of chunks to process concurrently",
    )