Cleanup for pydantic v2: removed unnecessary pydantic-to-dict conversions; unnecessary unions replaced with clean classes

This commit is contained in:
ValueOn AG 2025-10-24 22:46:05 +02:00
parent 4f7bba5f33
commit 8523da7fe2
15 changed files with 188 additions and 319 deletions

View file

@ -50,7 +50,7 @@ class AiAnthropic(BaseConnectorAi):
connectorType="anthropic",
apiUrl="https://api.anthropic.com/v1/messages",
temperature=0.2,
maxTokens=200000,
maxTokens=8192,
contextLength=200000,
costPer1kTokensInput=0.015,
costPer1kTokensOutput=0.075,
@ -75,7 +75,7 @@ class AiAnthropic(BaseConnectorAi):
connectorType="anthropic",
apiUrl="https://api.anthropic.com/v1/messages",
temperature=0.2,
maxTokens=200000,
maxTokens=8192,
contextLength=200000,
costPer1kTokensInput=0.015,
costPer1kTokensOutput=0.075,

View file

@ -720,22 +720,9 @@ class DatabaseConnector:
logger.info(f"Initial ID {initialId} for table {table} registered")
return success
else:
# Check if the existing initial ID still exists in the table
existingInitialId = systemData[table]
records = self.getRecordset(
model_class, recordFilter={"id": existingInitialId}
)
if not records:
# The initial record no longer exists, update to the new one
systemData[table] = initialId
success = self._saveSystemTable(systemData)
if success:
logger.info(
f"Initial ID updated from {existingInitialId} to {initialId} for table {table}"
)
return success
else:
return True
# Table already has an initial ID registered
logger.debug(f"Table {table} already has initial ID {systemData[table]}")
return True
except Exception as e:
logger.error(f"Error registering the initial ID for table {table}: {e}")
return False

View file

@ -135,7 +135,6 @@ class AiCallOptions(BaseModel):
compressPrompt: bool = Field(default=True, description="Whether to compress the prompt")
compressContext: bool = Field(default=True, description="If False: process each chunk; If True: summarize and work on summary")
processDocumentsIndividually: bool = Field(default=True, description="If True, process each document separately; else pool docs")
maxContextBytes: Optional[int] = Field(default=None, description="Hard cap for extracted context size passed to the model")
maxCost: Optional[float] = Field(default=None, description="Max cost budget")
maxProcessingTime: Optional[int] = Field(default=None, description="Max processing time in seconds")
processingMode: ProcessingModeEnum = Field(default=ProcessingModeEnum.BASIC, description="Processing mode")
@ -145,7 +144,6 @@ class AiCallOptions(BaseModel):
# Model generation parameters
temperature: Optional[float] = Field(default=None, ge=0.0, le=2.0, description="Temperature for response generation (0.0-2.0, lower = more consistent)")
maxTokens: Optional[int] = Field(default=None, ge=1, le=32000, description="Maximum tokens in response")
maxParts: Optional[int] = Field(default=1000, ge=1, le=1000, description="Maximum number of continuation parts to fetch")
@ -170,32 +168,6 @@ class AiCallResponse(BaseModel):
errorCount: int = Field(default=0, description="0 for success, 1+ for errors")
class EnhancedAiCallOptions(AiCallOptions):
"""Enhanced options for improved document processing with chunk mapping."""
# Parallel processing
enableParallelProcessing: bool = Field(
default=True,
description="Enable parallel processing of chunks"
)
maxConcurrentChunks: int = Field(
default=5,
ge=1,
le=20,
description="Maximum number of chunks to process concurrently"
)
# Chunk mapping
preserveChunkMetadata: bool = Field(
default=True,
description="Preserve chunk metadata during processing"
)
chunkSeparator: str = Field(
default="\n\n---\n\n",
description="Separator between chunks in merged output"
)
class AiModelCall(BaseModel):
"""Standardized input for AI model calls."""

View file

@ -1,6 +1,9 @@
from typing import Any, Dict, List, Optional, Literal
from typing import Any, Dict, List, Optional, Literal, TYPE_CHECKING
from pydantic import BaseModel, Field
if TYPE_CHECKING:
from modules.datamodels.datamodelAi import OperationTypeEnum
class ContentPart(BaseModel):
id: str = Field(description="Unique content part identifier")
@ -40,106 +43,49 @@ class PartResult(BaseModel):
class MergeStrategy(BaseModel):
"""Strategy configuration for merging content parts and AI results."""
groupBy: str = Field(default="typeGroup", description="Field to group parts by (typeGroup, parentId, label, etc.)")
orderBy: str = Field(default="id", description="Field to order parts within groups (id, order, pageIndex, etc.)")
mergeType: Literal["concatenate", "hierarchical", "intelligent"] = Field(default="concatenate", description="How to merge content within groups")
maxSize: Optional[int] = Field(default=None, description="Maximum size for merged content in bytes")
textMerge: Optional[Dict[str, Any]] = Field(default=None, description="Text-specific merge settings (separator, formatting, etc.)")
tableMerge: Optional[Dict[str, Any]] = Field(default=None, description="Table-specific merge settings (header handling, etc.)")
structureMerge: Optional[Dict[str, Any]] = Field(default=None, description="Structure-specific merge settings (hierarchy, etc.)")
aiResultMerge: Optional[Dict[str, Any]] = Field(default=None, description="AI result merging settings (prompt, context, etc.)")
preserveChunks: bool = Field(default=False, description="Whether to preserve individual chunks or merge them")
chunkSeparator: str = Field(default="\n\n---\n\n", description="Separator between chunks when merging")
preserveMetadata: bool = Field(default=True, description="Whether to preserve metadata from original parts")
metadataFields: Optional[List[str]] = Field(default=None, description="Specific metadata fields to preserve (None = all)")
onError: Literal["skip", "include", "fail"] = Field(default="skip", description="How to handle errors during merging")
validateContent: bool = Field(default=True, description="Whether to validate content before merging")
useIntelligentMerging: bool = Field(default=False, description="Whether to use intelligent token-aware merging")
prompt: Optional[str] = Field(default=None, description="Prompt for intelligent merging")
capabilities: Optional[Dict[str, Any]] = Field(default=None, description="Model capabilities for intelligent merging")
# Grouping configuration
groupBy: str = Field(
default="typeGroup",
description="Field to group parts by (typeGroup, parentId, label, etc.)"
)
# Ordering configuration
orderBy: str = Field(
default="id",
description="Field to order parts within groups (id, order, pageIndex, etc.)"
)
class ExtractionOptions(BaseModel):
"""Options for document extraction and processing with clear data structures."""
# Merge behavior
mergeType: Literal["concatenate", "hierarchical", "intelligent"] = Field(
default="concatenate",
description="How to merge content within groups"
)
# Core extraction parameters
prompt: str = Field(description="Extraction prompt for AI processing")
operationType: 'OperationTypeEnum' = Field(description="Type of operation for AI processing")
processDocumentsIndividually: bool = Field(default=True, description="Process each document separately")
# Size limits
maxSize: Optional[int] = Field(
default=None,
description="Maximum size for merged content in bytes"
)
# Image processing parameters
imageMaxPixels: int = Field(default=1024 * 1024, ge=1, description="Maximum pixels for image processing")
imageQuality: int = Field(default=85, ge=1, le=100, description="Image quality (1-100)")
# Type-specific merge settings
textMerge: Optional[Dict[str, Any]] = Field(
default=None,
description="Text-specific merge settings (separator, formatting, etc.)"
)
# Merging strategy
mergeStrategy: MergeStrategy = Field(description="Strategy for merging extraction results")
tableMerge: Optional[Dict[str, Any]] = Field(
default=None,
description="Table-specific merge settings (header handling, etc.)"
)
# Optional chunking parameters (for backward compatibility)
chunkAllowed: Optional[bool] = Field(default=None, description="Whether chunking is allowed")
maxSize: Optional[int] = Field(default=None, description="Maximum size for processing")
textChunkSize: Optional[int] = Field(default=None, description="Size for text chunks")
imageChunkSize: Optional[int] = Field(default=None, description="Size for image chunks")
structureMerge: Optional[Dict[str, Any]] = Field(
default=None,
description="Structure-specific merge settings (hierarchy, etc.)"
)
# AI result merging
aiResultMerge: Optional[Dict[str, Any]] = Field(
default=None,
description="AI result merging settings (prompt, context, etc.)"
)
# Chunk handling
preserveChunks: bool = Field(
default=False,
description="Whether to preserve individual chunks or merge them"
)
chunkSeparator: str = Field(
default="\n\n---\n\n",
description="Separator between chunks when merging"
)
# Metadata handling
preserveMetadata: bool = Field(
default=True,
description="Whether to preserve metadata from original parts"
)
metadataFields: Optional[List[str]] = Field(
default=None,
description="Specific metadata fields to preserve (None = all)"
)
# Error handling
onError: Literal["skip", "include", "fail"] = Field(
default="skip",
description="How to handle errors during merging"
)
# Validation
validateContent: bool = Field(
default=True,
description="Whether to validate content before merging"
)
def getTypeSpecificSettings(self, typeGroup: str) -> Dict[str, Any]:
"""Get type-specific merge settings for a content type."""
if typeGroup == "text" and self.textMerge:
return self.textMerge
elif typeGroup == "table" and self.tableMerge:
return self.tableMerge
elif typeGroup == "structure" and self.structureMerge:
return self.structureMerge
else:
return {}
def shouldPreserveChunk(self, chunk: Dict[str, Any]) -> bool:
"""Determine if a chunk should be preserved based on strategy."""
if not self.preserveChunks:
return False
# Check if chunk has error metadata
if self.onError == "skip" and chunk.get("metadata", {}).get("error"):
return False
return True
# Additional processing options
enableParallelProcessing: bool = Field(default=True, description="Enable parallel processing of chunks")
maxConcurrentChunks: int = Field(default=5, ge=1, le=20, description="Maximum number of chunks to process concurrently")
class Config:
arbitraryTypesAllowed = True # Allow OperationTypeEnum import

View file

@ -127,7 +127,7 @@ class AiObjects:
logger.info(f"Attempting AI call with model: {model.name} (attempt {attempt + 1}/{len(failoverModelList)})")
# Call the model
response = await self._callWithModel(model, prompt, context)
response = await self._callWithModel(model, prompt, context, options)
logger.info(f"✅ AI call successful with model: {model.name}")
return response
@ -204,7 +204,7 @@ class AiObjects:
if partSize <= modelContextBytes:
# Part fits - call AI directly
response = await self._callWithModel(model, prompt, contentPart.data)
response = await self._callWithModel(model, prompt, contentPart.data, options)
logger.info(f"✅ Content part processed successfully with model: {model.name}")
return response
else:
@ -216,7 +216,7 @@ class AiObjects:
# Process each chunk
chunkResults = []
for chunk in chunks:
chunkResponse = await self._callWithModel(model, prompt, chunk['data'])
chunkResponse = await self._callWithModel(model, prompt, chunk['data'], options)
chunkResults.append(chunkResponse)
# Merge chunk results
@ -393,7 +393,7 @@ class AiObjects:
errorCount=1
)
async def _callWithModel(self, model: AiModel, prompt: str, context: str) -> AiCallResponse:
async def _callWithModel(self, model: AiModel, prompt: str, context: str, options: AiCallOptions = None) -> AiCallResponse:
"""Call a specific model and return the response."""
# Calculate input bytes from prompt and context
inputBytes = len((prompt + context).encode('utf-8'))
@ -430,7 +430,8 @@ class AiObjects:
# Create standardized call object
modelCall = AiModelCall(
messages=messages,
model=model
model=model,
options=options or {}
)
# Call the model with standardized interface

View file

@ -873,7 +873,7 @@ class ChatObjects:
stat = ChatStat(**statData)
# Create the stat record in the database
created = self.db.recordCreate(ChatStat, stat.model_dump())
created = self.db.recordCreate(ChatStat, stat)
# Return the created ChatStat
return ChatStat(**created)
@ -937,7 +937,7 @@ class ChatObjects:
items.append({
"type": "message",
"createdAt": msg_timestamp,
"item": chat_message.model_dump()
"item": chat_message
})
# Get logs
@ -952,7 +952,7 @@ class ChatObjects:
items.append({
"type": "log",
"createdAt": log_timestamp,
"item": chat_log.model_dump()
"item": chat_log
})
# Get stats list
@ -966,7 +966,7 @@ class ChatObjects:
items.append({
"type": "stat",
"createdAt": stat_timestamp,
"item": stat.model_dump()
"item": stat
})
# Sort all items by createdAt timestamp for chronological order

View file

@ -48,11 +48,8 @@ async def create_prompt(
"""Create a new prompt"""
managementInterface = interfaceDbComponentObjects.getInterface(currentUser)
# Convert Prompt to dict for interface
prompt_data = prompt.model_dump()
# Create prompt
newPrompt = managementInterface.createPrompt(prompt_data)
newPrompt = managementInterface.createPrompt(prompt)
return Prompt(**newPrompt)

View file

@ -92,11 +92,8 @@ async def create_user(
"""Create a new user"""
appInterface = interfaceDbAppObjects.getInterface(currentUser)
# Convert User to dict for interface
user_dict = user_data.model_dump()
# Create user
newUser = appInterface.createUser(user_dict)
newUser = appInterface.createUser(user_data)
return newUser
@ -119,11 +116,8 @@ async def update_user(
detail=f"User with ID {userId} not found"
)
# Convert User to dict for interface
update_data = userData.model_dump()
# Update user
updatedUser = appInterface.updateUser(userId, update_data)
updatedUser = appInterface.updateUser(userId, userData)
if not updatedUser:
raise HTTPException(

View file

@ -5,7 +5,7 @@ import time
from typing import Dict, Any, List, Optional, Tuple, Union
from modules.datamodels.datamodelChat import ChatDocument
from modules.datamodels.datamodelAi import AiCallRequest, AiCallOptions, OperationTypeEnum, PriorityEnum
from modules.datamodels.datamodelExtraction import ChunkResult, ContentExtracted, PartResult
from modules.datamodels.datamodelExtraction import ChunkResult, ContentExtracted, PartResult, ExtractionOptions, MergeStrategy
from modules.services.serviceExtraction.mainServiceExtraction import ExtractionService
logger = logging.getLogger(__name__)
@ -33,19 +33,6 @@ class SubDocumentProcessing:
self._extractionService = ExtractionService(self.services)
return self._extractionService
def _calculateMaxContextBytes(self, options: Optional[AiCallOptions]) -> int:
"""Calculate maximum context bytes based on model capabilities and options."""
if options and options.maxContextBytes:
return options.maxContextBytes
# Default model capabilities (this should be enhanced with actual model registry)
defaultMaxTokens = 4000
safetyMargin = options.safetyMargin if options else 0.1
# Calculate bytes (4 chars per token estimation)
maxContextBytes = int(defaultMaxTokens * (1 - safetyMargin) * 4)
return maxContextBytes
async def processDocumentsPerChunk(
self,
@ -68,22 +55,23 @@ class SubDocumentProcessing:
if not documents:
return ""
# Build extraction options WITHOUT chunking parameters
extractionOptions: Dict[str, Any] = {
"prompt": prompt,
"operationType": options.operationType if options else OperationTypeEnum.DATA_EXTRACT,
"processDocumentsIndividually": True,
# REMOVED: maxSize, textChunkSize, imageChunkSize
"mergeStrategy": {
"useIntelligentMerging": True,
"prompt": prompt,
"groupBy": "typeGroup",
"orderBy": "id",
"mergeType": "concatenate"
},
}
# Build extraction options using Pydantic model
mergeStrategy = MergeStrategy(
useIntelligentMerging=True,
prompt=prompt,
groupBy="typeGroup",
orderBy="id",
mergeType="concatenate"
)
logger.debug(f"Per-chunk extraction options: prompt length={len(extractionOptions.get('prompt', ''))} chars, operationType={extractionOptions.get('operationType')}")
extractionOptions = ExtractionOptions(
prompt=prompt,
operationType=options.operationType if options else OperationTypeEnum.DATA_EXTRACT,
processDocumentsIndividually=True,
mergeStrategy=mergeStrategy
)
logger.debug(f"Per-chunk extraction options: prompt length={len(extractionOptions.prompt)} chars, operationType={extractionOptions.operationType}")
try:
# Extract content WITHOUT chunking
@ -120,21 +108,23 @@ class SubDocumentProcessing:
if not documents:
return {"metadata": {"title": "Empty Document"}, "sections": []}
# Build extraction options WITHOUT chunking parameters
extractionOptions: Dict[str, Any] = {
"prompt": prompt,
"operationType": options.operationType if options else OperationTypeEnum.DATA_EXTRACT,
"processDocumentsIndividually": True,
"mergeStrategy": {
"useIntelligentMerging": True,
"prompt": prompt,
"groupBy": "typeGroup",
"orderBy": "id",
"mergeType": "concatenate"
},
}
# Build extraction options using Pydantic model
mergeStrategy = MergeStrategy(
useIntelligentMerging=True,
prompt=prompt,
groupBy="typeGroup",
orderBy="id",
mergeType="concatenate"
)
logger.debug(f"Per-chunk extraction options (JSON mode): prompt length={len(extractionOptions.get('prompt', ''))} chars, operationType={extractionOptions.get('operationType')}")
extractionOptions = ExtractionOptions(
prompt=prompt,
operationType=options.operationType if options else OperationTypeEnum.DATA_EXTRACT,
processDocumentsIndividually=True,
mergeStrategy=mergeStrategy
)
logger.debug(f"Per-chunk extraction options (JSON mode): prompt length={len(extractionOptions.prompt)} chars, operationType={extractionOptions.operationType}")
try:
# Extract content WITHOUT chunking
@ -205,31 +195,25 @@ class SubDocumentProcessing:
if not documents:
return {"metadata": {"title": "Empty Document"}, "sections": []}
# Get model capabilities for size calculation
model_capabilities = self._getModelCapabilitiesForContent(custom_prompt, documents, options)
# Build extraction options using Pydantic model (model-aware chunking in AI call phase)
mergeStrategy = MergeStrategy(
useIntelligentMerging=True,
prompt=custom_prompt,
groupBy="typeGroup",
orderBy="id",
mergeType="concatenate"
)
# Build extraction options for chunking with intelligent merging
extractionOptions: Dict[str, Any] = {
"prompt": custom_prompt, # Use the custom prompt instead of default
"operationType": options.operationType if options else OperationTypeEnum.DATA_EXTRACT,
"processDocumentsIndividually": True, # Process each document separately
"maxSize": model_capabilities["maxContextBytes"],
"chunkAllowed": True,
"textChunkSize": model_capabilities["textChunkSize"],
"imageChunkSize": model_capabilities["imageChunkSize"],
"imageMaxPixels": 1024 * 1024,
"imageQuality": 85,
"mergeStrategy": {
"useIntelligentMerging": True, # Enable intelligent token-aware merging
"capabilities": model_capabilities,
"prompt": custom_prompt, # Use the custom prompt
"groupBy": "typeGroup",
"orderBy": "id",
"mergeType": "concatenate"
},
}
extractionOptions = ExtractionOptions(
prompt=custom_prompt, # Use the custom prompt instead of default
operationType=options.operationType if options else OperationTypeEnum.DATA_EXTRACT,
processDocumentsIndividually=True, # Process each document separately
imageMaxPixels=1024 * 1024,
imageQuality=85,
mergeStrategy=mergeStrategy
)
logger.debug(f"Per-chunk extraction options (JSON mode): prompt length={len(extractionOptions.get('prompt', ''))} chars, operationType={extractionOptions.get('operationType')}")
logger.debug(f"Per-chunk extraction options (JSON mode): prompt length={len(extractionOptions.prompt)} chars, operationType={extractionOptions.operationType}")
try:
# Extract content with chunking
@ -1042,15 +1026,13 @@ CONTINUATION INSTRUCTIONS:
content_parts.append(content_part)
# Use existing merging strategy from options
merge_strategy = {
"useIntelligentMerging": True,
"groupBy": "documentId", # Group by document
"orderBy": "partIndex", # Order by part index
"mergeType": "concatenate"
}
merge_strategy = MergeStrategy(
useIntelligentMerging=True,
groupBy="documentId", # Group by document
orderBy="partIndex", # Order by part index
mergeType="concatenate"
)
if options and hasattr(options, 'mergeStrategy'):
merge_strategy.update(options.mergeStrategy)
# Apply existing merging logic using the sophisticated merging system
from modules.services.serviceExtraction.subPipeline import _applyMerging
@ -1095,15 +1077,13 @@ CONTINUATION INSTRUCTIONS:
content_parts.append(content_part)
# Use existing merging strategy for JSON mode
merge_strategy = {
"useIntelligentMerging": True,
"groupBy": "documentId", # Group by document
"orderBy": "partIndex", # Order by part index
"mergeType": "concatenate"
}
merge_strategy = MergeStrategy(
useIntelligentMerging=True,
groupBy="documentId", # Group by document
orderBy="partIndex", # Order by part index
mergeType="concatenate"
)
if options and hasattr(options, 'mergeStrategy'):
merge_strategy.update(options.mergeStrategy)
# Apply existing merging logic using the sophisticated merging system
from modules.services.serviceExtraction.subPipeline import _applyMerging
@ -1234,15 +1214,13 @@ CONTINUATION INSTRUCTIONS:
content_parts.append(content_part)
# Use existing merging strategy from options
merge_strategy = {
"useIntelligentMerging": True,
"groupBy": "documentId", # Group by document
"orderBy": "chunkIndex", # Order by chunk index
"mergeType": "concatenate"
}
merge_strategy = MergeStrategy(
useIntelligentMerging=True,
groupBy="documentId", # Group by document
orderBy="chunkIndex", # Order by chunk index
mergeType="concatenate"
)
if options and hasattr(options, 'mergeStrategy'):
merge_strategy.update(options.mergeStrategy)
# Apply existing merging logic using the sophisticated merging system
from modules.services.serviceExtraction.subPipeline import _applyMerging
@ -1297,15 +1275,13 @@ CONTINUATION INSTRUCTIONS:
content_parts.append(content_part)
# Use existing merging strategy for clean mode
merge_strategy = {
"useIntelligentMerging": True,
"groupBy": "documentId", # Group by document
"orderBy": "chunkIndex", # Order by chunk index
"mergeType": "concatenate"
}
merge_strategy = MergeStrategy(
useIntelligentMerging=True,
groupBy="documentId", # Group by document
orderBy="chunkIndex", # Order by chunk index
mergeType="concatenate"
)
if options and hasattr(options, 'mergeStrategy'):
merge_strategy.update(options.mergeStrategy)
# Apply existing merging logic using the sophisticated merging system
from modules.services.serviceExtraction.subPipeline import _applyMerging
@ -1351,15 +1327,13 @@ CONTINUATION INSTRUCTIONS:
content_parts.append(content_part)
# Use existing merging strategy for JSON mode
merge_strategy = {
"useIntelligentMerging": True,
"groupBy": "documentId", # Group by document
"orderBy": "chunkIndex", # Order by chunk index
"mergeType": "concatenate"
}
merge_strategy = MergeStrategy(
useIntelligentMerging=True,
groupBy="documentId", # Group by document
orderBy="chunkIndex", # Order by chunk index
mergeType="concatenate"
)
if options and hasattr(options, 'mergeStrategy'):
merge_strategy.update(options.mergeStrategy)
# Apply existing merging logic using the sophisticated merging system
from modules.services.serviceExtraction.subPipeline import _applyMerging
@ -1455,5 +1429,3 @@ CONTINUATION INSTRUCTIONS:
logger.info(f"Merged {len(chunkResults)} chunks using existing sophisticated merging system (JSON mode)")
return merged_document
# REMOVED: _getModelCapabilitiesForContent method - no longer needed with model-aware chunking

View file

@ -5,7 +5,7 @@ import time
from .subRegistry import ExtractorRegistry, ChunkerRegistry
from .subPipeline import runExtraction
from modules.datamodels.datamodelExtraction import ContentExtracted, ContentPart, MergeStrategy
from modules.datamodels.datamodelExtraction import ContentExtracted, ContentPart, MergeStrategy, ExtractionOptions
from modules.datamodels.datamodelChat import ChatDocument
from modules.datamodels.datamodelAi import AiCallResponse
from modules.aicore.aicoreModelRegistry import modelRegistry
@ -20,7 +20,7 @@ class ExtractionService:
self._extractorRegistry = ExtractorRegistry()
self._chunkerRegistry = ChunkerRegistry()
def extractContent(self, documents: List[ChatDocument], options: Dict[str, Any]) -> List[ContentExtracted]:
def extractContent(self, documents: List[ChatDocument], options: ExtractionOptions) -> List[ContentExtracted]:
"""
Extract content from a list of ChatDocument objects.
@ -31,6 +31,7 @@ class ExtractionService:
Returns:
List of ContentExtracted objects, one per input document
"""
results: List[ContentExtracted] = []
# Lazy import to avoid circular deps and heavy init at module import

View file

@ -1,9 +1,9 @@
from typing import Any, Dict, List
from modules.datamodels.datamodelExtraction import ContentPart
from modules.datamodels.datamodelExtraction import ContentPart, MergeStrategy
class DefaultMerger:
def merge(self, parts: List[ContentPart], strategy: Dict[str, Any]) -> List[ContentPart]:
def merge(self, parts: List[ContentPart], strategy: MergeStrategy) -> List[ContentPart]:
"""
Default merger that passes through parts unchanged.
Used for image, binary, metadata, container typeGroups.

View file

@ -1,10 +1,10 @@
from typing import Any, Dict, List
from modules.datamodels.datamodelExtraction import ContentPart
from modules.datamodels.datamodelExtraction import ContentPart, MergeStrategy
from ..subUtils import makeId
class TableMerger:
def merge(self, parts: List[ContentPart], strategy: Dict[str, Any]) -> List[ContentPart]:
def merge(self, parts: List[ContentPart], strategy: MergeStrategy) -> List[ContentPart]:
"""
Merge table parts based on strategy.
Strategy options:
@ -15,9 +15,9 @@ class TableMerger:
if not parts:
return parts
groupBy = strategy.get("groupBy", "parentId")
maxSize = strategy.get("maxSize", 0)
combineSheets = strategy.get("combineSheets", False)
groupBy = strategy.groupBy
maxSize = strategy.maxSize or 0
combineSheets = strategy.tableMerge.get("combineSheets", False) if strategy.tableMerge else False
# Group parts
groups = self._groupParts(parts, groupBy, combineSheets)

View file

@ -1,10 +1,10 @@
from typing import Any, Dict, List
from modules.datamodels.datamodelExtraction import ContentPart
from modules.datamodels.datamodelExtraction import ContentPart, MergeStrategy
from ..subUtils import makeId
class TextMerger:
def merge(self, parts: List[ContentPart], strategy: Dict[str, Any]) -> List[ContentPart]:
def merge(self, parts: List[ContentPart], strategy: MergeStrategy) -> List[ContentPart]:
"""
Merge text parts based on strategy.
Strategy options:
@ -15,9 +15,9 @@ class TextMerger:
if not parts:
return parts
groupBy = strategy.get("groupBy", "parentId")
orderBy = strategy.get("orderBy", "label")
maxSize = strategy.get("maxSize", 0)
groupBy = strategy.groupBy
orderBy = strategy.orderBy
maxSize = strategy.maxSize or 0
# Group parts
groups = self._groupParts(parts, groupBy)

View file

@ -1,8 +1,7 @@
from typing import Any, Dict, List
from typing import List
import logging
import os
from modules.datamodels.datamodelExtraction import ContentExtracted, ContentPart
from modules.datamodels.datamodelExtraction import ContentExtracted, ContentPart, ExtractionOptions, MergeStrategy
from .subUtils import makeId
from .subRegistry import ExtractorRegistry, ChunkerRegistry
from .merging.mergerText import TextMerger
@ -13,13 +12,13 @@ from .subMerger import IntelligentTokenAwareMerger
logger = logging.getLogger(__name__)
def _mergeParts(parts: List[ContentPart], mergeStrategy: Dict[str, Any]) -> List[ContentPart]:
def _mergeParts(parts: List[ContentPart], mergeStrategy: MergeStrategy) -> List[ContentPart]:
"""Merge parts based on the provided strategy."""
if not parts or not mergeStrategy:
return parts
groupBy = mergeStrategy.get("groupBy", "typeGroup")
orderBy = mergeStrategy.get("orderBy", "id")
groupBy = mergeStrategy.groupBy
orderBy = mergeStrategy.orderBy
# Group parts by the specified field
groups = {}
@ -56,7 +55,8 @@ def _mergeParts(parts: List[ContentPart], mergeStrategy: Dict[str, Any]) -> List
return merged_parts
def runExtraction(extractorRegistry: ExtractorRegistry, chunkerRegistry: ChunkerRegistry, documentBytes: bytes, fileName: str, mimeType: str, options: Dict[str, Any]) -> ContentExtracted:
def runExtraction(extractorRegistry: ExtractorRegistry, chunkerRegistry: ChunkerRegistry, documentBytes: bytes, fileName: str, mimeType: str, options: ExtractionOptions) -> ContentExtracted:
extractor = extractorRegistry.resolve(mimeType, fileName)
if extractor is None:
# fallback: single binary part
@ -71,15 +71,14 @@ def runExtraction(extractorRegistry: ExtractorRegistry, chunkerRegistry: Chunker
)
return ContentExtracted(id=makeId(), parts=[part])
parts = extractor.extract(documentBytes, {"fileName": fileName, "mimeType": mimeType, "options": options})
parts = extractor.extract(documentBytes, {"fileName": fileName, "mimeType": mimeType})
# REMOVED: poolAndLimit(parts, chunkerRegistry, options)
# REMOVED: Chunking logic - now handled in AI call phase
# Apply merging strategy if provided (preserve existing logic)
mergeStrategy = options.get("mergeStrategy", {})
if mergeStrategy:
parts = _applyMerging(parts, mergeStrategy)
if options.mergeStrategy:
parts = _applyMerging(parts, options.mergeStrategy)
return ContentExtracted(id=makeId(), parts=parts)
@ -87,17 +86,17 @@ def runExtraction(extractorRegistry: ExtractorRegistry, chunkerRegistry: Chunker
# REMOVED: poolAndLimit function - chunking now handled in AI call phase
def _applyMerging(parts: List[ContentPart], strategy: Dict[str, Any]) -> List[ContentPart]:
def _applyMerging(parts: List[ContentPart], strategy: MergeStrategy) -> List[ContentPart]:
"""Apply merging strategy to parts with intelligent token-aware merging."""
logger.debug(f"_applyMerging called with {len(parts)} parts")
# Check if intelligent merging is enabled
if strategy.get("useIntelligentMerging", False):
model_capabilities = strategy.get("capabilities", {})
if strategy.useIntelligentMerging:
model_capabilities = strategy.capabilities or {}
subMerger = IntelligentTokenAwareMerger(model_capabilities)
# Use intelligent merging for all parts
merged = subMerger.merge_chunks_intelligently(parts, strategy.get("prompt", ""))
merged = subMerger.merge_chunks_intelligently(parts, strategy.prompt or "")
# Calculate and log optimization stats
stats = subMerger.calculate_optimization_stats(parts, merged)

View file

@ -90,7 +90,7 @@ class ModelSelectionTester:
totalScore = sizeRating + processingModeRating + priorityRating
print(
f" {idx:>2}. {m.name} | Q={getattr(m, 'qualityRating', 0)} | S={getattr(m, 'speedRating', 0)} | ${costIn:.4f} | ctx={getattr(m, 'contextLength', 0)} | score={totalScore:.3f}"
f" {idx:>2}. {m.displayName} | Q={getattr(m, 'qualityRating', 0)} | S={getattr(m, 'speedRating', 0)} | ${costIn:.4f} | ctx={getattr(m, 'contextLength', 0)} | score={totalScore:.3f}"
)
print(f" Size: {sizeRating:.3f}, ProcessingMode: {processingModeRating:.3f}, Priority: {priorityRating:.3f}")
@ -136,7 +136,7 @@ class ModelSelectionTester:
totalScore = sizeRating + processingModeRating + priorityRating
print(
f" {idx:>2}. {m.name} | Q={getattr(m, 'qualityRating', 0)} | S={getattr(m, 'speedRating', 0)} | ${costIn:.4f} | ctx={getattr(m, 'contextLength', 0)} | score={totalScore:.3f}"
f" {idx:>2}. {m.displayName} | Q={getattr(m, 'qualityRating', 0)} | S={getattr(m, 'speedRating', 0)} | ${costIn:.4f} | ctx={getattr(m, 'contextLength', 0)} | score={totalScore:.3f}"
)
print(f" Size: {sizeRating:.3f}, ProcessingMode: {processingModeRating:.3f}, Priority: {priorityRating:.3f}")
@ -365,8 +365,8 @@ class ModelSelectionTester:
)
if failoverModelList:
print(f" Selected model: {failoverModelList[0].name}")
print(f" Fallback models: {[m.name for m in failoverModelList[1:3]]}")
print(f" Selected model: {failoverModelList[0].displayName}")
print(f" Fallback models: {[m.displayName for m in failoverModelList[1:3]]}")
else:
print(" No suitable models found")
@ -393,8 +393,8 @@ class ModelSelectionTester:
)
if failoverModelList:
print(f" Selected model: {failoverModelList[0].name}")
print(f" Fallback models: {[m.name for m in failoverModelList[1:3]]}")
print(f" Selected model: {failoverModelList[0].displayName}")
print(f" Fallback models: {[m.displayName for m in failoverModelList[1:3]]}")
else:
print(" No suitable models found")
@ -421,8 +421,8 @@ class ModelSelectionTester:
)
if failoverModelList:
print(f" Selected model: {failoverModelList[0].name}")
print(f" Fallback models: {[m.name for m in failoverModelList[1:3]]}")
print(f" Selected model: {failoverModelList[0].displayName}")
print(f" Fallback models: {[m.displayName for m in failoverModelList[1:3]]}")
else:
print(" No suitable models found")
@ -449,8 +449,8 @@ class ModelSelectionTester:
)
if failoverModelList:
print(f" Selected model: {failoverModelList[0].name}")
print(f" Fallback models: {[m.name for m in failoverModelList[1:3]]}")
print(f" Selected model: {failoverModelList[0].displayName}")
print(f" Fallback models: {[m.displayName for m in failoverModelList[1:3]]}")
else:
print(" No suitable models found")
@ -479,15 +479,15 @@ class ModelSelectionTester:
print(f" {connector_type}: {len(models)} models")
for model in models:
capabilities = getattr(model, 'capabilities', [])
print(f" - {model.name}: {capabilities}")
print(f" - {model.displayName}: {capabilities}")
# Show operation type support
print(f"\nOperation type support:")
for op_type in OperationTypeEnum:
supported_models = [m for m in availableModels if hasattr(m, 'operationTypes') and op_type in m.operationTypes]
supported_models = [m for m in availableModels if hasattr(m, 'operationTypes') and any(ot.operationType == op_type for ot in m.operationTypes)]
print(f" {op_type.name}: {len(supported_models)} models")
if supported_models:
model_names = [m.name for m in supported_models[:3]] # Show first 3 models
model_names = [m.displayName for m in supported_models[:3]] # Show first 3 models
print(f" Models: {', '.join(model_names)}")