From 8523da7fe2b9e89f7232a8186aa1ef00e185c712 Mon Sep 17 00:00:00 2001
From: ValueOn AG
Date: Fri, 24 Oct 2025 22:46:05 +0200
Subject: [PATCH] Clean up Pydantic v2 usage: drop unnecessary Pydantic-to-dict
 conversions and replace unnecessary unions with clean classes
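
Dict-based extraction and merge options are replaced by the typed Pydantic
models ExtractionOptions and MergeStrategy, and model instances are now
passed straight to the interface/DB layers instead of model_dump() dicts.
A minimal usage sketch, mirroring the new call sites in
subDocumentProcessing.py (the prompt string and the extractionService and
documents variables are illustrative, not part of this patch):

    from modules.datamodels.datamodelAi import OperationTypeEnum
    from modules.datamodels.datamodelExtraction import ExtractionOptions, MergeStrategy

    # Merge behaviour is configured on the typed MergeStrategy model
    mergeStrategy = MergeStrategy(
        useIntelligentMerging=True,
        prompt="Extract the key figures",  # illustrative prompt
        groupBy="typeGroup",
        orderBy="id",
        mergeType="concatenate",
    )

    # ExtractionService.extractContent() now takes ExtractionOptions directly
    extractionOptions = ExtractionOptions(
        prompt="Extract the key figures",
        operationType=OperationTypeEnum.DATA_EXTRACT,
        processDocumentsIndividually=True,
        mergeStrategy=mergeStrategy,
    )
    contents = extractionService.extractContent(documents, extractionOptions)
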
---
modules/aicore/aicorePluginAnthropic.py | 4 +-
modules/connectors/connectorDbPostgre.py | 19 +-
modules/datamodels/datamodelAi.py | 28 ---
modules/datamodels/datamodelExtraction.py | 150 +++++---------
modules/interfaces/interfaceAiObjects.py | 11 +-
modules/interfaces/interfaceDbChatObjects.py | 8 +-
modules/routes/routeDataPrompts.py | 5 +-
modules/routes/routeDataUsers.py | 10 +-
.../serviceAi/subDocumentProcessing.py | 188 ++++++++----------
.../mainServiceExtraction.py | 5 +-
.../merging/mergerDefault.py | 4 +-
.../serviceExtraction/merging/mergerTable.py | 10 +-
.../serviceExtraction/merging/mergerText.py | 10 +-
.../services/serviceExtraction/subPipeline.py | 29 ++-
test_ai_model_selection.py | 26 +--
15 files changed, 188 insertions(+), 319 deletions(-)
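
One behavioural detail worth calling out: merger-specific settings now live
in the nested tableMerge/textMerge/structureMerge dicts on MergeStrategy
rather than at the top level of a plain dict. A short sketch of the new
TableMerger call (parts stands in for the List[ContentPart] produced by
extraction and is not part of this patch):

    from modules.datamodels.datamodelExtraction import MergeStrategy
    from modules.services.serviceExtraction.merging.mergerTable import TableMerger

    strategy = MergeStrategy(
        groupBy="parentId",                  # group sheets of the same file
        tableMerge={"combineSheets": True},  # was a top-level "combineSheets" key before
    )
    mergedParts = TableMerger().merge(parts, strategy)
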
diff --git a/modules/aicore/aicorePluginAnthropic.py b/modules/aicore/aicorePluginAnthropic.py
index 5f5e2bcb..83482d6f 100644
--- a/modules/aicore/aicorePluginAnthropic.py
+++ b/modules/aicore/aicorePluginAnthropic.py
@@ -50,7 +50,7 @@ class AiAnthropic(BaseConnectorAi):
connectorType="anthropic",
apiUrl="https://api.anthropic.com/v1/messages",
temperature=0.2,
- maxTokens=200000,
+ maxTokens=8192,
contextLength=200000,
costPer1kTokensInput=0.015,
costPer1kTokensOutput=0.075,
@@ -75,7 +75,7 @@ class AiAnthropic(BaseConnectorAi):
connectorType="anthropic",
apiUrl="https://api.anthropic.com/v1/messages",
temperature=0.2,
- maxTokens=200000,
+ maxTokens=8192,
contextLength=200000,
costPer1kTokensInput=0.015,
costPer1kTokensOutput=0.075,
diff --git a/modules/connectors/connectorDbPostgre.py b/modules/connectors/connectorDbPostgre.py
index e01e267b..77441399 100644
--- a/modules/connectors/connectorDbPostgre.py
+++ b/modules/connectors/connectorDbPostgre.py
@@ -720,22 +720,9 @@ class DatabaseConnector:
logger.info(f"Initial ID {initialId} for table {table} registered")
return success
else:
- # Check if the existing initial ID still exists in the table
- existingInitialId = systemData[table]
- records = self.getRecordset(
- model_class, recordFilter={"id": existingInitialId}
- )
- if not records:
- # The initial record no longer exists, update to the new one
- systemData[table] = initialId
- success = self._saveSystemTable(systemData)
- if success:
- logger.info(
- f"Initial ID updated from {existingInitialId} to {initialId} for table {table}"
- )
- return success
- else:
- return True
+ # Table already has an initial ID registered
+ logger.debug(f"Table {table} already has initial ID {systemData[table]}")
+ return True
except Exception as e:
logger.error(f"Error registering the initial ID for table {table}: {e}")
return False
diff --git a/modules/datamodels/datamodelAi.py b/modules/datamodels/datamodelAi.py
index 730c73cc..7643dc79 100644
--- a/modules/datamodels/datamodelAi.py
+++ b/modules/datamodels/datamodelAi.py
@@ -135,7 +135,6 @@ class AiCallOptions(BaseModel):
compressPrompt: bool = Field(default=True, description="Whether to compress the prompt")
compressContext: bool = Field(default=True, description="If False: process each chunk; If True: summarize and work on summary")
processDocumentsIndividually: bool = Field(default=True, description="If True, process each document separately; else pool docs")
- maxContextBytes: Optional[int] = Field(default=None, description="Hard cap for extracted context size passed to the model")
maxCost: Optional[float] = Field(default=None, description="Max cost budget")
maxProcessingTime: Optional[int] = Field(default=None, description="Max processing time in seconds")
processingMode: ProcessingModeEnum = Field(default=ProcessingModeEnum.BASIC, description="Processing mode")
@@ -145,7 +144,6 @@ class AiCallOptions(BaseModel):
# Model generation parameters
temperature: Optional[float] = Field(default=None, ge=0.0, le=2.0, description="Temperature for response generation (0.0-2.0, lower = more consistent)")
- maxTokens: Optional[int] = Field(default=None, ge=1, le=32000, description="Maximum tokens in response")
maxParts: Optional[int] = Field(default=1000, ge=1, le=1000, description="Maximum number of continuation parts to fetch")
@@ -170,32 +168,6 @@ class AiCallResponse(BaseModel):
errorCount: int = Field(default=0, description="0 for success, 1+ for errors")
-class EnhancedAiCallOptions(AiCallOptions):
- """Enhanced options for improved document processing with chunk mapping."""
-
- # Parallel processing
- enableParallelProcessing: bool = Field(
- default=True,
- description="Enable parallel processing of chunks"
- )
- maxConcurrentChunks: int = Field(
- default=5,
- ge=1,
- le=20,
- description="Maximum number of chunks to process concurrently"
- )
-
- # Chunk mapping
- preserveChunkMetadata: bool = Field(
- default=True,
- description="Preserve chunk metadata during processing"
- )
- chunkSeparator: str = Field(
- default="\n\n---\n\n",
- description="Separator between chunks in merged output"
- )
-
-
class AiModelCall(BaseModel):
"""Standardized input for AI model calls."""
diff --git a/modules/datamodels/datamodelExtraction.py b/modules/datamodels/datamodelExtraction.py
index b0ba0f9b..242d413a 100644
--- a/modules/datamodels/datamodelExtraction.py
+++ b/modules/datamodels/datamodelExtraction.py
@@ -1,6 +1,9 @@
-from typing import Any, Dict, List, Optional, Literal
+from typing import Any, Dict, List, Optional, Literal, TYPE_CHECKING
from pydantic import BaseModel, Field
+if TYPE_CHECKING:
+ from modules.datamodels.datamodelAi import OperationTypeEnum
+
class ContentPart(BaseModel):
id: str = Field(description="Unique content part identifier")
@@ -40,106 +43,49 @@ class PartResult(BaseModel):
class MergeStrategy(BaseModel):
"""Strategy configuration for merging content parts and AI results."""
+ groupBy: str = Field(default="typeGroup", description="Field to group parts by (typeGroup, parentId, label, etc.)")
+ orderBy: str = Field(default="id", description="Field to order parts within groups (id, order, pageIndex, etc.)")
+ mergeType: Literal["concatenate", "hierarchical", "intelligent"] = Field(default="concatenate", description="How to merge content within groups")
+ maxSize: Optional[int] = Field(default=None, description="Maximum size for merged content in bytes")
+ textMerge: Optional[Dict[str, Any]] = Field(default=None, description="Text-specific merge settings (separator, formatting, etc.)")
+ tableMerge: Optional[Dict[str, Any]] = Field(default=None, description="Table-specific merge settings (header handling, etc.)")
+ structureMerge: Optional[Dict[str, Any]] = Field(default=None, description="Structure-specific merge settings (hierarchy, etc.)")
+ aiResultMerge: Optional[Dict[str, Any]] = Field(default=None, description="AI result merging settings (prompt, context, etc.)")
+ preserveChunks: bool = Field(default=False, description="Whether to preserve individual chunks or merge them")
+ chunkSeparator: str = Field(default="\n\n---\n\n", description="Separator between chunks when merging")
+ preserveMetadata: bool = Field(default=True, description="Whether to preserve metadata from original parts")
+ metadataFields: Optional[List[str]] = Field(default=None, description="Specific metadata fields to preserve (None = all)")
+ onError: Literal["skip", "include", "fail"] = Field(default="skip", description="How to handle errors during merging")
+ validateContent: bool = Field(default=True, description="Whether to validate content before merging")
+ useIntelligentMerging: bool = Field(default=False, description="Whether to use intelligent token-aware merging")
+ prompt: Optional[str] = Field(default=None, description="Prompt for intelligent merging")
+ capabilities: Optional[Dict[str, Any]] = Field(default=None, description="Model capabilities for intelligent merging")
- # Grouping configuration
- groupBy: str = Field(
- default="typeGroup",
- description="Field to group parts by (typeGroup, parentId, label, etc.)"
- )
-
- # Ordering configuration
- orderBy: str = Field(
- default="id",
- description="Field to order parts within groups (id, order, pageIndex, etc.)"
- )
-
- # Merge behavior
- mergeType: Literal["concatenate", "hierarchical", "intelligent"] = Field(
- default="concatenate",
- description="How to merge content within groups"
- )
-
- # Size limits
- maxSize: Optional[int] = Field(
- default=None,
- description="Maximum size for merged content in bytes"
- )
-
- # Type-specific merge settings
- textMerge: Optional[Dict[str, Any]] = Field(
- default=None,
- description="Text-specific merge settings (separator, formatting, etc.)"
- )
-
- tableMerge: Optional[Dict[str, Any]] = Field(
- default=None,
- description="Table-specific merge settings (header handling, etc.)"
- )
-
- structureMerge: Optional[Dict[str, Any]] = Field(
- default=None,
- description="Structure-specific merge settings (hierarchy, etc.)"
- )
-
- # AI result merging
- aiResultMerge: Optional[Dict[str, Any]] = Field(
- default=None,
- description="AI result merging settings (prompt, context, etc.)"
- )
-
- # Chunk handling
- preserveChunks: bool = Field(
- default=False,
- description="Whether to preserve individual chunks or merge them"
- )
-
- chunkSeparator: str = Field(
- default="\n\n---\n\n",
- description="Separator between chunks when merging"
- )
-
- # Metadata handling
- preserveMetadata: bool = Field(
- default=True,
- description="Whether to preserve metadata from original parts"
- )
-
- metadataFields: Optional[List[str]] = Field(
- default=None,
- description="Specific metadata fields to preserve (None = all)"
- )
-
- # Error handling
- onError: Literal["skip", "include", "fail"] = Field(
- default="skip",
- description="How to handle errors during merging"
- )
-
- # Validation
- validateContent: bool = Field(
- default=True,
- description="Whether to validate content before merging"
- )
-
- def getTypeSpecificSettings(self, typeGroup: str) -> Dict[str, Any]:
- """Get type-specific merge settings for a content type."""
- if typeGroup == "text" and self.textMerge:
- return self.textMerge
- elif typeGroup == "table" and self.tableMerge:
- return self.tableMerge
- elif typeGroup == "structure" and self.structureMerge:
- return self.structureMerge
- else:
- return {}
-
- def shouldPreserveChunk(self, chunk: Dict[str, Any]) -> bool:
- """Determine if a chunk should be preserved based on strategy."""
- if not self.preserveChunks:
- return False
-
- # Check if chunk has error metadata
- if self.onError == "skip" and chunk.get("metadata", {}).get("error"):
- return False
-
- return True
+class ExtractionOptions(BaseModel):
+ """Options for document extraction and processing with clear data structures."""
+
+ # Core extraction parameters
+ prompt: str = Field(description="Extraction prompt for AI processing")
+ operationType: 'OperationTypeEnum' = Field(description="Type of operation for AI processing")
+ processDocumentsIndividually: bool = Field(default=True, description="Process each document separately")
+
+ # Image processing parameters
+ imageMaxPixels: int = Field(default=1024 * 1024, ge=1, description="Maximum pixels for image processing")
+ imageQuality: int = Field(default=85, ge=1, le=100, description="Image quality (1-100)")
+
+ # Merging strategy
+ mergeStrategy: MergeStrategy = Field(description="Strategy for merging extraction results")
+
+ # Optional chunking parameters (for backward compatibility)
+ chunkAllowed: Optional[bool] = Field(default=None, description="Whether chunking is allowed")
+ maxSize: Optional[int] = Field(default=None, description="Maximum size for processing")
+ textChunkSize: Optional[int] = Field(default=None, description="Size for text chunks")
+ imageChunkSize: Optional[int] = Field(default=None, description="Size for image chunks")
+
+ # Additional processing options
+ enableParallelProcessing: bool = Field(default=True, description="Enable parallel processing of chunks")
+ maxConcurrentChunks: int = Field(default=5, ge=1, le=20, description="Maximum number of chunks to process concurrently")
+
+    class Config:
+        arbitrary_types_allowed = True  # Pydantic v2 config key; allows the forward-referenced OperationTypeEnum
diff --git a/modules/interfaces/interfaceAiObjects.py b/modules/interfaces/interfaceAiObjects.py
index a76e5e41..51fa90af 100644
--- a/modules/interfaces/interfaceAiObjects.py
+++ b/modules/interfaces/interfaceAiObjects.py
@@ -127,7 +127,7 @@ class AiObjects:
logger.info(f"Attempting AI call with model: {model.name} (attempt {attempt + 1}/{len(failoverModelList)})")
# Call the model
- response = await self._callWithModel(model, prompt, context)
+ response = await self._callWithModel(model, prompt, context, options)
logger.info(f"✅ AI call successful with model: {model.name}")
return response
@@ -204,7 +204,7 @@ class AiObjects:
if partSize <= modelContextBytes:
# Part fits - call AI directly
- response = await self._callWithModel(model, prompt, contentPart.data)
+ response = await self._callWithModel(model, prompt, contentPart.data, options)
logger.info(f"✅ Content part processed successfully with model: {model.name}")
return response
else:
@@ -216,7 +216,7 @@ class AiObjects:
# Process each chunk
chunkResults = []
for chunk in chunks:
- chunkResponse = await self._callWithModel(model, prompt, chunk['data'])
+ chunkResponse = await self._callWithModel(model, prompt, chunk['data'], options)
chunkResults.append(chunkResponse)
# Merge chunk results
@@ -393,7 +393,7 @@ class AiObjects:
errorCount=1
)
- async def _callWithModel(self, model: AiModel, prompt: str, context: str) -> AiCallResponse:
+ async def _callWithModel(self, model: AiModel, prompt: str, context: str, options: AiCallOptions = None) -> AiCallResponse:
"""Call a specific model and return the response."""
# Calculate input bytes from prompt and context
inputBytes = len((prompt + context).encode('utf-8'))
@@ -430,7 +430,8 @@ class AiObjects:
# Create standardized call object
modelCall = AiModelCall(
messages=messages,
- model=model
+ model=model,
+ options=options or {}
)
# Call the model with standardized interface
diff --git a/modules/interfaces/interfaceDbChatObjects.py b/modules/interfaces/interfaceDbChatObjects.py
index 2c952058..31db88d8 100644
--- a/modules/interfaces/interfaceDbChatObjects.py
+++ b/modules/interfaces/interfaceDbChatObjects.py
@@ -873,7 +873,7 @@ class ChatObjects:
stat = ChatStat(**statData)
# Create the stat record in the database
- created = self.db.recordCreate(ChatStat, stat.model_dump())
+ created = self.db.recordCreate(ChatStat, stat)
# Return the created ChatStat
return ChatStat(**created)
@@ -937,7 +937,7 @@ class ChatObjects:
items.append({
"type": "message",
"createdAt": msg_timestamp,
- "item": chat_message.model_dump()
+ "item": chat_message
})
# Get logs
@@ -952,7 +952,7 @@ class ChatObjects:
items.append({
"type": "log",
"createdAt": log_timestamp,
- "item": chat_log.model_dump()
+ "item": chat_log
})
# Get stats list
@@ -966,7 +966,7 @@ class ChatObjects:
items.append({
"type": "stat",
"createdAt": stat_timestamp,
- "item": stat.model_dump()
+ "item": stat
})
# Sort all items by createdAt timestamp for chronological order
diff --git a/modules/routes/routeDataPrompts.py b/modules/routes/routeDataPrompts.py
index 97da7846..51968808 100644
--- a/modules/routes/routeDataPrompts.py
+++ b/modules/routes/routeDataPrompts.py
@@ -48,11 +48,8 @@ async def create_prompt(
"""Create a new prompt"""
managementInterface = interfaceDbComponentObjects.getInterface(currentUser)
- # Convert Prompt to dict for interface
- prompt_data = prompt.model_dump()
-
# Create prompt
- newPrompt = managementInterface.createPrompt(prompt_data)
+ newPrompt = managementInterface.createPrompt(prompt)
return Prompt(**newPrompt)
diff --git a/modules/routes/routeDataUsers.py b/modules/routes/routeDataUsers.py
index 578eeeb0..afd286d6 100644
--- a/modules/routes/routeDataUsers.py
+++ b/modules/routes/routeDataUsers.py
@@ -92,11 +92,8 @@ async def create_user(
"""Create a new user"""
appInterface = interfaceDbAppObjects.getInterface(currentUser)
- # Convert User to dict for interface
- user_dict = user_data.model_dump()
-
# Create user
- newUser = appInterface.createUser(user_dict)
+ newUser = appInterface.createUser(user_data)
return newUser
@@ -119,11 +116,8 @@ async def update_user(
detail=f"User with ID {userId} not found"
)
- # Convert User to dict for interface
- update_data = userData.model_dump()
-
# Update user
- updatedUser = appInterface.updateUser(userId, update_data)
+ updatedUser = appInterface.updateUser(userId, userData)
if not updatedUser:
raise HTTPException(
diff --git a/modules/services/serviceAi/subDocumentProcessing.py b/modules/services/serviceAi/subDocumentProcessing.py
index 726ff62d..8cdca91c 100644
--- a/modules/services/serviceAi/subDocumentProcessing.py
+++ b/modules/services/serviceAi/subDocumentProcessing.py
@@ -5,7 +5,7 @@ import time
from typing import Dict, Any, List, Optional, Tuple, Union
from modules.datamodels.datamodelChat import ChatDocument
from modules.datamodels.datamodelAi import AiCallRequest, AiCallOptions, OperationTypeEnum, PriorityEnum
-from modules.datamodels.datamodelExtraction import ChunkResult, ContentExtracted, PartResult
+from modules.datamodels.datamodelExtraction import ChunkResult, ContentExtracted, PartResult, ExtractionOptions, MergeStrategy
from modules.services.serviceExtraction.mainServiceExtraction import ExtractionService
logger = logging.getLogger(__name__)
@@ -33,19 +33,6 @@ class SubDocumentProcessing:
self._extractionService = ExtractionService(self.services)
return self._extractionService
- def _calculateMaxContextBytes(self, options: Optional[AiCallOptions]) -> int:
- """Calculate maximum context bytes based on model capabilities and options."""
- if options and options.maxContextBytes:
- return options.maxContextBytes
-
- # Default model capabilities (this should be enhanced with actual model registry)
- defaultMaxTokens = 4000
- safetyMargin = options.safetyMargin if options else 0.1
-
- # Calculate bytes (4 chars per token estimation)
- maxContextBytes = int(defaultMaxTokens * (1 - safetyMargin) * 4)
-
- return maxContextBytes
async def processDocumentsPerChunk(
self,
@@ -68,22 +55,23 @@ class SubDocumentProcessing:
if not documents:
return ""
- # Build extraction options WITHOUT chunking parameters
- extractionOptions: Dict[str, Any] = {
- "prompt": prompt,
- "operationType": options.operationType if options else OperationTypeEnum.DATA_EXTRACT,
- "processDocumentsIndividually": True,
- # REMOVED: maxSize, textChunkSize, imageChunkSize
- "mergeStrategy": {
- "useIntelligentMerging": True,
- "prompt": prompt,
- "groupBy": "typeGroup",
- "orderBy": "id",
- "mergeType": "concatenate"
- },
- }
+ # Build extraction options using Pydantic model
+ mergeStrategy = MergeStrategy(
+ useIntelligentMerging=True,
+ prompt=prompt,
+ groupBy="typeGroup",
+ orderBy="id",
+ mergeType="concatenate"
+ )
- logger.debug(f"Per-chunk extraction options: prompt length={len(extractionOptions.get('prompt', ''))} chars, operationType={extractionOptions.get('operationType')}")
+ extractionOptions = ExtractionOptions(
+ prompt=prompt,
+ operationType=options.operationType if options else OperationTypeEnum.DATA_EXTRACT,
+ processDocumentsIndividually=True,
+ mergeStrategy=mergeStrategy
+ )
+
+ logger.debug(f"Per-chunk extraction options: prompt length={len(extractionOptions.prompt)} chars, operationType={extractionOptions.operationType}")
try:
# Extract content WITHOUT chunking
@@ -120,21 +108,23 @@ class SubDocumentProcessing:
if not documents:
return {"metadata": {"title": "Empty Document"}, "sections": []}
- # Build extraction options WITHOUT chunking parameters
- extractionOptions: Dict[str, Any] = {
- "prompt": prompt,
- "operationType": options.operationType if options else OperationTypeEnum.DATA_EXTRACT,
- "processDocumentsIndividually": True,
- "mergeStrategy": {
- "useIntelligentMerging": True,
- "prompt": prompt,
- "groupBy": "typeGroup",
- "orderBy": "id",
- "mergeType": "concatenate"
- },
- }
+ # Build extraction options using Pydantic model
+ mergeStrategy = MergeStrategy(
+ useIntelligentMerging=True,
+ prompt=prompt,
+ groupBy="typeGroup",
+ orderBy="id",
+ mergeType="concatenate"
+ )
- logger.debug(f"Per-chunk extraction options (JSON mode): prompt length={len(extractionOptions.get('prompt', ''))} chars, operationType={extractionOptions.get('operationType')}")
+ extractionOptions = ExtractionOptions(
+ prompt=prompt,
+ operationType=options.operationType if options else OperationTypeEnum.DATA_EXTRACT,
+ processDocumentsIndividually=True,
+ mergeStrategy=mergeStrategy
+ )
+
+ logger.debug(f"Per-chunk extraction options (JSON mode): prompt length={len(extractionOptions.prompt)} chars, operationType={extractionOptions.operationType}")
try:
# Extract content WITHOUT chunking
@@ -205,31 +195,25 @@ class SubDocumentProcessing:
if not documents:
return {"metadata": {"title": "Empty Document"}, "sections": []}
- # Get model capabilities for size calculation
- model_capabilities = self._getModelCapabilitiesForContent(custom_prompt, documents, options)
+ # Build extraction options using Pydantic model (model-aware chunking in AI call phase)
+ mergeStrategy = MergeStrategy(
+ useIntelligentMerging=True,
+ prompt=custom_prompt,
+ groupBy="typeGroup",
+ orderBy="id",
+ mergeType="concatenate"
+ )
- # Build extraction options for chunking with intelligent merging
- extractionOptions: Dict[str, Any] = {
- "prompt": custom_prompt, # Use the custom prompt instead of default
- "operationType": options.operationType if options else OperationTypeEnum.DATA_EXTRACT,
- "processDocumentsIndividually": True, # Process each document separately
- "maxSize": model_capabilities["maxContextBytes"],
- "chunkAllowed": True,
- "textChunkSize": model_capabilities["textChunkSize"],
- "imageChunkSize": model_capabilities["imageChunkSize"],
- "imageMaxPixels": 1024 * 1024,
- "imageQuality": 85,
- "mergeStrategy": {
- "useIntelligentMerging": True, # Enable intelligent token-aware merging
- "capabilities": model_capabilities,
- "prompt": custom_prompt, # Use the custom prompt
- "groupBy": "typeGroup",
- "orderBy": "id",
- "mergeType": "concatenate"
- },
- }
+ extractionOptions = ExtractionOptions(
+ prompt=custom_prompt, # Use the custom prompt instead of default
+ operationType=options.operationType if options else OperationTypeEnum.DATA_EXTRACT,
+ processDocumentsIndividually=True, # Process each document separately
+ imageMaxPixels=1024 * 1024,
+ imageQuality=85,
+ mergeStrategy=mergeStrategy
+ )
- logger.debug(f"Per-chunk extraction options (JSON mode): prompt length={len(extractionOptions.get('prompt', ''))} chars, operationType={extractionOptions.get('operationType')}")
+ logger.debug(f"Per-chunk extraction options (JSON mode): prompt length={len(extractionOptions.prompt)} chars, operationType={extractionOptions.operationType}")
try:
# Extract content with chunking
@@ -1042,15 +1026,13 @@ CONTINUATION INSTRUCTIONS:
content_parts.append(content_part)
# Use existing merging strategy from options
- merge_strategy = {
- "useIntelligentMerging": True,
- "groupBy": "documentId", # Group by document
- "orderBy": "partIndex", # Order by part index
- "mergeType": "concatenate"
- }
+ merge_strategy = MergeStrategy(
+ useIntelligentMerging=True,
+ groupBy="documentId", # Group by document
+ orderBy="partIndex", # Order by part index
+ mergeType="concatenate"
+ )
- if options and hasattr(options, 'mergeStrategy'):
- merge_strategy.update(options.mergeStrategy)
# Apply existing merging logic using the sophisticated merging system
from modules.services.serviceExtraction.subPipeline import _applyMerging
@@ -1095,15 +1077,13 @@ CONTINUATION INSTRUCTIONS:
content_parts.append(content_part)
# Use existing merging strategy for JSON mode
- merge_strategy = {
- "useIntelligentMerging": True,
- "groupBy": "documentId", # Group by document
- "orderBy": "partIndex", # Order by part index
- "mergeType": "concatenate"
- }
+ merge_strategy = MergeStrategy(
+ useIntelligentMerging=True,
+ groupBy="documentId", # Group by document
+ orderBy="partIndex", # Order by part index
+ mergeType="concatenate"
+ )
- if options and hasattr(options, 'mergeStrategy'):
- merge_strategy.update(options.mergeStrategy)
# Apply existing merging logic using the sophisticated merging system
from modules.services.serviceExtraction.subPipeline import _applyMerging
@@ -1234,15 +1214,13 @@ CONTINUATION INSTRUCTIONS:
content_parts.append(content_part)
# Use existing merging strategy from options
- merge_strategy = {
- "useIntelligentMerging": True,
- "groupBy": "documentId", # Group by document
- "orderBy": "chunkIndex", # Order by chunk index
- "mergeType": "concatenate"
- }
+ merge_strategy = MergeStrategy(
+ useIntelligentMerging=True,
+ groupBy="documentId", # Group by document
+ orderBy="chunkIndex", # Order by chunk index
+ mergeType="concatenate"
+ )
- if options and hasattr(options, 'mergeStrategy'):
- merge_strategy.update(options.mergeStrategy)
# Apply existing merging logic using the sophisticated merging system
from modules.services.serviceExtraction.subPipeline import _applyMerging
@@ -1297,15 +1275,13 @@ CONTINUATION INSTRUCTIONS:
content_parts.append(content_part)
# Use existing merging strategy for clean mode
- merge_strategy = {
- "useIntelligentMerging": True,
- "groupBy": "documentId", # Group by document
- "orderBy": "chunkIndex", # Order by chunk index
- "mergeType": "concatenate"
- }
+ merge_strategy = MergeStrategy(
+ useIntelligentMerging=True,
+ groupBy="documentId", # Group by document
+ orderBy="chunkIndex", # Order by chunk index
+ mergeType="concatenate"
+ )
- if options and hasattr(options, 'mergeStrategy'):
- merge_strategy.update(options.mergeStrategy)
# Apply existing merging logic using the sophisticated merging system
from modules.services.serviceExtraction.subPipeline import _applyMerging
@@ -1351,15 +1327,13 @@ CONTINUATION INSTRUCTIONS:
content_parts.append(content_part)
# Use existing merging strategy for JSON mode
- merge_strategy = {
- "useIntelligentMerging": True,
- "groupBy": "documentId", # Group by document
- "orderBy": "chunkIndex", # Order by chunk index
- "mergeType": "concatenate"
- }
+ merge_strategy = MergeStrategy(
+ useIntelligentMerging=True,
+ groupBy="documentId", # Group by document
+ orderBy="chunkIndex", # Order by chunk index
+ mergeType="concatenate"
+ )
- if options and hasattr(options, 'mergeStrategy'):
- merge_strategy.update(options.mergeStrategy)
# Apply existing merging logic using the sophisticated merging system
from modules.services.serviceExtraction.subPipeline import _applyMerging
@@ -1455,5 +1429,3 @@ CONTINUATION INSTRUCTIONS:
logger.info(f"Merged {len(chunkResults)} chunks using existing sophisticated merging system (JSON mode)")
return merged_document
-
-# REMOVED: _getModelCapabilitiesForContent method - no longer needed with model-aware chunking
diff --git a/modules/services/serviceExtraction/mainServiceExtraction.py b/modules/services/serviceExtraction/mainServiceExtraction.py
index eef95638..62931565 100644
--- a/modules/services/serviceExtraction/mainServiceExtraction.py
+++ b/modules/services/serviceExtraction/mainServiceExtraction.py
@@ -5,7 +5,7 @@ import time
from .subRegistry import ExtractorRegistry, ChunkerRegistry
from .subPipeline import runExtraction
-from modules.datamodels.datamodelExtraction import ContentExtracted, ContentPart, MergeStrategy
+from modules.datamodels.datamodelExtraction import ContentExtracted, ContentPart, MergeStrategy, ExtractionOptions
from modules.datamodels.datamodelChat import ChatDocument
from modules.datamodels.datamodelAi import AiCallResponse
from modules.aicore.aicoreModelRegistry import modelRegistry
@@ -20,7 +20,7 @@ class ExtractionService:
self._extractorRegistry = ExtractorRegistry()
self._chunkerRegistry = ChunkerRegistry()
- def extractContent(self, documents: List[ChatDocument], options: Dict[str, Any]) -> List[ContentExtracted]:
+ def extractContent(self, documents: List[ChatDocument], options: ExtractionOptions) -> List[ContentExtracted]:
"""
Extract content from a list of ChatDocument objects.
@@ -31,6 +31,7 @@ class ExtractionService:
Returns:
List of ContentExtracted objects, one per input document
"""
+
results: List[ContentExtracted] = []
# Lazy import to avoid circular deps and heavy init at module import
diff --git a/modules/services/serviceExtraction/merging/mergerDefault.py b/modules/services/serviceExtraction/merging/mergerDefault.py
index ceab6635..9a7a625a 100644
--- a/modules/services/serviceExtraction/merging/mergerDefault.py
+++ b/modules/services/serviceExtraction/merging/mergerDefault.py
@@ -1,9 +1,9 @@
from typing import Any, Dict, List
-from modules.datamodels.datamodelExtraction import ContentPart
+from modules.datamodels.datamodelExtraction import ContentPart, MergeStrategy
class DefaultMerger:
- def merge(self, parts: List[ContentPart], strategy: Dict[str, Any]) -> List[ContentPart]:
+ def merge(self, parts: List[ContentPart], strategy: MergeStrategy) -> List[ContentPart]:
"""
Default merger that passes through parts unchanged.
Used for image, binary, metadata, container typeGroups.
diff --git a/modules/services/serviceExtraction/merging/mergerTable.py b/modules/services/serviceExtraction/merging/mergerTable.py
index 4f62358c..cffce8f1 100644
--- a/modules/services/serviceExtraction/merging/mergerTable.py
+++ b/modules/services/serviceExtraction/merging/mergerTable.py
@@ -1,10 +1,10 @@
from typing import Any, Dict, List
-from modules.datamodels.datamodelExtraction import ContentPart
+from modules.datamodels.datamodelExtraction import ContentPart, MergeStrategy
from ..subUtils import makeId
class TableMerger:
- def merge(self, parts: List[ContentPart], strategy: Dict[str, Any]) -> List[ContentPart]:
+ def merge(self, parts: List[ContentPart], strategy: MergeStrategy) -> List[ContentPart]:
"""
Merge table parts based on strategy.
Strategy options:
@@ -15,9 +15,9 @@ class TableMerger:
if not parts:
return parts
- groupBy = strategy.get("groupBy", "parentId")
- maxSize = strategy.get("maxSize", 0)
- combineSheets = strategy.get("combineSheets", False)
+ groupBy = strategy.groupBy
+ maxSize = strategy.maxSize or 0
+ combineSheets = strategy.tableMerge.get("combineSheets", False) if strategy.tableMerge else False
# Group parts
groups = self._groupParts(parts, groupBy, combineSheets)
diff --git a/modules/services/serviceExtraction/merging/mergerText.py b/modules/services/serviceExtraction/merging/mergerText.py
index 38f7c6f0..9e1ccf47 100644
--- a/modules/services/serviceExtraction/merging/mergerText.py
+++ b/modules/services/serviceExtraction/merging/mergerText.py
@@ -1,10 +1,10 @@
from typing import Any, Dict, List
-from modules.datamodels.datamodelExtraction import ContentPart
+from modules.datamodels.datamodelExtraction import ContentPart, MergeStrategy
from ..subUtils import makeId
class TextMerger:
- def merge(self, parts: List[ContentPart], strategy: Dict[str, Any]) -> List[ContentPart]:
+ def merge(self, parts: List[ContentPart], strategy: MergeStrategy) -> List[ContentPart]:
"""
Merge text parts based on strategy.
Strategy options:
@@ -15,9 +15,9 @@ class TextMerger:
if not parts:
return parts
- groupBy = strategy.get("groupBy", "parentId")
- orderBy = strategy.get("orderBy", "label")
- maxSize = strategy.get("maxSize", 0)
+ groupBy = strategy.groupBy
+ orderBy = strategy.orderBy
+ maxSize = strategy.maxSize or 0
# Group parts
groups = self._groupParts(parts, groupBy)
diff --git a/modules/services/serviceExtraction/subPipeline.py b/modules/services/serviceExtraction/subPipeline.py
index 9d8193da..e935f3c3 100644
--- a/modules/services/serviceExtraction/subPipeline.py
+++ b/modules/services/serviceExtraction/subPipeline.py
@@ -1,8 +1,7 @@
-from typing import Any, Dict, List
+from typing import List
import logging
-import os
-from modules.datamodels.datamodelExtraction import ContentExtracted, ContentPart
+from modules.datamodels.datamodelExtraction import ContentExtracted, ContentPart, ExtractionOptions, MergeStrategy
from .subUtils import makeId
from .subRegistry import ExtractorRegistry, ChunkerRegistry
from .merging.mergerText import TextMerger
@@ -13,13 +12,13 @@ from .subMerger import IntelligentTokenAwareMerger
logger = logging.getLogger(__name__)
-def _mergeParts(parts: List[ContentPart], mergeStrategy: Dict[str, Any]) -> List[ContentPart]:
+def _mergeParts(parts: List[ContentPart], mergeStrategy: MergeStrategy) -> List[ContentPart]:
"""Merge parts based on the provided strategy."""
if not parts or not mergeStrategy:
return parts
- groupBy = mergeStrategy.get("groupBy", "typeGroup")
- orderBy = mergeStrategy.get("orderBy", "id")
+ groupBy = mergeStrategy.groupBy
+ orderBy = mergeStrategy.orderBy
# Group parts by the specified field
groups = {}
@@ -56,7 +55,8 @@ def _mergeParts(parts: List[ContentPart], mergeStrategy: Dict[str, Any]) -> List
return merged_parts
-def runExtraction(extractorRegistry: ExtractorRegistry, chunkerRegistry: ChunkerRegistry, documentBytes: bytes, fileName: str, mimeType: str, options: Dict[str, Any]) -> ContentExtracted:
+def runExtraction(extractorRegistry: ExtractorRegistry, chunkerRegistry: ChunkerRegistry, documentBytes: bytes, fileName: str, mimeType: str, options: ExtractionOptions) -> ContentExtracted:
+
extractor = extractorRegistry.resolve(mimeType, fileName)
if extractor is None:
# fallback: single binary part
@@ -71,15 +71,14 @@ def runExtraction(extractorRegistry: ExtractorRegistry, chunkerRegistry: Chunker
)
return ContentExtracted(id=makeId(), parts=[part])
- parts = extractor.extract(documentBytes, {"fileName": fileName, "mimeType": mimeType, "options": options})
+ parts = extractor.extract(documentBytes, {"fileName": fileName, "mimeType": mimeType})
# REMOVED: poolAndLimit(parts, chunkerRegistry, options)
# REMOVED: Chunking logic - now handled in AI call phase
# Apply merging strategy if provided (preserve existing logic)
- mergeStrategy = options.get("mergeStrategy", {})
- if mergeStrategy:
- parts = _applyMerging(parts, mergeStrategy)
+ if options.mergeStrategy:
+ parts = _applyMerging(parts, options.mergeStrategy)
return ContentExtracted(id=makeId(), parts=parts)
@@ -87,17 +86,17 @@ def runExtraction(extractorRegistry: ExtractorRegistry, chunkerRegistry: Chunker
# REMOVED: poolAndLimit function - chunking now handled in AI call phase
-def _applyMerging(parts: List[ContentPart], strategy: Dict[str, Any]) -> List[ContentPart]:
+def _applyMerging(parts: List[ContentPart], strategy: MergeStrategy) -> List[ContentPart]:
"""Apply merging strategy to parts with intelligent token-aware merging."""
logger.debug(f"_applyMerging called with {len(parts)} parts")
# Check if intelligent merging is enabled
- if strategy.get("useIntelligentMerging", False):
- model_capabilities = strategy.get("capabilities", {})
+ if strategy.useIntelligentMerging:
+ model_capabilities = strategy.capabilities or {}
subMerger = IntelligentTokenAwareMerger(model_capabilities)
# Use intelligent merging for all parts
- merged = subMerger.merge_chunks_intelligently(parts, strategy.get("prompt", ""))
+ merged = subMerger.merge_chunks_intelligently(parts, strategy.prompt or "")
# Calculate and log optimization stats
stats = subMerger.calculate_optimization_stats(parts, merged)
diff --git a/test_ai_model_selection.py b/test_ai_model_selection.py
index ea8a6798..62324cba 100644
--- a/test_ai_model_selection.py
+++ b/test_ai_model_selection.py
@@ -90,7 +90,7 @@ class ModelSelectionTester:
totalScore = sizeRating + processingModeRating + priorityRating
print(
- f" {idx:>2}. {m.name} | Q={getattr(m, 'qualityRating', 0)} | S={getattr(m, 'speedRating', 0)} | ${costIn:.4f} | ctx={getattr(m, 'contextLength', 0)} | score={totalScore:.3f}"
+ f" {idx:>2}. {m.displayName} | Q={getattr(m, 'qualityRating', 0)} | S={getattr(m, 'speedRating', 0)} | ${costIn:.4f} | ctx={getattr(m, 'contextLength', 0)} | score={totalScore:.3f}"
)
print(f" Size: {sizeRating:.3f}, ProcessingMode: {processingModeRating:.3f}, Priority: {priorityRating:.3f}")
@@ -136,7 +136,7 @@ class ModelSelectionTester:
totalScore = sizeRating + processingModeRating + priorityRating
print(
- f" {idx:>2}. {m.name} | Q={getattr(m, 'qualityRating', 0)} | S={getattr(m, 'speedRating', 0)} | ${costIn:.4f} | ctx={getattr(m, 'contextLength', 0)} | score={totalScore:.3f}"
+ f" {idx:>2}. {m.displayName} | Q={getattr(m, 'qualityRating', 0)} | S={getattr(m, 'speedRating', 0)} | ${costIn:.4f} | ctx={getattr(m, 'contextLength', 0)} | score={totalScore:.3f}"
)
print(f" Size: {sizeRating:.3f}, ProcessingMode: {processingModeRating:.3f}, Priority: {priorityRating:.3f}")
@@ -365,8 +365,8 @@ class ModelSelectionTester:
)
if failoverModelList:
- print(f" Selected model: {failoverModelList[0].name}")
- print(f" Fallback models: {[m.name for m in failoverModelList[1:3]]}")
+ print(f" Selected model: {failoverModelList[0].displayName}")
+ print(f" Fallback models: {[m.displayName for m in failoverModelList[1:3]]}")
else:
print(" No suitable models found")
@@ -393,8 +393,8 @@ class ModelSelectionTester:
)
if failoverModelList:
- print(f" Selected model: {failoverModelList[0].name}")
- print(f" Fallback models: {[m.name for m in failoverModelList[1:3]]}")
+ print(f" Selected model: {failoverModelList[0].displayName}")
+ print(f" Fallback models: {[m.displayName for m in failoverModelList[1:3]]}")
else:
print(" No suitable models found")
@@ -421,8 +421,8 @@ class ModelSelectionTester:
)
if failoverModelList:
- print(f" Selected model: {failoverModelList[0].name}")
- print(f" Fallback models: {[m.name for m in failoverModelList[1:3]]}")
+ print(f" Selected model: {failoverModelList[0].displayName}")
+ print(f" Fallback models: {[m.displayName for m in failoverModelList[1:3]]}")
else:
print(" No suitable models found")
@@ -449,8 +449,8 @@ class ModelSelectionTester:
)
if failoverModelList:
- print(f" Selected model: {failoverModelList[0].name}")
- print(f" Fallback models: {[m.name for m in failoverModelList[1:3]]}")
+ print(f" Selected model: {failoverModelList[0].displayName}")
+ print(f" Fallback models: {[m.displayName for m in failoverModelList[1:3]]}")
else:
print(" No suitable models found")
@@ -479,15 +479,15 @@ class ModelSelectionTester:
print(f" {connector_type}: {len(models)} models")
for model in models:
capabilities = getattr(model, 'capabilities', [])
- print(f" - {model.name}: {capabilities}")
+ print(f" - {model.displayName}: {capabilities}")
# Show operation type support
print(f"\nOperation type support:")
for op_type in OperationTypeEnum:
- supported_models = [m for m in availableModels if hasattr(m, 'operationTypes') and op_type in m.operationTypes]
+ supported_models = [m for m in availableModels if hasattr(m, 'operationTypes') and any(ot.operationType == op_type for ot in m.operationTypes)]
print(f" {op_type.name}: {len(supported_models)} models")
if supported_models:
- model_names = [m.name for m in supported_models[:3]] # Show first 3 models
+ model_names = [m.displayName for m in supported_models[:3]] # Show first 3 models
print(f" Models: {', '.join(model_names)}")