cleanup pydantic v2, unnecessary pdantic to dict convesrions, unnecessary unions removed with clean classes

This commit is contained in:
ValueOn AG 2025-10-24 22:46:05 +02:00
parent 4f7bba5f33
commit 8523da7fe2
15 changed files with 188 additions and 319 deletions

View file

@ -50,7 +50,7 @@ class AiAnthropic(BaseConnectorAi):
connectorType="anthropic", connectorType="anthropic",
apiUrl="https://api.anthropic.com/v1/messages", apiUrl="https://api.anthropic.com/v1/messages",
temperature=0.2, temperature=0.2,
maxTokens=200000, maxTokens=8192,
contextLength=200000, contextLength=200000,
costPer1kTokensInput=0.015, costPer1kTokensInput=0.015,
costPer1kTokensOutput=0.075, costPer1kTokensOutput=0.075,
@ -75,7 +75,7 @@ class AiAnthropic(BaseConnectorAi):
connectorType="anthropic", connectorType="anthropic",
apiUrl="https://api.anthropic.com/v1/messages", apiUrl="https://api.anthropic.com/v1/messages",
temperature=0.2, temperature=0.2,
maxTokens=200000, maxTokens=8192,
contextLength=200000, contextLength=200000,
costPer1kTokensInput=0.015, costPer1kTokensInput=0.015,
costPer1kTokensOutput=0.075, costPer1kTokensOutput=0.075,

View file

@ -720,22 +720,9 @@ class DatabaseConnector:
logger.info(f"Initial ID {initialId} for table {table} registered") logger.info(f"Initial ID {initialId} for table {table} registered")
return success return success
else: else:
# Check if the existing initial ID still exists in the table # Table already has an initial ID registered
existingInitialId = systemData[table] logger.debug(f"Table {table} already has initial ID {systemData[table]}")
records = self.getRecordset( return True
model_class, recordFilter={"id": existingInitialId}
)
if not records:
# The initial record no longer exists, update to the new one
systemData[table] = initialId
success = self._saveSystemTable(systemData)
if success:
logger.info(
f"Initial ID updated from {existingInitialId} to {initialId} for table {table}"
)
return success
else:
return True
except Exception as e: except Exception as e:
logger.error(f"Error registering the initial ID for table {table}: {e}") logger.error(f"Error registering the initial ID for table {table}: {e}")
return False return False

View file

@ -135,7 +135,6 @@ class AiCallOptions(BaseModel):
compressPrompt: bool = Field(default=True, description="Whether to compress the prompt") compressPrompt: bool = Field(default=True, description="Whether to compress the prompt")
compressContext: bool = Field(default=True, description="If False: process each chunk; If True: summarize and work on summary") compressContext: bool = Field(default=True, description="If False: process each chunk; If True: summarize and work on summary")
processDocumentsIndividually: bool = Field(default=True, description="If True, process each document separately; else pool docs") processDocumentsIndividually: bool = Field(default=True, description="If True, process each document separately; else pool docs")
maxContextBytes: Optional[int] = Field(default=None, description="Hard cap for extracted context size passed to the model")
maxCost: Optional[float] = Field(default=None, description="Max cost budget") maxCost: Optional[float] = Field(default=None, description="Max cost budget")
maxProcessingTime: Optional[int] = Field(default=None, description="Max processing time in seconds") maxProcessingTime: Optional[int] = Field(default=None, description="Max processing time in seconds")
processingMode: ProcessingModeEnum = Field(default=ProcessingModeEnum.BASIC, description="Processing mode") processingMode: ProcessingModeEnum = Field(default=ProcessingModeEnum.BASIC, description="Processing mode")
@ -145,7 +144,6 @@ class AiCallOptions(BaseModel):
# Model generation parameters # Model generation parameters
temperature: Optional[float] = Field(default=None, ge=0.0, le=2.0, description="Temperature for response generation (0.0-2.0, lower = more consistent)") temperature: Optional[float] = Field(default=None, ge=0.0, le=2.0, description="Temperature for response generation (0.0-2.0, lower = more consistent)")
maxTokens: Optional[int] = Field(default=None, ge=1, le=32000, description="Maximum tokens in response")
maxParts: Optional[int] = Field(default=1000, ge=1, le=1000, description="Maximum number of continuation parts to fetch") maxParts: Optional[int] = Field(default=1000, ge=1, le=1000, description="Maximum number of continuation parts to fetch")
@ -170,32 +168,6 @@ class AiCallResponse(BaseModel):
errorCount: int = Field(default=0, description="0 for success, 1+ for errors") errorCount: int = Field(default=0, description="0 for success, 1+ for errors")
class EnhancedAiCallOptions(AiCallOptions):
"""Enhanced options for improved document processing with chunk mapping."""
# Parallel processing
enableParallelProcessing: bool = Field(
default=True,
description="Enable parallel processing of chunks"
)
maxConcurrentChunks: int = Field(
default=5,
ge=1,
le=20,
description="Maximum number of chunks to process concurrently"
)
# Chunk mapping
preserveChunkMetadata: bool = Field(
default=True,
description="Preserve chunk metadata during processing"
)
chunkSeparator: str = Field(
default="\n\n---\n\n",
description="Separator between chunks in merged output"
)
class AiModelCall(BaseModel): class AiModelCall(BaseModel):
"""Standardized input for AI model calls.""" """Standardized input for AI model calls."""

View file

@ -1,6 +1,9 @@
from typing import Any, Dict, List, Optional, Literal from typing import Any, Dict, List, Optional, Literal, TYPE_CHECKING
from pydantic import BaseModel, Field from pydantic import BaseModel, Field
if TYPE_CHECKING:
from modules.datamodels.datamodelAi import OperationTypeEnum
class ContentPart(BaseModel): class ContentPart(BaseModel):
id: str = Field(description="Unique content part identifier") id: str = Field(description="Unique content part identifier")
@ -40,106 +43,49 @@ class PartResult(BaseModel):
class MergeStrategy(BaseModel): class MergeStrategy(BaseModel):
"""Strategy configuration for merging content parts and AI results.""" """Strategy configuration for merging content parts and AI results."""
groupBy: str = Field(default="typeGroup", description="Field to group parts by (typeGroup, parentId, label, etc.)")
orderBy: str = Field(default="id", description="Field to order parts within groups (id, order, pageIndex, etc.)")
mergeType: Literal["concatenate", "hierarchical", "intelligent"] = Field(default="concatenate", description="How to merge content within groups")
maxSize: Optional[int] = Field(default=None, description="Maximum size for merged content in bytes")
textMerge: Optional[Dict[str, Any]] = Field(default=None, description="Text-specific merge settings (separator, formatting, etc.)")
tableMerge: Optional[Dict[str, Any]] = Field(default=None, description="Table-specific merge settings (header handling, etc.)")
structureMerge: Optional[Dict[str, Any]] = Field(default=None, description="Structure-specific merge settings (hierarchy, etc.)")
aiResultMerge: Optional[Dict[str, Any]] = Field(default=None, description="AI result merging settings (prompt, context, etc.)")
preserveChunks: bool = Field(default=False, description="Whether to preserve individual chunks or merge them")
chunkSeparator: str = Field(default="\n\n---\n\n", description="Separator between chunks when merging")
preserveMetadata: bool = Field(default=True, description="Whether to preserve metadata from original parts")
metadataFields: Optional[List[str]] = Field(default=None, description="Specific metadata fields to preserve (None = all)")
onError: Literal["skip", "include", "fail"] = Field(default="skip", description="How to handle errors during merging")
validateContent: bool = Field(default=True, description="Whether to validate content before merging")
useIntelligentMerging: bool = Field(default=False, description="Whether to use intelligent token-aware merging")
prompt: Optional[str] = Field(default=None, description="Prompt for intelligent merging")
capabilities: Optional[Dict[str, Any]] = Field(default=None, description="Model capabilities for intelligent merging")
# Grouping configuration
groupBy: str = Field(
default="typeGroup",
description="Field to group parts by (typeGroup, parentId, label, etc.)"
)
# Ordering configuration
orderBy: str = Field(
default="id",
description="Field to order parts within groups (id, order, pageIndex, etc.)"
)
# Merge behavior
mergeType: Literal["concatenate", "hierarchical", "intelligent"] = Field(
default="concatenate",
description="How to merge content within groups"
)
# Size limits
maxSize: Optional[int] = Field(
default=None,
description="Maximum size for merged content in bytes"
)
# Type-specific merge settings
textMerge: Optional[Dict[str, Any]] = Field(
default=None,
description="Text-specific merge settings (separator, formatting, etc.)"
)
tableMerge: Optional[Dict[str, Any]] = Field(
default=None,
description="Table-specific merge settings (header handling, etc.)"
)
structureMerge: Optional[Dict[str, Any]] = Field(
default=None,
description="Structure-specific merge settings (hierarchy, etc.)"
)
# AI result merging
aiResultMerge: Optional[Dict[str, Any]] = Field(
default=None,
description="AI result merging settings (prompt, context, etc.)"
)
# Chunk handling
preserveChunks: bool = Field(
default=False,
description="Whether to preserve individual chunks or merge them"
)
chunkSeparator: str = Field(
default="\n\n---\n\n",
description="Separator between chunks when merging"
)
# Metadata handling
preserveMetadata: bool = Field(
default=True,
description="Whether to preserve metadata from original parts"
)
metadataFields: Optional[List[str]] = Field(
default=None,
description="Specific metadata fields to preserve (None = all)"
)
# Error handling
onError: Literal["skip", "include", "fail"] = Field(
default="skip",
description="How to handle errors during merging"
)
# Validation
validateContent: bool = Field(
default=True,
description="Whether to validate content before merging"
)
def getTypeSpecificSettings(self, typeGroup: str) -> Dict[str, Any]:
"""Get type-specific merge settings for a content type."""
if typeGroup == "text" and self.textMerge:
return self.textMerge
elif typeGroup == "table" and self.tableMerge:
return self.tableMerge
elif typeGroup == "structure" and self.structureMerge:
return self.structureMerge
else:
return {}
def shouldPreserveChunk(self, chunk: Dict[str, Any]) -> bool:
"""Determine if a chunk should be preserved based on strategy."""
if not self.preserveChunks:
return False
# Check if chunk has error metadata
if self.onError == "skip" and chunk.get("metadata", {}).get("error"):
return False
return True
class ExtractionOptions(BaseModel):
"""Options for document extraction and processing with clear data structures."""
# Core extraction parameters
prompt: str = Field(description="Extraction prompt for AI processing")
operationType: 'OperationTypeEnum' = Field(description="Type of operation for AI processing")
processDocumentsIndividually: bool = Field(default=True, description="Process each document separately")
# Image processing parameters
imageMaxPixels: int = Field(default=1024 * 1024, ge=1, description="Maximum pixels for image processing")
imageQuality: int = Field(default=85, ge=1, le=100, description="Image quality (1-100)")
# Merging strategy
mergeStrategy: MergeStrategy = Field(description="Strategy for merging extraction results")
# Optional chunking parameters (for backward compatibility)
chunkAllowed: Optional[bool] = Field(default=None, description="Whether chunking is allowed")
maxSize: Optional[int] = Field(default=None, description="Maximum size for processing")
textChunkSize: Optional[int] = Field(default=None, description="Size for text chunks")
imageChunkSize: Optional[int] = Field(default=None, description="Size for image chunks")
# Additional processing options
enableParallelProcessing: bool = Field(default=True, description="Enable parallel processing of chunks")
maxConcurrentChunks: int = Field(default=5, ge=1, le=20, description="Maximum number of chunks to process concurrently")
class Config:
arbitraryTypesAllowed = True # Allow OperationTypeEnum import

View file

@ -127,7 +127,7 @@ class AiObjects:
logger.info(f"Attempting AI call with model: {model.name} (attempt {attempt + 1}/{len(failoverModelList)})") logger.info(f"Attempting AI call with model: {model.name} (attempt {attempt + 1}/{len(failoverModelList)})")
# Call the model # Call the model
response = await self._callWithModel(model, prompt, context) response = await self._callWithModel(model, prompt, context, options)
logger.info(f"✅ AI call successful with model: {model.name}") logger.info(f"✅ AI call successful with model: {model.name}")
return response return response
@ -204,7 +204,7 @@ class AiObjects:
if partSize <= modelContextBytes: if partSize <= modelContextBytes:
# Part fits - call AI directly # Part fits - call AI directly
response = await self._callWithModel(model, prompt, contentPart.data) response = await self._callWithModel(model, prompt, contentPart.data, options)
logger.info(f"✅ Content part processed successfully with model: {model.name}") logger.info(f"✅ Content part processed successfully with model: {model.name}")
return response return response
else: else:
@ -216,7 +216,7 @@ class AiObjects:
# Process each chunk # Process each chunk
chunkResults = [] chunkResults = []
for chunk in chunks: for chunk in chunks:
chunkResponse = await self._callWithModel(model, prompt, chunk['data']) chunkResponse = await self._callWithModel(model, prompt, chunk['data'], options)
chunkResults.append(chunkResponse) chunkResults.append(chunkResponse)
# Merge chunk results # Merge chunk results
@ -393,7 +393,7 @@ class AiObjects:
errorCount=1 errorCount=1
) )
async def _callWithModel(self, model: AiModel, prompt: str, context: str) -> AiCallResponse: async def _callWithModel(self, model: AiModel, prompt: str, context: str, options: AiCallOptions = None) -> AiCallResponse:
"""Call a specific model and return the response.""" """Call a specific model and return the response."""
# Calculate input bytes from prompt and context # Calculate input bytes from prompt and context
inputBytes = len((prompt + context).encode('utf-8')) inputBytes = len((prompt + context).encode('utf-8'))
@ -430,7 +430,8 @@ class AiObjects:
# Create standardized call object # Create standardized call object
modelCall = AiModelCall( modelCall = AiModelCall(
messages=messages, messages=messages,
model=model model=model,
options=options or {}
) )
# Call the model with standardized interface # Call the model with standardized interface

View file

@ -873,7 +873,7 @@ class ChatObjects:
stat = ChatStat(**statData) stat = ChatStat(**statData)
# Create the stat record in the database # Create the stat record in the database
created = self.db.recordCreate(ChatStat, stat.model_dump()) created = self.db.recordCreate(ChatStat, stat)
# Return the created ChatStat # Return the created ChatStat
return ChatStat(**created) return ChatStat(**created)
@ -937,7 +937,7 @@ class ChatObjects:
items.append({ items.append({
"type": "message", "type": "message",
"createdAt": msg_timestamp, "createdAt": msg_timestamp,
"item": chat_message.model_dump() "item": chat_message
}) })
# Get logs # Get logs
@ -952,7 +952,7 @@ class ChatObjects:
items.append({ items.append({
"type": "log", "type": "log",
"createdAt": log_timestamp, "createdAt": log_timestamp,
"item": chat_log.model_dump() "item": chat_log
}) })
# Get stats list # Get stats list
@ -966,7 +966,7 @@ class ChatObjects:
items.append({ items.append({
"type": "stat", "type": "stat",
"createdAt": stat_timestamp, "createdAt": stat_timestamp,
"item": stat.model_dump() "item": stat
}) })
# Sort all items by createdAt timestamp for chronological order # Sort all items by createdAt timestamp for chronological order

View file

@ -48,11 +48,8 @@ async def create_prompt(
"""Create a new prompt""" """Create a new prompt"""
managementInterface = interfaceDbComponentObjects.getInterface(currentUser) managementInterface = interfaceDbComponentObjects.getInterface(currentUser)
# Convert Prompt to dict for interface
prompt_data = prompt.model_dump()
# Create prompt # Create prompt
newPrompt = managementInterface.createPrompt(prompt_data) newPrompt = managementInterface.createPrompt(prompt)
return Prompt(**newPrompt) return Prompt(**newPrompt)

View file

@ -92,11 +92,8 @@ async def create_user(
"""Create a new user""" """Create a new user"""
appInterface = interfaceDbAppObjects.getInterface(currentUser) appInterface = interfaceDbAppObjects.getInterface(currentUser)
# Convert User to dict for interface
user_dict = user_data.model_dump()
# Create user # Create user
newUser = appInterface.createUser(user_dict) newUser = appInterface.createUser(user_data)
return newUser return newUser
@ -119,11 +116,8 @@ async def update_user(
detail=f"User with ID {userId} not found" detail=f"User with ID {userId} not found"
) )
# Convert User to dict for interface
update_data = userData.model_dump()
# Update user # Update user
updatedUser = appInterface.updateUser(userId, update_data) updatedUser = appInterface.updateUser(userId, userData)
if not updatedUser: if not updatedUser:
raise HTTPException( raise HTTPException(

View file

@ -5,7 +5,7 @@ import time
from typing import Dict, Any, List, Optional, Tuple, Union from typing import Dict, Any, List, Optional, Tuple, Union
from modules.datamodels.datamodelChat import ChatDocument from modules.datamodels.datamodelChat import ChatDocument
from modules.datamodels.datamodelAi import AiCallRequest, AiCallOptions, OperationTypeEnum, PriorityEnum from modules.datamodels.datamodelAi import AiCallRequest, AiCallOptions, OperationTypeEnum, PriorityEnum
from modules.datamodels.datamodelExtraction import ChunkResult, ContentExtracted, PartResult from modules.datamodels.datamodelExtraction import ChunkResult, ContentExtracted, PartResult, ExtractionOptions, MergeStrategy
from modules.services.serviceExtraction.mainServiceExtraction import ExtractionService from modules.services.serviceExtraction.mainServiceExtraction import ExtractionService
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
@ -33,19 +33,6 @@ class SubDocumentProcessing:
self._extractionService = ExtractionService(self.services) self._extractionService = ExtractionService(self.services)
return self._extractionService return self._extractionService
def _calculateMaxContextBytes(self, options: Optional[AiCallOptions]) -> int:
"""Calculate maximum context bytes based on model capabilities and options."""
if options and options.maxContextBytes:
return options.maxContextBytes
# Default model capabilities (this should be enhanced with actual model registry)
defaultMaxTokens = 4000
safetyMargin = options.safetyMargin if options else 0.1
# Calculate bytes (4 chars per token estimation)
maxContextBytes = int(defaultMaxTokens * (1 - safetyMargin) * 4)
return maxContextBytes
async def processDocumentsPerChunk( async def processDocumentsPerChunk(
self, self,
@ -68,22 +55,23 @@ class SubDocumentProcessing:
if not documents: if not documents:
return "" return ""
# Build extraction options WITHOUT chunking parameters # Build extraction options using Pydantic model
extractionOptions: Dict[str, Any] = { mergeStrategy = MergeStrategy(
"prompt": prompt, useIntelligentMerging=True,
"operationType": options.operationType if options else OperationTypeEnum.DATA_EXTRACT, prompt=prompt,
"processDocumentsIndividually": True, groupBy="typeGroup",
# REMOVED: maxSize, textChunkSize, imageChunkSize orderBy="id",
"mergeStrategy": { mergeType="concatenate"
"useIntelligentMerging": True, )
"prompt": prompt,
"groupBy": "typeGroup",
"orderBy": "id",
"mergeType": "concatenate"
},
}
logger.debug(f"Per-chunk extraction options: prompt length={len(extractionOptions.get('prompt', ''))} chars, operationType={extractionOptions.get('operationType')}") extractionOptions = ExtractionOptions(
prompt=prompt,
operationType=options.operationType if options else OperationTypeEnum.DATA_EXTRACT,
processDocumentsIndividually=True,
mergeStrategy=mergeStrategy
)
logger.debug(f"Per-chunk extraction options: prompt length={len(extractionOptions.prompt)} chars, operationType={extractionOptions.operationType}")
try: try:
# Extract content WITHOUT chunking # Extract content WITHOUT chunking
@ -120,21 +108,23 @@ class SubDocumentProcessing:
if not documents: if not documents:
return {"metadata": {"title": "Empty Document"}, "sections": []} return {"metadata": {"title": "Empty Document"}, "sections": []}
# Build extraction options WITHOUT chunking parameters # Build extraction options using Pydantic model
extractionOptions: Dict[str, Any] = { mergeStrategy = MergeStrategy(
"prompt": prompt, useIntelligentMerging=True,
"operationType": options.operationType if options else OperationTypeEnum.DATA_EXTRACT, prompt=prompt,
"processDocumentsIndividually": True, groupBy="typeGroup",
"mergeStrategy": { orderBy="id",
"useIntelligentMerging": True, mergeType="concatenate"
"prompt": prompt, )
"groupBy": "typeGroup",
"orderBy": "id",
"mergeType": "concatenate"
},
}
logger.debug(f"Per-chunk extraction options (JSON mode): prompt length={len(extractionOptions.get('prompt', ''))} chars, operationType={extractionOptions.get('operationType')}") extractionOptions = ExtractionOptions(
prompt=prompt,
operationType=options.operationType if options else OperationTypeEnum.DATA_EXTRACT,
processDocumentsIndividually=True,
mergeStrategy=mergeStrategy
)
logger.debug(f"Per-chunk extraction options (JSON mode): prompt length={len(extractionOptions.prompt)} chars, operationType={extractionOptions.operationType}")
try: try:
# Extract content WITHOUT chunking # Extract content WITHOUT chunking
@ -205,31 +195,25 @@ class SubDocumentProcessing:
if not documents: if not documents:
return {"metadata": {"title": "Empty Document"}, "sections": []} return {"metadata": {"title": "Empty Document"}, "sections": []}
# Get model capabilities for size calculation # Build extraction options using Pydantic model (model-aware chunking in AI call phase)
model_capabilities = self._getModelCapabilitiesForContent(custom_prompt, documents, options) mergeStrategy = MergeStrategy(
useIntelligentMerging=True,
prompt=custom_prompt,
groupBy="typeGroup",
orderBy="id",
mergeType="concatenate"
)
# Build extraction options for chunking with intelligent merging extractionOptions = ExtractionOptions(
extractionOptions: Dict[str, Any] = { prompt=custom_prompt, # Use the custom prompt instead of default
"prompt": custom_prompt, # Use the custom prompt instead of default operationType=options.operationType if options else OperationTypeEnum.DATA_EXTRACT,
"operationType": options.operationType if options else OperationTypeEnum.DATA_EXTRACT, processDocumentsIndividually=True, # Process each document separately
"processDocumentsIndividually": True, # Process each document separately imageMaxPixels=1024 * 1024,
"maxSize": model_capabilities["maxContextBytes"], imageQuality=85,
"chunkAllowed": True, mergeStrategy=mergeStrategy
"textChunkSize": model_capabilities["textChunkSize"], )
"imageChunkSize": model_capabilities["imageChunkSize"],
"imageMaxPixels": 1024 * 1024,
"imageQuality": 85,
"mergeStrategy": {
"useIntelligentMerging": True, # Enable intelligent token-aware merging
"capabilities": model_capabilities,
"prompt": custom_prompt, # Use the custom prompt
"groupBy": "typeGroup",
"orderBy": "id",
"mergeType": "concatenate"
},
}
logger.debug(f"Per-chunk extraction options (JSON mode): prompt length={len(extractionOptions.get('prompt', ''))} chars, operationType={extractionOptions.get('operationType')}") logger.debug(f"Per-chunk extraction options (JSON mode): prompt length={len(extractionOptions.prompt)} chars, operationType={extractionOptions.operationType}")
try: try:
# Extract content with chunking # Extract content with chunking
@ -1042,15 +1026,13 @@ CONTINUATION INSTRUCTIONS:
content_parts.append(content_part) content_parts.append(content_part)
# Use existing merging strategy from options # Use existing merging strategy from options
merge_strategy = { merge_strategy = MergeStrategy(
"useIntelligentMerging": True, useIntelligentMerging=True,
"groupBy": "documentId", # Group by document groupBy="documentId", # Group by document
"orderBy": "partIndex", # Order by part index orderBy="partIndex", # Order by part index
"mergeType": "concatenate" mergeType="concatenate"
} )
if options and hasattr(options, 'mergeStrategy'):
merge_strategy.update(options.mergeStrategy)
# Apply existing merging logic using the sophisticated merging system # Apply existing merging logic using the sophisticated merging system
from modules.services.serviceExtraction.subPipeline import _applyMerging from modules.services.serviceExtraction.subPipeline import _applyMerging
@ -1095,15 +1077,13 @@ CONTINUATION INSTRUCTIONS:
content_parts.append(content_part) content_parts.append(content_part)
# Use existing merging strategy for JSON mode # Use existing merging strategy for JSON mode
merge_strategy = { merge_strategy = MergeStrategy(
"useIntelligentMerging": True, useIntelligentMerging=True,
"groupBy": "documentId", # Group by document groupBy="documentId", # Group by document
"orderBy": "partIndex", # Order by part index orderBy="partIndex", # Order by part index
"mergeType": "concatenate" mergeType="concatenate"
} )
if options and hasattr(options, 'mergeStrategy'):
merge_strategy.update(options.mergeStrategy)
# Apply existing merging logic using the sophisticated merging system # Apply existing merging logic using the sophisticated merging system
from modules.services.serviceExtraction.subPipeline import _applyMerging from modules.services.serviceExtraction.subPipeline import _applyMerging
@ -1234,15 +1214,13 @@ CONTINUATION INSTRUCTIONS:
content_parts.append(content_part) content_parts.append(content_part)
# Use existing merging strategy from options # Use existing merging strategy from options
merge_strategy = { merge_strategy = MergeStrategy(
"useIntelligentMerging": True, useIntelligentMerging=True,
"groupBy": "documentId", # Group by document groupBy="documentId", # Group by document
"orderBy": "chunkIndex", # Order by chunk index orderBy="chunkIndex", # Order by chunk index
"mergeType": "concatenate" mergeType="concatenate"
} )
if options and hasattr(options, 'mergeStrategy'):
merge_strategy.update(options.mergeStrategy)
# Apply existing merging logic using the sophisticated merging system # Apply existing merging logic using the sophisticated merging system
from modules.services.serviceExtraction.subPipeline import _applyMerging from modules.services.serviceExtraction.subPipeline import _applyMerging
@ -1297,15 +1275,13 @@ CONTINUATION INSTRUCTIONS:
content_parts.append(content_part) content_parts.append(content_part)
# Use existing merging strategy for clean mode # Use existing merging strategy for clean mode
merge_strategy = { merge_strategy = MergeStrategy(
"useIntelligentMerging": True, useIntelligentMerging=True,
"groupBy": "documentId", # Group by document groupBy="documentId", # Group by document
"orderBy": "chunkIndex", # Order by chunk index orderBy="chunkIndex", # Order by chunk index
"mergeType": "concatenate" mergeType="concatenate"
} )
if options and hasattr(options, 'mergeStrategy'):
merge_strategy.update(options.mergeStrategy)
# Apply existing merging logic using the sophisticated merging system # Apply existing merging logic using the sophisticated merging system
from modules.services.serviceExtraction.subPipeline import _applyMerging from modules.services.serviceExtraction.subPipeline import _applyMerging
@ -1351,15 +1327,13 @@ CONTINUATION INSTRUCTIONS:
content_parts.append(content_part) content_parts.append(content_part)
# Use existing merging strategy for JSON mode # Use existing merging strategy for JSON mode
merge_strategy = { merge_strategy = MergeStrategy(
"useIntelligentMerging": True, useIntelligentMerging=True,
"groupBy": "documentId", # Group by document groupBy="documentId", # Group by document
"orderBy": "chunkIndex", # Order by chunk index orderBy="chunkIndex", # Order by chunk index
"mergeType": "concatenate" mergeType="concatenate"
} )
if options and hasattr(options, 'mergeStrategy'):
merge_strategy.update(options.mergeStrategy)
# Apply existing merging logic using the sophisticated merging system # Apply existing merging logic using the sophisticated merging system
from modules.services.serviceExtraction.subPipeline import _applyMerging from modules.services.serviceExtraction.subPipeline import _applyMerging
@ -1455,5 +1429,3 @@ CONTINUATION INSTRUCTIONS:
logger.info(f"Merged {len(chunkResults)} chunks using existing sophisticated merging system (JSON mode)") logger.info(f"Merged {len(chunkResults)} chunks using existing sophisticated merging system (JSON mode)")
return merged_document return merged_document
# REMOVED: _getModelCapabilitiesForContent method - no longer needed with model-aware chunking

View file

@ -5,7 +5,7 @@ import time
from .subRegistry import ExtractorRegistry, ChunkerRegistry from .subRegistry import ExtractorRegistry, ChunkerRegistry
from .subPipeline import runExtraction from .subPipeline import runExtraction
from modules.datamodels.datamodelExtraction import ContentExtracted, ContentPart, MergeStrategy from modules.datamodels.datamodelExtraction import ContentExtracted, ContentPart, MergeStrategy, ExtractionOptions
from modules.datamodels.datamodelChat import ChatDocument from modules.datamodels.datamodelChat import ChatDocument
from modules.datamodels.datamodelAi import AiCallResponse from modules.datamodels.datamodelAi import AiCallResponse
from modules.aicore.aicoreModelRegistry import modelRegistry from modules.aicore.aicoreModelRegistry import modelRegistry
@ -20,7 +20,7 @@ class ExtractionService:
self._extractorRegistry = ExtractorRegistry() self._extractorRegistry = ExtractorRegistry()
self._chunkerRegistry = ChunkerRegistry() self._chunkerRegistry = ChunkerRegistry()
def extractContent(self, documents: List[ChatDocument], options: Dict[str, Any]) -> List[ContentExtracted]: def extractContent(self, documents: List[ChatDocument], options: ExtractionOptions) -> List[ContentExtracted]:
""" """
Extract content from a list of ChatDocument objects. Extract content from a list of ChatDocument objects.
@ -31,6 +31,7 @@ class ExtractionService:
Returns: Returns:
List of ContentExtracted objects, one per input document List of ContentExtracted objects, one per input document
""" """
results: List[ContentExtracted] = [] results: List[ContentExtracted] = []
# Lazy import to avoid circular deps and heavy init at module import # Lazy import to avoid circular deps and heavy init at module import

View file

@ -1,9 +1,9 @@
from typing import Any, Dict, List from typing import Any, Dict, List
from modules.datamodels.datamodelExtraction import ContentPart from modules.datamodels.datamodelExtraction import ContentPart, MergeStrategy
class DefaultMerger: class DefaultMerger:
def merge(self, parts: List[ContentPart], strategy: Dict[str, Any]) -> List[ContentPart]: def merge(self, parts: List[ContentPart], strategy: MergeStrategy) -> List[ContentPart]:
""" """
Default merger that passes through parts unchanged. Default merger that passes through parts unchanged.
Used for image, binary, metadata, container typeGroups. Used for image, binary, metadata, container typeGroups.

View file

@ -1,10 +1,10 @@
from typing import Any, Dict, List from typing import Any, Dict, List
from modules.datamodels.datamodelExtraction import ContentPart from modules.datamodels.datamodelExtraction import ContentPart, MergeStrategy
from ..subUtils import makeId from ..subUtils import makeId
class TableMerger: class TableMerger:
def merge(self, parts: List[ContentPart], strategy: Dict[str, Any]) -> List[ContentPart]: def merge(self, parts: List[ContentPart], strategy: MergeStrategy) -> List[ContentPart]:
""" """
Merge table parts based on strategy. Merge table parts based on strategy.
Strategy options: Strategy options:
@ -15,9 +15,9 @@ class TableMerger:
if not parts: if not parts:
return parts return parts
groupBy = strategy.get("groupBy", "parentId") groupBy = strategy.groupBy
maxSize = strategy.get("maxSize", 0) maxSize = strategy.maxSize or 0
combineSheets = strategy.get("combineSheets", False) combineSheets = strategy.tableMerge.get("combineSheets", False) if strategy.tableMerge else False
# Group parts # Group parts
groups = self._groupParts(parts, groupBy, combineSheets) groups = self._groupParts(parts, groupBy, combineSheets)

View file

@ -1,10 +1,10 @@
from typing import Any, Dict, List from typing import Any, Dict, List
from modules.datamodels.datamodelExtraction import ContentPart from modules.datamodels.datamodelExtraction import ContentPart, MergeStrategy
from ..subUtils import makeId from ..subUtils import makeId
class TextMerger: class TextMerger:
def merge(self, parts: List[ContentPart], strategy: Dict[str, Any]) -> List[ContentPart]: def merge(self, parts: List[ContentPart], strategy: MergeStrategy) -> List[ContentPart]:
""" """
Merge text parts based on strategy. Merge text parts based on strategy.
Strategy options: Strategy options:
@ -15,9 +15,9 @@ class TextMerger:
if not parts: if not parts:
return parts return parts
groupBy = strategy.get("groupBy", "parentId") groupBy = strategy.groupBy
orderBy = strategy.get("orderBy", "label") orderBy = strategy.orderBy
maxSize = strategy.get("maxSize", 0) maxSize = strategy.maxSize or 0
# Group parts # Group parts
groups = self._groupParts(parts, groupBy) groups = self._groupParts(parts, groupBy)

View file

@ -1,8 +1,7 @@
from typing import Any, Dict, List from typing import List
import logging import logging
import os
from modules.datamodels.datamodelExtraction import ContentExtracted, ContentPart from modules.datamodels.datamodelExtraction import ContentExtracted, ContentPart, ExtractionOptions, MergeStrategy
from .subUtils import makeId from .subUtils import makeId
from .subRegistry import ExtractorRegistry, ChunkerRegistry from .subRegistry import ExtractorRegistry, ChunkerRegistry
from .merging.mergerText import TextMerger from .merging.mergerText import TextMerger
@ -13,13 +12,13 @@ from .subMerger import IntelligentTokenAwareMerger
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
def _mergeParts(parts: List[ContentPart], mergeStrategy: Dict[str, Any]) -> List[ContentPart]: def _mergeParts(parts: List[ContentPart], mergeStrategy: MergeStrategy) -> List[ContentPart]:
"""Merge parts based on the provided strategy.""" """Merge parts based on the provided strategy."""
if not parts or not mergeStrategy: if not parts or not mergeStrategy:
return parts return parts
groupBy = mergeStrategy.get("groupBy", "typeGroup") groupBy = mergeStrategy.groupBy
orderBy = mergeStrategy.get("orderBy", "id") orderBy = mergeStrategy.orderBy
# Group parts by the specified field # Group parts by the specified field
groups = {} groups = {}
@ -56,7 +55,8 @@ def _mergeParts(parts: List[ContentPart], mergeStrategy: Dict[str, Any]) -> List
return merged_parts return merged_parts
def runExtraction(extractorRegistry: ExtractorRegistry, chunkerRegistry: ChunkerRegistry, documentBytes: bytes, fileName: str, mimeType: str, options: Dict[str, Any]) -> ContentExtracted: def runExtraction(extractorRegistry: ExtractorRegistry, chunkerRegistry: ChunkerRegistry, documentBytes: bytes, fileName: str, mimeType: str, options: ExtractionOptions) -> ContentExtracted:
extractor = extractorRegistry.resolve(mimeType, fileName) extractor = extractorRegistry.resolve(mimeType, fileName)
if extractor is None: if extractor is None:
# fallback: single binary part # fallback: single binary part
@ -71,15 +71,14 @@ def runExtraction(extractorRegistry: ExtractorRegistry, chunkerRegistry: Chunker
) )
return ContentExtracted(id=makeId(), parts=[part]) return ContentExtracted(id=makeId(), parts=[part])
parts = extractor.extract(documentBytes, {"fileName": fileName, "mimeType": mimeType, "options": options}) parts = extractor.extract(documentBytes, {"fileName": fileName, "mimeType": mimeType})
# REMOVED: poolAndLimit(parts, chunkerRegistry, options) # REMOVED: poolAndLimit(parts, chunkerRegistry, options)
# REMOVED: Chunking logic - now handled in AI call phase # REMOVED: Chunking logic - now handled in AI call phase
# Apply merging strategy if provided (preserve existing logic) # Apply merging strategy if provided (preserve existing logic)
mergeStrategy = options.get("mergeStrategy", {}) if options.mergeStrategy:
if mergeStrategy: parts = _applyMerging(parts, options.mergeStrategy)
parts = _applyMerging(parts, mergeStrategy)
return ContentExtracted(id=makeId(), parts=parts) return ContentExtracted(id=makeId(), parts=parts)
@ -87,17 +86,17 @@ def runExtraction(extractorRegistry: ExtractorRegistry, chunkerRegistry: Chunker
# REMOVED: poolAndLimit function - chunking now handled in AI call phase # REMOVED: poolAndLimit function - chunking now handled in AI call phase
def _applyMerging(parts: List[ContentPart], strategy: Dict[str, Any]) -> List[ContentPart]: def _applyMerging(parts: List[ContentPart], strategy: MergeStrategy) -> List[ContentPart]:
"""Apply merging strategy to parts with intelligent token-aware merging.""" """Apply merging strategy to parts with intelligent token-aware merging."""
logger.debug(f"_applyMerging called with {len(parts)} parts") logger.debug(f"_applyMerging called with {len(parts)} parts")
# Check if intelligent merging is enabled # Check if intelligent merging is enabled
if strategy.get("useIntelligentMerging", False): if strategy.useIntelligentMerging:
model_capabilities = strategy.get("capabilities", {}) model_capabilities = strategy.capabilities or {}
subMerger = IntelligentTokenAwareMerger(model_capabilities) subMerger = IntelligentTokenAwareMerger(model_capabilities)
# Use intelligent merging for all parts # Use intelligent merging for all parts
merged = subMerger.merge_chunks_intelligently(parts, strategy.get("prompt", "")) merged = subMerger.merge_chunks_intelligently(parts, strategy.prompt or "")
# Calculate and log optimization stats # Calculate and log optimization stats
stats = subMerger.calculate_optimization_stats(parts, merged) stats = subMerger.calculate_optimization_stats(parts, merged)

View file

@ -90,7 +90,7 @@ class ModelSelectionTester:
totalScore = sizeRating + processingModeRating + priorityRating totalScore = sizeRating + processingModeRating + priorityRating
print( print(
f" {idx:>2}. {m.name} | Q={getattr(m, 'qualityRating', 0)} | S={getattr(m, 'speedRating', 0)} | ${costIn:.4f} | ctx={getattr(m, 'contextLength', 0)} | score={totalScore:.3f}" f" {idx:>2}. {m.displayName} | Q={getattr(m, 'qualityRating', 0)} | S={getattr(m, 'speedRating', 0)} | ${costIn:.4f} | ctx={getattr(m, 'contextLength', 0)} | score={totalScore:.3f}"
) )
print(f" Size: {sizeRating:.3f}, ProcessingMode: {processingModeRating:.3f}, Priority: {priorityRating:.3f}") print(f" Size: {sizeRating:.3f}, ProcessingMode: {processingModeRating:.3f}, Priority: {priorityRating:.3f}")
@ -136,7 +136,7 @@ class ModelSelectionTester:
totalScore = sizeRating + processingModeRating + priorityRating totalScore = sizeRating + processingModeRating + priorityRating
print( print(
f" {idx:>2}. {m.name} | Q={getattr(m, 'qualityRating', 0)} | S={getattr(m, 'speedRating', 0)} | ${costIn:.4f} | ctx={getattr(m, 'contextLength', 0)} | score={totalScore:.3f}" f" {idx:>2}. {m.displayName} | Q={getattr(m, 'qualityRating', 0)} | S={getattr(m, 'speedRating', 0)} | ${costIn:.4f} | ctx={getattr(m, 'contextLength', 0)} | score={totalScore:.3f}"
) )
print(f" Size: {sizeRating:.3f}, ProcessingMode: {processingModeRating:.3f}, Priority: {priorityRating:.3f}") print(f" Size: {sizeRating:.3f}, ProcessingMode: {processingModeRating:.3f}, Priority: {priorityRating:.3f}")
@ -365,8 +365,8 @@ class ModelSelectionTester:
) )
if failoverModelList: if failoverModelList:
print(f" Selected model: {failoverModelList[0].name}") print(f" Selected model: {failoverModelList[0].displayName}")
print(f" Fallback models: {[m.name for m in failoverModelList[1:3]]}") print(f" Fallback models: {[m.displayName for m in failoverModelList[1:3]]}")
else: else:
print(" No suitable models found") print(" No suitable models found")
@ -393,8 +393,8 @@ class ModelSelectionTester:
) )
if failoverModelList: if failoverModelList:
print(f" Selected model: {failoverModelList[0].name}") print(f" Selected model: {failoverModelList[0].displayName}")
print(f" Fallback models: {[m.name for m in failoverModelList[1:3]]}") print(f" Fallback models: {[m.displayName for m in failoverModelList[1:3]]}")
else: else:
print(" No suitable models found") print(" No suitable models found")
@ -421,8 +421,8 @@ class ModelSelectionTester:
) )
if failoverModelList: if failoverModelList:
print(f" Selected model: {failoverModelList[0].name}") print(f" Selected model: {failoverModelList[0].displayName}")
print(f" Fallback models: {[m.name for m in failoverModelList[1:3]]}") print(f" Fallback models: {[m.displayName for m in failoverModelList[1:3]]}")
else: else:
print(" No suitable models found") print(" No suitable models found")
@ -449,8 +449,8 @@ class ModelSelectionTester:
) )
if failoverModelList: if failoverModelList:
print(f" Selected model: {failoverModelList[0].name}") print(f" Selected model: {failoverModelList[0].displayName}")
print(f" Fallback models: {[m.name for m in failoverModelList[1:3]]}") print(f" Fallback models: {[m.displayName for m in failoverModelList[1:3]]}")
else: else:
print(" No suitable models found") print(" No suitable models found")
@ -479,15 +479,15 @@ class ModelSelectionTester:
print(f" {connector_type}: {len(models)} models") print(f" {connector_type}: {len(models)} models")
for model in models: for model in models:
capabilities = getattr(model, 'capabilities', []) capabilities = getattr(model, 'capabilities', [])
print(f" - {model.name}: {capabilities}") print(f" - {model.displayName}: {capabilities}")
# Show operation type support # Show operation type support
print(f"\nOperation type support:") print(f"\nOperation type support:")
for op_type in OperationTypeEnum: for op_type in OperationTypeEnum:
supported_models = [m for m in availableModels if hasattr(m, 'operationTypes') and op_type in m.operationTypes] supported_models = [m for m in availableModels if hasattr(m, 'operationTypes') and any(ot.operationType == op_type for ot in m.operationTypes)]
print(f" {op_type.name}: {len(supported_models)} models") print(f" {op_type.name}: {len(supported_models)} models")
if supported_models: if supported_models:
model_names = [m.name for m in supported_models[:3]] # Show first 3 models model_names = [m.displayName for m in supported_models[:3]] # Show first 3 models
print(f" Models: {', '.join(model_names)}") print(f" Models: {', '.join(model_names)}")