Cleanup for pydantic v2: removed unnecessary pydantic-to-dict conversions; unnecessary unions replaced with clean classes

This commit is contained in:
ValueOn AG 2025-10-24 22:46:05 +02:00
parent 4f7bba5f33
commit 8523da7fe2
15 changed files with 188 additions and 319 deletions

View file

@ -50,7 +50,7 @@ class AiAnthropic(BaseConnectorAi):
connectorType="anthropic",
apiUrl="https://api.anthropic.com/v1/messages",
temperature=0.2,
maxTokens=200000,
maxTokens=8192,
contextLength=200000,
costPer1kTokensInput=0.015,
costPer1kTokensOutput=0.075,
@ -75,7 +75,7 @@ class AiAnthropic(BaseConnectorAi):
connectorType="anthropic",
apiUrl="https://api.anthropic.com/v1/messages",
temperature=0.2,
maxTokens=200000,
maxTokens=8192,
contextLength=200000,
costPer1kTokensInput=0.015,
costPer1kTokensOutput=0.075,

View file

@ -720,22 +720,9 @@ class DatabaseConnector:
logger.info(f"Initial ID {initialId} for table {table} registered")
return success
else:
# Check if the existing initial ID still exists in the table
existingInitialId = systemData[table]
records = self.getRecordset(
model_class, recordFilter={"id": existingInitialId}
)
if not records:
# The initial record no longer exists, update to the new one
systemData[table] = initialId
success = self._saveSystemTable(systemData)
if success:
logger.info(
f"Initial ID updated from {existingInitialId} to {initialId} for table {table}"
)
return success
else:
return True
# Table already has an initial ID registered
logger.debug(f"Table {table} already has initial ID {systemData[table]}")
return True
except Exception as e:
logger.error(f"Error registering the initial ID for table {table}: {e}")
return False

View file

@ -135,7 +135,6 @@ class AiCallOptions(BaseModel):
compressPrompt: bool = Field(default=True, description="Whether to compress the prompt")
compressContext: bool = Field(default=True, description="If False: process each chunk; If True: summarize and work on summary")
processDocumentsIndividually: bool = Field(default=True, description="If True, process each document separately; else pool docs")
maxContextBytes: Optional[int] = Field(default=None, description="Hard cap for extracted context size passed to the model")
maxCost: Optional[float] = Field(default=None, description="Max cost budget")
maxProcessingTime: Optional[int] = Field(default=None, description="Max processing time in seconds")
processingMode: ProcessingModeEnum = Field(default=ProcessingModeEnum.BASIC, description="Processing mode")
@ -145,7 +144,6 @@ class AiCallOptions(BaseModel):
# Model generation parameters
temperature: Optional[float] = Field(default=None, ge=0.0, le=2.0, description="Temperature for response generation (0.0-2.0, lower = more consistent)")
maxTokens: Optional[int] = Field(default=None, ge=1, le=32000, description="Maximum tokens in response")
maxParts: Optional[int] = Field(default=1000, ge=1, le=1000, description="Maximum number of continuation parts to fetch")
@ -170,32 +168,6 @@ class AiCallResponse(BaseModel):
errorCount: int = Field(default=0, description="0 for success, 1+ for errors")
class EnhancedAiCallOptions(AiCallOptions):
"""Enhanced options for improved document processing with chunk mapping."""
# Parallel processing
enableParallelProcessing: bool = Field(
default=True,
description="Enable parallel processing of chunks"
)
maxConcurrentChunks: int = Field(
default=5,
ge=1,
le=20,
description="Maximum number of chunks to process concurrently"
)
# Chunk mapping
preserveChunkMetadata: bool = Field(
default=True,
description="Preserve chunk metadata during processing"
)
chunkSeparator: str = Field(
default="\n\n---\n\n",
description="Separator between chunks in merged output"
)
class AiModelCall(BaseModel):
"""Standardized input for AI model calls."""

View file

@ -1,6 +1,9 @@
from typing import Any, Dict, List, Optional, Literal
from typing import Any, Dict, List, Optional, Literal, TYPE_CHECKING
from pydantic import BaseModel, Field
if TYPE_CHECKING:
from modules.datamodels.datamodelAi import OperationTypeEnum
class ContentPart(BaseModel):
id: str = Field(description="Unique content part identifier")
@ -40,106 +43,49 @@ class PartResult(BaseModel):
class MergeStrategy(BaseModel):
"""Strategy configuration for merging content parts and AI results."""
groupBy: str = Field(default="typeGroup", description="Field to group parts by (typeGroup, parentId, label, etc.)")
orderBy: str = Field(default="id", description="Field to order parts within groups (id, order, pageIndex, etc.)")
mergeType: Literal["concatenate", "hierarchical", "intelligent"] = Field(default="concatenate", description="How to merge content within groups")
maxSize: Optional[int] = Field(default=None, description="Maximum size for merged content in bytes")
textMerge: Optional[Dict[str, Any]] = Field(default=None, description="Text-specific merge settings (separator, formatting, etc.)")
tableMerge: Optional[Dict[str, Any]] = Field(default=None, description="Table-specific merge settings (header handling, etc.)")
structureMerge: Optional[Dict[str, Any]] = Field(default=None, description="Structure-specific merge settings (hierarchy, etc.)")
aiResultMerge: Optional[Dict[str, Any]] = Field(default=None, description="AI result merging settings (prompt, context, etc.)")
preserveChunks: bool = Field(default=False, description="Whether to preserve individual chunks or merge them")
chunkSeparator: str = Field(default="\n\n---\n\n", description="Separator between chunks when merging")
preserveMetadata: bool = Field(default=True, description="Whether to preserve metadata from original parts")
metadataFields: Optional[List[str]] = Field(default=None, description="Specific metadata fields to preserve (None = all)")
onError: Literal["skip", "include", "fail"] = Field(default="skip", description="How to handle errors during merging")
validateContent: bool = Field(default=True, description="Whether to validate content before merging")
useIntelligentMerging: bool = Field(default=False, description="Whether to use intelligent token-aware merging")
prompt: Optional[str] = Field(default=None, description="Prompt for intelligent merging")
capabilities: Optional[Dict[str, Any]] = Field(default=None, description="Model capabilities for intelligent merging")
# Grouping configuration
groupBy: str = Field(
default="typeGroup",
description="Field to group parts by (typeGroup, parentId, label, etc.)"
)
# Ordering configuration
orderBy: str = Field(
default="id",
description="Field to order parts within groups (id, order, pageIndex, etc.)"
)
class ExtractionOptions(BaseModel):
"""Options for document extraction and processing with clear data structures."""
# Merge behavior
mergeType: Literal["concatenate", "hierarchical", "intelligent"] = Field(
default="concatenate",
description="How to merge content within groups"
)
# Core extraction parameters
prompt: str = Field(description="Extraction prompt for AI processing")
operationType: 'OperationTypeEnum' = Field(description="Type of operation for AI processing")
processDocumentsIndividually: bool = Field(default=True, description="Process each document separately")
# Size limits
maxSize: Optional[int] = Field(
default=None,
description="Maximum size for merged content in bytes"
)
# Image processing parameters
imageMaxPixels: int = Field(default=1024 * 1024, ge=1, description="Maximum pixels for image processing")
imageQuality: int = Field(default=85, ge=1, le=100, description="Image quality (1-100)")
# Type-specific merge settings
textMerge: Optional[Dict[str, Any]] = Field(
default=None,
description="Text-specific merge settings (separator, formatting, etc.)"
)
# Merging strategy
mergeStrategy: MergeStrategy = Field(description="Strategy for merging extraction results")
tableMerge: Optional[Dict[str, Any]] = Field(
default=None,
description="Table-specific merge settings (header handling, etc.)"
)
# Optional chunking parameters (for backward compatibility)
chunkAllowed: Optional[bool] = Field(default=None, description="Whether chunking is allowed")
maxSize: Optional[int] = Field(default=None, description="Maximum size for processing")
textChunkSize: Optional[int] = Field(default=None, description="Size for text chunks")
imageChunkSize: Optional[int] = Field(default=None, description="Size for image chunks")
structureMerge: Optional[Dict[str, Any]] = Field(
default=None,
description="Structure-specific merge settings (hierarchy, etc.)"
)
# AI result merging
aiResultMerge: Optional[Dict[str, Any]] = Field(
default=None,
description="AI result merging settings (prompt, context, etc.)"
)
# Chunk handling
preserveChunks: bool = Field(
default=False,
description="Whether to preserve individual chunks or merge them"
)
chunkSeparator: str = Field(
default="\n\n---\n\n",
description="Separator between chunks when merging"
)
# Metadata handling
preserveMetadata: bool = Field(
default=True,
description="Whether to preserve metadata from original parts"
)
metadataFields: Optional[List[str]] = Field(
default=None,
description="Specific metadata fields to preserve (None = all)"
)
# Error handling
onError: Literal["skip", "include", "fail"] = Field(
default="skip",
description="How to handle errors during merging"
)
# Validation
validateContent: bool = Field(
default=True,
description="Whether to validate content before merging"
)
def getTypeSpecificSettings(self, typeGroup: str) -> Dict[str, Any]:
"""Get type-specific merge settings for a content type."""
if typeGroup == "text" and self.textMerge:
return self.textMerge
elif typeGroup == "table" and self.tableMerge:
return self.tableMerge
elif typeGroup == "structure" and self.structureMerge:
return self.structureMerge
else:
return {}
def shouldPreserveChunk(self, chunk: Dict[str, Any]) -> bool:
"""Determine if a chunk should be preserved based on strategy."""
if not self.preserveChunks:
return False
# Check if chunk has error metadata
if self.onError == "skip" and chunk.get("metadata", {}).get("error"):
return False
return True
# Additional processing options
enableParallelProcessing: bool = Field(default=True, description="Enable parallel processing of chunks")
maxConcurrentChunks: int = Field(default=5, ge=1, le=20, description="Maximum number of chunks to process concurrently")
class Config:
arbitraryTypesAllowed = True # Allow OperationTypeEnum import

View file

@ -127,7 +127,7 @@ class AiObjects:
logger.info(f"Attempting AI call with model: {model.name} (attempt {attempt + 1}/{len(failoverModelList)})")
# Call the model
response = await self._callWithModel(model, prompt, context)
response = await self._callWithModel(model, prompt, context, options)
logger.info(f"✅ AI call successful with model: {model.name}")
return response
@ -204,7 +204,7 @@ class AiObjects:
if partSize <= modelContextBytes:
# Part fits - call AI directly
response = await self._callWithModel(model, prompt, contentPart.data)
response = await self._callWithModel(model, prompt, contentPart.data, options)
logger.info(f"✅ Content part processed successfully with model: {model.name}")
return response
else:
@ -216,7 +216,7 @@ class AiObjects:
# Process each chunk
chunkResults = []
for chunk in chunks:
chunkResponse = await self._callWithModel(model, prompt, chunk['data'])
chunkResponse = await self._callWithModel(model, prompt, chunk['data'], options)
chunkResults.append(chunkResponse)
# Merge chunk results
@ -393,7 +393,7 @@ class AiObjects:
errorCount=1
)
async def _callWithModel(self, model: AiModel, prompt: str, context: str) -> AiCallResponse:
async def _callWithModel(self, model: AiModel, prompt: str, context: str, options: AiCallOptions = None) -> AiCallResponse:
"""Call a specific model and return the response."""
# Calculate input bytes from prompt and context
inputBytes = len((prompt + context).encode('utf-8'))
@ -430,7 +430,8 @@ class AiObjects:
# Create standardized call object
modelCall = AiModelCall(
messages=messages,
model=model
model=model,
options=options or {}
)
# Call the model with standardized interface

View file

@ -873,7 +873,7 @@ class ChatObjects:
stat = ChatStat(**statData)
# Create the stat record in the database
created = self.db.recordCreate(ChatStat, stat.model_dump())
created = self.db.recordCreate(ChatStat, stat)
# Return the created ChatStat
return ChatStat(**created)
@ -937,7 +937,7 @@ class ChatObjects:
items.append({
"type": "message",
"createdAt": msg_timestamp,
"item": chat_message.model_dump()
"item": chat_message
})
# Get logs
@ -952,7 +952,7 @@ class ChatObjects:
items.append({
"type": "log",
"createdAt": log_timestamp,
"item": chat_log.model_dump()
"item": chat_log
})
# Get stats list
@ -966,7 +966,7 @@ class ChatObjects:
items.append({
"type": "stat",
"createdAt": stat_timestamp,
"item": stat.model_dump()
"item": stat
})
# Sort all items by createdAt timestamp for chronological order

View file

@ -48,11 +48,8 @@ async def create_prompt(
"""Create a new prompt"""
managementInterface = interfaceDbComponentObjects.getInterface(currentUser)
# Convert Prompt to dict for interface
prompt_data = prompt.model_dump()
# Create prompt
newPrompt = managementInterface.createPrompt(prompt_data)
newPrompt = managementInterface.createPrompt(prompt)
return Prompt(**newPrompt)

View file

@ -92,11 +92,8 @@ async def create_user(
"""Create a new user"""
appInterface = interfaceDbAppObjects.getInterface(currentUser)
# Convert User to dict for interface
user_dict = user_data.model_dump()
# Create user
newUser = appInterface.createUser(user_dict)
newUser = appInterface.createUser(user_data)
return newUser
@ -119,11 +116,8 @@ async def update_user(
detail=f"User with ID {userId} not found"
)
# Convert User to dict for interface
update_data = userData.model_dump()
# Update user
updatedUser = appInterface.updateUser(userId, update_data)
updatedUser = appInterface.updateUser(userId, userData)
if not updatedUser:
raise HTTPException(

View file

@ -5,7 +5,7 @@ import time
from typing import Dict, Any, List, Optional, Tuple, Union
from modules.datamodels.datamodelChat import ChatDocument
from modules.datamodels.datamodelAi import AiCallRequest, AiCallOptions, OperationTypeEnum, PriorityEnum
from modules.datamodels.datamodelExtraction import ChunkResult, ContentExtracted, PartResult
from modules.datamodels.datamodelExtraction import ChunkResult, ContentExtracted, PartResult, ExtractionOptions, MergeStrategy
from modules.services.serviceExtraction.mainServiceExtraction import ExtractionService
logger = logging.getLogger(__name__)
@ -33,19 +33,6 @@ class SubDocumentProcessing:
self._extractionService = ExtractionService(self.services)
return self._extractionService
def _calculateMaxContextBytes(self, options: Optional[AiCallOptions]) -> int:
"""Calculate maximum context bytes based on model capabilities and options."""
if options and options.maxContextBytes:
return options.maxContextBytes
# Default model capabilities (this should be enhanced with actual model registry)
defaultMaxTokens = 4000
safetyMargin = options.safetyMargin if options else 0.1
# Calculate bytes (4 chars per token estimation)
maxContextBytes = int(defaultMaxTokens * (1 - safetyMargin) * 4)
return maxContextBytes
async def processDocumentsPerChunk(
self,
@ -68,22 +55,23 @@ class SubDocumentProcessing:
if not documents:
return ""
# Build extraction options WITHOUT chunking parameters
extractionOptions: Dict[str, Any] = {
"prompt": prompt,
"operationType": options.operationType if options else OperationTypeEnum.DATA_EXTRACT,
"processDocumentsIndividually": True,
# REMOVED: maxSize, textChunkSize, imageChunkSize
"mergeStrategy": {
"useIntelligentMerging": True,
"prompt": prompt,
"groupBy": "typeGroup",
"orderBy": "id",
"mergeType": "concatenate"
},
}
# Build extraction options using Pydantic model
mergeStrategy = MergeStrategy(
useIntelligentMerging=True,
prompt=prompt,
groupBy="typeGroup",
orderBy="id",
mergeType="concatenate"
)
logger.debug(f"Per-chunk extraction options: prompt length={len(extractionOptions.get('prompt', ''))} chars, operationType={extractionOptions.get('operationType')}")
extractionOptions = ExtractionOptions(
prompt=prompt,
operationType=options.operationType if options else OperationTypeEnum.DATA_EXTRACT,
processDocumentsIndividually=True,
mergeStrategy=mergeStrategy
)
logger.debug(f"Per-chunk extraction options: prompt length={len(extractionOptions.prompt)} chars, operationType={extractionOptions.operationType}")
try:
# Extract content WITHOUT chunking
@ -120,21 +108,23 @@ class SubDocumentProcessing:
if not documents:
return {"metadata": {"title": "Empty Document"}, "sections": []}
# Build extraction options WITHOUT chunking parameters
extractionOptions: Dict[str, Any] = {
"prompt": prompt,
"operationType": options.operationType if options else OperationTypeEnum.DATA_EXTRACT,
"processDocumentsIndividually": True,
"mergeStrategy": {
"useIntelligentMerging": True,
"prompt": prompt,
"groupBy": "typeGroup",
"orderBy": "id",
"mergeType": "concatenate"
},
}
# Build extraction options using Pydantic model
mergeStrategy = MergeStrategy(
useIntelligentMerging=True,
prompt=prompt,
groupBy="typeGroup",
orderBy="id",
mergeType="concatenate"
)
logger.debug(f"Per-chunk extraction options (JSON mode): prompt length={len(extractionOptions.get('prompt', ''))} chars, operationType={extractionOptions.get('operationType')}")
extractionOptions = ExtractionOptions(
prompt=prompt,
operationType=options.operationType if options else OperationTypeEnum.DATA_EXTRACT,
processDocumentsIndividually=True,
mergeStrategy=mergeStrategy
)
logger.debug(f"Per-chunk extraction options (JSON mode): prompt length={len(extractionOptions.prompt)} chars, operationType={extractionOptions.operationType}")
try:
# Extract content WITHOUT chunking
@ -205,31 +195,25 @@ class SubDocumentProcessing:
if not documents:
return {"metadata": {"title": "Empty Document"}, "sections": []}
# Get model capabilities for size calculation
model_capabilities = self._getModelCapabilitiesForContent(custom_prompt, documents, options)
# Build extraction options using Pydantic model (model-aware chunking in AI call phase)
mergeStrategy = MergeStrategy(
useIntelligentMerging=True,
prompt=custom_prompt,
groupBy="typeGroup",
orderBy="id",
mergeType="concatenate"
)
# Build extraction options for chunking with intelligent merging
extractionOptions: Dict[str, Any] = {
"prompt": custom_prompt, # Use the custom prompt instead of default
"operationType": options.operationType if options else OperationTypeEnum.DATA_EXTRACT,
"processDocumentsIndividually": True, # Process each document separately
"maxSize": model_capabilities["maxContextBytes"],
"chunkAllowed": True,
"textChunkSize": model_capabilities["textChunkSize"],
"imageChunkSize": model_capabilities["imageChunkSize"],
"imageMaxPixels": 1024 * 1024,
"imageQuality": 85,
"mergeStrategy": {
"useIntelligentMerging": True, # Enable intelligent token-aware merging
"capabilities": model_capabilities,
"prompt": custom_prompt, # Use the custom prompt
"groupBy": "typeGroup",
"orderBy": "id",
"mergeType": "concatenate"
},
}
extractionOptions = ExtractionOptions(
prompt=custom_prompt, # Use the custom prompt instead of default
operationType=options.operationType if options else OperationTypeEnum.DATA_EXTRACT,
processDocumentsIndividually=True, # Process each document separately
imageMaxPixels=1024 * 1024,
imageQuality=85,
mergeStrategy=mergeStrategy
)
logger.debug(f"Per-chunk extraction options (JSON mode): prompt length={len(extractionOptions.get('prompt', ''))} chars, operationType={extractionOptions.get('operationType')}")
logger.debug(f"Per-chunk extraction options (JSON mode): prompt length={len(extractionOptions.prompt)} chars, operationType={extractionOptions.operationType}")
try:
# Extract content with chunking
@ -1042,15 +1026,13 @@ CONTINUATION INSTRUCTIONS:
content_parts.append(content_part)
# Use existing merging strategy from options
merge_strategy = {
"useIntelligentMerging": True,
"groupBy": "documentId", # Group by document
"orderBy": "partIndex", # Order by part index
"mergeType": "concatenate"
}
merge_strategy = MergeStrategy(
useIntelligentMerging=True,
groupBy="documentId", # Group by document
orderBy="partIndex", # Order by part index
mergeType="concatenate"
)
if options and hasattr(options, 'mergeStrategy'):
merge_strategy.update(options.mergeStrategy)
# Apply existing merging logic using the sophisticated merging system
from modules.services.serviceExtraction.subPipeline import _applyMerging
@ -1095,15 +1077,13 @@ CONTINUATION INSTRUCTIONS:
content_parts.append(content_part)
# Use existing merging strategy for JSON mode
merge_strategy = {
"useIntelligentMerging": True,
"groupBy": "documentId", # Group by document
"orderBy": "partIndex", # Order by part index
"mergeType": "concatenate"
}
merge_strategy = MergeStrategy(
useIntelligentMerging=True,
groupBy="documentId", # Group by document
orderBy="partIndex", # Order by part index
mergeType="concatenate"
)
if options and hasattr(options, 'mergeStrategy'):
merge_strategy.update(options.mergeStrategy)
# Apply existing merging logic using the sophisticated merging system
from modules.services.serviceExtraction.subPipeline import _applyMerging
@ -1234,15 +1214,13 @@ CONTINUATION INSTRUCTIONS:
content_parts.append(content_part)
# Use existing merging strategy from options
merge_strategy = {
"useIntelligentMerging": True,
"groupBy": "documentId", # Group by document
"orderBy": "chunkIndex", # Order by chunk index
"mergeType": "concatenate"
}
merge_strategy = MergeStrategy(
useIntelligentMerging=True,
groupBy="documentId", # Group by document
orderBy="chunkIndex", # Order by chunk index
mergeType="concatenate"
)
if options and hasattr(options, 'mergeStrategy'):
merge_strategy.update(options.mergeStrategy)
# Apply existing merging logic using the sophisticated merging system
from modules.services.serviceExtraction.subPipeline import _applyMerging
@ -1297,15 +1275,13 @@ CONTINUATION INSTRUCTIONS:
content_parts.append(content_part)
# Use existing merging strategy for clean mode
merge_strategy = {
"useIntelligentMerging": True,
"groupBy": "documentId", # Group by document
"orderBy": "chunkIndex", # Order by chunk index
"mergeType": "concatenate"
}
merge_strategy = MergeStrategy(
useIntelligentMerging=True,
groupBy="documentId", # Group by document
orderBy="chunkIndex", # Order by chunk index
mergeType="concatenate"
)
if options and hasattr(options, 'mergeStrategy'):
merge_strategy.update(options.mergeStrategy)
# Apply existing merging logic using the sophisticated merging system
from modules.services.serviceExtraction.subPipeline import _applyMerging
@ -1351,15 +1327,13 @@ CONTINUATION INSTRUCTIONS:
content_parts.append(content_part)
# Use existing merging strategy for JSON mode
merge_strategy = {
"useIntelligentMerging": True,
"groupBy": "documentId", # Group by document
"orderBy": "chunkIndex", # Order by chunk index
"mergeType": "concatenate"
}
merge_strategy = MergeStrategy(
useIntelligentMerging=True,
groupBy="documentId", # Group by document
orderBy="chunkIndex", # Order by chunk index
mergeType="concatenate"
)
if options and hasattr(options, 'mergeStrategy'):
merge_strategy.update(options.mergeStrategy)
# Apply existing merging logic using the sophisticated merging system
from modules.services.serviceExtraction.subPipeline import _applyMerging
@ -1455,5 +1429,3 @@ CONTINUATION INSTRUCTIONS:
logger.info(f"Merged {len(chunkResults)} chunks using existing sophisticated merging system (JSON mode)")
return merged_document
# REMOVED: _getModelCapabilitiesForContent method - no longer needed with model-aware chunking

View file

@ -5,7 +5,7 @@ import time
from .subRegistry import ExtractorRegistry, ChunkerRegistry
from .subPipeline import runExtraction
from modules.datamodels.datamodelExtraction import ContentExtracted, ContentPart, MergeStrategy
from modules.datamodels.datamodelExtraction import ContentExtracted, ContentPart, MergeStrategy, ExtractionOptions
from modules.datamodels.datamodelChat import ChatDocument
from modules.datamodels.datamodelAi import AiCallResponse
from modules.aicore.aicoreModelRegistry import modelRegistry
@ -20,7 +20,7 @@ class ExtractionService:
self._extractorRegistry = ExtractorRegistry()
self._chunkerRegistry = ChunkerRegistry()
def extractContent(self, documents: List[ChatDocument], options: Dict[str, Any]) -> List[ContentExtracted]:
def extractContent(self, documents: List[ChatDocument], options: ExtractionOptions) -> List[ContentExtracted]:
"""
Extract content from a list of ChatDocument objects.
@ -31,6 +31,7 @@ class ExtractionService:
Returns:
List of ContentExtracted objects, one per input document
"""
results: List[ContentExtracted] = []
# Lazy import to avoid circular deps and heavy init at module import

View file

@ -1,9 +1,9 @@
from typing import Any, Dict, List
from modules.datamodels.datamodelExtraction import ContentPart
from modules.datamodels.datamodelExtraction import ContentPart, MergeStrategy
class DefaultMerger:
def merge(self, parts: List[ContentPart], strategy: Dict[str, Any]) -> List[ContentPart]:
def merge(self, parts: List[ContentPart], strategy: MergeStrategy) -> List[ContentPart]:
"""
Default merger that passes through parts unchanged.
Used for image, binary, metadata, container typeGroups.

View file

@ -1,10 +1,10 @@
from typing import Any, Dict, List
from modules.datamodels.datamodelExtraction import ContentPart
from modules.datamodels.datamodelExtraction import ContentPart, MergeStrategy
from ..subUtils import makeId
class TableMerger:
def merge(self, parts: List[ContentPart], strategy: Dict[str, Any]) -> List[ContentPart]:
def merge(self, parts: List[ContentPart], strategy: MergeStrategy) -> List[ContentPart]:
"""
Merge table parts based on strategy.
Strategy options:
@ -15,9 +15,9 @@ class TableMerger:
if not parts:
return parts
groupBy = strategy.get("groupBy", "parentId")
maxSize = strategy.get("maxSize", 0)
combineSheets = strategy.get("combineSheets", False)
groupBy = strategy.groupBy
maxSize = strategy.maxSize or 0
combineSheets = strategy.tableMerge.get("combineSheets", False) if strategy.tableMerge else False
# Group parts
groups = self._groupParts(parts, groupBy, combineSheets)

View file

@ -1,10 +1,10 @@
from typing import Any, Dict, List
from modules.datamodels.datamodelExtraction import ContentPart
from modules.datamodels.datamodelExtraction import ContentPart, MergeStrategy
from ..subUtils import makeId
class TextMerger:
def merge(self, parts: List[ContentPart], strategy: Dict[str, Any]) -> List[ContentPart]:
def merge(self, parts: List[ContentPart], strategy: MergeStrategy) -> List[ContentPart]:
"""
Merge text parts based on strategy.
Strategy options:
@ -15,9 +15,9 @@ class TextMerger:
if not parts:
return parts
groupBy = strategy.get("groupBy", "parentId")
orderBy = strategy.get("orderBy", "label")
maxSize = strategy.get("maxSize", 0)
groupBy = strategy.groupBy
orderBy = strategy.orderBy
maxSize = strategy.maxSize or 0
# Group parts
groups = self._groupParts(parts, groupBy)

View file

@ -1,8 +1,7 @@
from typing import Any, Dict, List
from typing import List
import logging
import os
from modules.datamodels.datamodelExtraction import ContentExtracted, ContentPart
from modules.datamodels.datamodelExtraction import ContentExtracted, ContentPart, ExtractionOptions, MergeStrategy
from .subUtils import makeId
from .subRegistry import ExtractorRegistry, ChunkerRegistry
from .merging.mergerText import TextMerger
@ -13,13 +12,13 @@ from .subMerger import IntelligentTokenAwareMerger
logger = logging.getLogger(__name__)
def _mergeParts(parts: List[ContentPart], mergeStrategy: Dict[str, Any]) -> List[ContentPart]:
def _mergeParts(parts: List[ContentPart], mergeStrategy: MergeStrategy) -> List[ContentPart]:
"""Merge parts based on the provided strategy."""
if not parts or not mergeStrategy:
return parts
groupBy = mergeStrategy.get("groupBy", "typeGroup")
orderBy = mergeStrategy.get("orderBy", "id")
groupBy = mergeStrategy.groupBy
orderBy = mergeStrategy.orderBy
# Group parts by the specified field
groups = {}
@ -56,7 +55,8 @@ def _mergeParts(parts: List[ContentPart], mergeStrategy: Dict[str, Any]) -> List
return merged_parts
def runExtraction(extractorRegistry: ExtractorRegistry, chunkerRegistry: ChunkerRegistry, documentBytes: bytes, fileName: str, mimeType: str, options: Dict[str, Any]) -> ContentExtracted:
def runExtraction(extractorRegistry: ExtractorRegistry, chunkerRegistry: ChunkerRegistry, documentBytes: bytes, fileName: str, mimeType: str, options: ExtractionOptions) -> ContentExtracted:
extractor = extractorRegistry.resolve(mimeType, fileName)
if extractor is None:
# fallback: single binary part
@ -71,15 +71,14 @@ def runExtraction(extractorRegistry: ExtractorRegistry, chunkerRegistry: Chunker
)
return ContentExtracted(id=makeId(), parts=[part])
parts = extractor.extract(documentBytes, {"fileName": fileName, "mimeType": mimeType, "options": options})
parts = extractor.extract(documentBytes, {"fileName": fileName, "mimeType": mimeType})
# REMOVED: poolAndLimit(parts, chunkerRegistry, options)
# REMOVED: Chunking logic - now handled in AI call phase
# Apply merging strategy if provided (preserve existing logic)
mergeStrategy = options.get("mergeStrategy", {})
if mergeStrategy:
parts = _applyMerging(parts, mergeStrategy)
if options.mergeStrategy:
parts = _applyMerging(parts, options.mergeStrategy)
return ContentExtracted(id=makeId(), parts=parts)
@ -87,17 +86,17 @@ def runExtraction(extractorRegistry: ExtractorRegistry, chunkerRegistry: Chunker
# REMOVED: poolAndLimit function - chunking now handled in AI call phase
def _applyMerging(parts: List[ContentPart], strategy: Dict[str, Any]) -> List[ContentPart]:
def _applyMerging(parts: List[ContentPart], strategy: MergeStrategy) -> List[ContentPart]:
"""Apply merging strategy to parts with intelligent token-aware merging."""
logger.debug(f"_applyMerging called with {len(parts)} parts")
# Check if intelligent merging is enabled
if strategy.get("useIntelligentMerging", False):
model_capabilities = strategy.get("capabilities", {})
if strategy.useIntelligentMerging:
model_capabilities = strategy.capabilities or {}
subMerger = IntelligentTokenAwareMerger(model_capabilities)
# Use intelligent merging for all parts
merged = subMerger.merge_chunks_intelligently(parts, strategy.get("prompt", ""))
merged = subMerger.merge_chunks_intelligently(parts, strategy.prompt or "")
# Calculate and log optimization stats
stats = subMerger.calculate_optimization_stats(parts, merged)

View file

@ -90,7 +90,7 @@ class ModelSelectionTester:
totalScore = sizeRating + processingModeRating + priorityRating
print(
f" {idx:>2}. {m.name} | Q={getattr(m, 'qualityRating', 0)} | S={getattr(m, 'speedRating', 0)} | ${costIn:.4f} | ctx={getattr(m, 'contextLength', 0)} | score={totalScore:.3f}"
f" {idx:>2}. {m.displayName} | Q={getattr(m, 'qualityRating', 0)} | S={getattr(m, 'speedRating', 0)} | ${costIn:.4f} | ctx={getattr(m, 'contextLength', 0)} | score={totalScore:.3f}"
)
print(f" Size: {sizeRating:.3f}, ProcessingMode: {processingModeRating:.3f}, Priority: {priorityRating:.3f}")
@ -136,7 +136,7 @@ class ModelSelectionTester:
totalScore = sizeRating + processingModeRating + priorityRating
print(
f" {idx:>2}. {m.name} | Q={getattr(m, 'qualityRating', 0)} | S={getattr(m, 'speedRating', 0)} | ${costIn:.4f} | ctx={getattr(m, 'contextLength', 0)} | score={totalScore:.3f}"
f" {idx:>2}. {m.displayName} | Q={getattr(m, 'qualityRating', 0)} | S={getattr(m, 'speedRating', 0)} | ${costIn:.4f} | ctx={getattr(m, 'contextLength', 0)} | score={totalScore:.3f}"
)
print(f" Size: {sizeRating:.3f}, ProcessingMode: {processingModeRating:.3f}, Priority: {priorityRating:.3f}")
@ -365,8 +365,8 @@ class ModelSelectionTester:
)
if failoverModelList:
print(f" Selected model: {failoverModelList[0].name}")
print(f" Fallback models: {[m.name for m in failoverModelList[1:3]]}")
print(f" Selected model: {failoverModelList[0].displayName}")
print(f" Fallback models: {[m.displayName for m in failoverModelList[1:3]]}")
else:
print(" No suitable models found")
@ -393,8 +393,8 @@ class ModelSelectionTester:
)
if failoverModelList:
print(f" Selected model: {failoverModelList[0].name}")
print(f" Fallback models: {[m.name for m in failoverModelList[1:3]]}")
print(f" Selected model: {failoverModelList[0].displayName}")
print(f" Fallback models: {[m.displayName for m in failoverModelList[1:3]]}")
else:
print(" No suitable models found")
@ -421,8 +421,8 @@ class ModelSelectionTester:
)
if failoverModelList:
print(f" Selected model: {failoverModelList[0].name}")
print(f" Fallback models: {[m.name for m in failoverModelList[1:3]]}")
print(f" Selected model: {failoverModelList[0].displayName}")
print(f" Fallback models: {[m.displayName for m in failoverModelList[1:3]]}")
else:
print(" No suitable models found")
@ -449,8 +449,8 @@ class ModelSelectionTester:
)
if failoverModelList:
print(f" Selected model: {failoverModelList[0].name}")
print(f" Fallback models: {[m.name for m in failoverModelList[1:3]]}")
print(f" Selected model: {failoverModelList[0].displayName}")
print(f" Fallback models: {[m.displayName for m in failoverModelList[1:3]]}")
else:
print(" No suitable models found")
@ -479,15 +479,15 @@ class ModelSelectionTester:
print(f" {connector_type}: {len(models)} models")
for model in models:
capabilities = getattr(model, 'capabilities', [])
print(f" - {model.name}: {capabilities}")
print(f" - {model.displayName}: {capabilities}")
# Show operation type support
print(f"\nOperation type support:")
for op_type in OperationTypeEnum:
supported_models = [m for m in availableModels if hasattr(m, 'operationTypes') and op_type in m.operationTypes]
supported_models = [m for m in availableModels if hasattr(m, 'operationTypes') and any(ot.operationType == op_type for ot in m.operationTypes)]
print(f" {op_type.name}: {len(supported_models)} models")
if supported_models:
model_names = [m.name for m in supported_models[:3]] # Show first 3 models
model_names = [m.displayName for m in supported_models[:3]] # Show first 3 models
print(f" Models: {', '.join(model_names)}")