cleanup pydantic v2, unnecessary pdantic to dict convesrions, unnecessary unions removed with clean classes
This commit is contained in:
parent
4f7bba5f33
commit
8523da7fe2
15 changed files with 188 additions and 319 deletions
|
|
@ -50,7 +50,7 @@ class AiAnthropic(BaseConnectorAi):
|
|||
connectorType="anthropic",
|
||||
apiUrl="https://api.anthropic.com/v1/messages",
|
||||
temperature=0.2,
|
||||
maxTokens=200000,
|
||||
maxTokens=8192,
|
||||
contextLength=200000,
|
||||
costPer1kTokensInput=0.015,
|
||||
costPer1kTokensOutput=0.075,
|
||||
|
|
@ -75,7 +75,7 @@ class AiAnthropic(BaseConnectorAi):
|
|||
connectorType="anthropic",
|
||||
apiUrl="https://api.anthropic.com/v1/messages",
|
||||
temperature=0.2,
|
||||
maxTokens=200000,
|
||||
maxTokens=8192,
|
||||
contextLength=200000,
|
||||
costPer1kTokensInput=0.015,
|
||||
costPer1kTokensOutput=0.075,
|
||||
|
|
|
|||
|
|
@ -720,22 +720,9 @@ class DatabaseConnector:
|
|||
logger.info(f"Initial ID {initialId} for table {table} registered")
|
||||
return success
|
||||
else:
|
||||
# Check if the existing initial ID still exists in the table
|
||||
existingInitialId = systemData[table]
|
||||
records = self.getRecordset(
|
||||
model_class, recordFilter={"id": existingInitialId}
|
||||
)
|
||||
if not records:
|
||||
# The initial record no longer exists, update to the new one
|
||||
systemData[table] = initialId
|
||||
success = self._saveSystemTable(systemData)
|
||||
if success:
|
||||
logger.info(
|
||||
f"Initial ID updated from {existingInitialId} to {initialId} for table {table}"
|
||||
)
|
||||
return success
|
||||
else:
|
||||
return True
|
||||
# Table already has an initial ID registered
|
||||
logger.debug(f"Table {table} already has initial ID {systemData[table]}")
|
||||
return True
|
||||
except Exception as e:
|
||||
logger.error(f"Error registering the initial ID for table {table}: {e}")
|
||||
return False
|
||||
|
|
|
|||
|
|
@ -135,7 +135,6 @@ class AiCallOptions(BaseModel):
|
|||
compressPrompt: bool = Field(default=True, description="Whether to compress the prompt")
|
||||
compressContext: bool = Field(default=True, description="If False: process each chunk; If True: summarize and work on summary")
|
||||
processDocumentsIndividually: bool = Field(default=True, description="If True, process each document separately; else pool docs")
|
||||
maxContextBytes: Optional[int] = Field(default=None, description="Hard cap for extracted context size passed to the model")
|
||||
maxCost: Optional[float] = Field(default=None, description="Max cost budget")
|
||||
maxProcessingTime: Optional[int] = Field(default=None, description="Max processing time in seconds")
|
||||
processingMode: ProcessingModeEnum = Field(default=ProcessingModeEnum.BASIC, description="Processing mode")
|
||||
|
|
@ -145,7 +144,6 @@ class AiCallOptions(BaseModel):
|
|||
|
||||
# Model generation parameters
|
||||
temperature: Optional[float] = Field(default=None, ge=0.0, le=2.0, description="Temperature for response generation (0.0-2.0, lower = more consistent)")
|
||||
maxTokens: Optional[int] = Field(default=None, ge=1, le=32000, description="Maximum tokens in response")
|
||||
maxParts: Optional[int] = Field(default=1000, ge=1, le=1000, description="Maximum number of continuation parts to fetch")
|
||||
|
||||
|
||||
|
|
@ -170,32 +168,6 @@ class AiCallResponse(BaseModel):
|
|||
errorCount: int = Field(default=0, description="0 for success, 1+ for errors")
|
||||
|
||||
|
||||
class EnhancedAiCallOptions(AiCallOptions):
|
||||
"""Enhanced options for improved document processing with chunk mapping."""
|
||||
|
||||
# Parallel processing
|
||||
enableParallelProcessing: bool = Field(
|
||||
default=True,
|
||||
description="Enable parallel processing of chunks"
|
||||
)
|
||||
maxConcurrentChunks: int = Field(
|
||||
default=5,
|
||||
ge=1,
|
||||
le=20,
|
||||
description="Maximum number of chunks to process concurrently"
|
||||
)
|
||||
|
||||
# Chunk mapping
|
||||
preserveChunkMetadata: bool = Field(
|
||||
default=True,
|
||||
description="Preserve chunk metadata during processing"
|
||||
)
|
||||
chunkSeparator: str = Field(
|
||||
default="\n\n---\n\n",
|
||||
description="Separator between chunks in merged output"
|
||||
)
|
||||
|
||||
|
||||
class AiModelCall(BaseModel):
|
||||
"""Standardized input for AI model calls."""
|
||||
|
||||
|
|
|
|||
|
|
@ -1,6 +1,9 @@
|
|||
from typing import Any, Dict, List, Optional, Literal
|
||||
from typing import Any, Dict, List, Optional, Literal, TYPE_CHECKING
|
||||
from pydantic import BaseModel, Field
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from modules.datamodels.datamodelAi import OperationTypeEnum
|
||||
|
||||
|
||||
class ContentPart(BaseModel):
|
||||
id: str = Field(description="Unique content part identifier")
|
||||
|
|
@ -40,106 +43,49 @@ class PartResult(BaseModel):
|
|||
|
||||
class MergeStrategy(BaseModel):
|
||||
"""Strategy configuration for merging content parts and AI results."""
|
||||
groupBy: str = Field(default="typeGroup", description="Field to group parts by (typeGroup, parentId, label, etc.)")
|
||||
orderBy: str = Field(default="id", description="Field to order parts within groups (id, order, pageIndex, etc.)")
|
||||
mergeType: Literal["concatenate", "hierarchical", "intelligent"] = Field(default="concatenate", description="How to merge content within groups")
|
||||
maxSize: Optional[int] = Field(default=None, description="Maximum size for merged content in bytes")
|
||||
textMerge: Optional[Dict[str, Any]] = Field(default=None, description="Text-specific merge settings (separator, formatting, etc.)")
|
||||
tableMerge: Optional[Dict[str, Any]] = Field(default=None, description="Table-specific merge settings (header handling, etc.)")
|
||||
structureMerge: Optional[Dict[str, Any]] = Field(default=None, description="Structure-specific merge settings (hierarchy, etc.)")
|
||||
aiResultMerge: Optional[Dict[str, Any]] = Field(default=None, description="AI result merging settings (prompt, context, etc.)")
|
||||
preserveChunks: bool = Field(default=False, description="Whether to preserve individual chunks or merge them")
|
||||
chunkSeparator: str = Field(default="\n\n---\n\n", description="Separator between chunks when merging")
|
||||
preserveMetadata: bool = Field(default=True, description="Whether to preserve metadata from original parts")
|
||||
metadataFields: Optional[List[str]] = Field(default=None, description="Specific metadata fields to preserve (None = all)")
|
||||
onError: Literal["skip", "include", "fail"] = Field(default="skip", description="How to handle errors during merging")
|
||||
validateContent: bool = Field(default=True, description="Whether to validate content before merging")
|
||||
useIntelligentMerging: bool = Field(default=False, description="Whether to use intelligent token-aware merging")
|
||||
prompt: Optional[str] = Field(default=None, description="Prompt for intelligent merging")
|
||||
capabilities: Optional[Dict[str, Any]] = Field(default=None, description="Model capabilities for intelligent merging")
|
||||
|
||||
# Grouping configuration
|
||||
groupBy: str = Field(
|
||||
default="typeGroup",
|
||||
description="Field to group parts by (typeGroup, parentId, label, etc.)"
|
||||
)
|
||||
|
||||
# Ordering configuration
|
||||
orderBy: str = Field(
|
||||
default="id",
|
||||
description="Field to order parts within groups (id, order, pageIndex, etc.)"
|
||||
)
|
||||
class ExtractionOptions(BaseModel):
|
||||
"""Options for document extraction and processing with clear data structures."""
|
||||
|
||||
# Merge behavior
|
||||
mergeType: Literal["concatenate", "hierarchical", "intelligent"] = Field(
|
||||
default="concatenate",
|
||||
description="How to merge content within groups"
|
||||
)
|
||||
# Core extraction parameters
|
||||
prompt: str = Field(description="Extraction prompt for AI processing")
|
||||
operationType: 'OperationTypeEnum' = Field(description="Type of operation for AI processing")
|
||||
processDocumentsIndividually: bool = Field(default=True, description="Process each document separately")
|
||||
|
||||
# Size limits
|
||||
maxSize: Optional[int] = Field(
|
||||
default=None,
|
||||
description="Maximum size for merged content in bytes"
|
||||
)
|
||||
# Image processing parameters
|
||||
imageMaxPixels: int = Field(default=1024 * 1024, ge=1, description="Maximum pixels for image processing")
|
||||
imageQuality: int = Field(default=85, ge=1, le=100, description="Image quality (1-100)")
|
||||
|
||||
# Type-specific merge settings
|
||||
textMerge: Optional[Dict[str, Any]] = Field(
|
||||
default=None,
|
||||
description="Text-specific merge settings (separator, formatting, etc.)"
|
||||
)
|
||||
# Merging strategy
|
||||
mergeStrategy: MergeStrategy = Field(description="Strategy for merging extraction results")
|
||||
|
||||
tableMerge: Optional[Dict[str, Any]] = Field(
|
||||
default=None,
|
||||
description="Table-specific merge settings (header handling, etc.)"
|
||||
)
|
||||
# Optional chunking parameters (for backward compatibility)
|
||||
chunkAllowed: Optional[bool] = Field(default=None, description="Whether chunking is allowed")
|
||||
maxSize: Optional[int] = Field(default=None, description="Maximum size for processing")
|
||||
textChunkSize: Optional[int] = Field(default=None, description="Size for text chunks")
|
||||
imageChunkSize: Optional[int] = Field(default=None, description="Size for image chunks")
|
||||
|
||||
structureMerge: Optional[Dict[str, Any]] = Field(
|
||||
default=None,
|
||||
description="Structure-specific merge settings (hierarchy, etc.)"
|
||||
)
|
||||
|
||||
# AI result merging
|
||||
aiResultMerge: Optional[Dict[str, Any]] = Field(
|
||||
default=None,
|
||||
description="AI result merging settings (prompt, context, etc.)"
|
||||
)
|
||||
|
||||
# Chunk handling
|
||||
preserveChunks: bool = Field(
|
||||
default=False,
|
||||
description="Whether to preserve individual chunks or merge them"
|
||||
)
|
||||
|
||||
chunkSeparator: str = Field(
|
||||
default="\n\n---\n\n",
|
||||
description="Separator between chunks when merging"
|
||||
)
|
||||
|
||||
# Metadata handling
|
||||
preserveMetadata: bool = Field(
|
||||
default=True,
|
||||
description="Whether to preserve metadata from original parts"
|
||||
)
|
||||
|
||||
metadataFields: Optional[List[str]] = Field(
|
||||
default=None,
|
||||
description="Specific metadata fields to preserve (None = all)"
|
||||
)
|
||||
|
||||
# Error handling
|
||||
onError: Literal["skip", "include", "fail"] = Field(
|
||||
default="skip",
|
||||
description="How to handle errors during merging"
|
||||
)
|
||||
|
||||
# Validation
|
||||
validateContent: bool = Field(
|
||||
default=True,
|
||||
description="Whether to validate content before merging"
|
||||
)
|
||||
|
||||
def getTypeSpecificSettings(self, typeGroup: str) -> Dict[str, Any]:
|
||||
"""Get type-specific merge settings for a content type."""
|
||||
if typeGroup == "text" and self.textMerge:
|
||||
return self.textMerge
|
||||
elif typeGroup == "table" and self.tableMerge:
|
||||
return self.tableMerge
|
||||
elif typeGroup == "structure" and self.structureMerge:
|
||||
return self.structureMerge
|
||||
else:
|
||||
return {}
|
||||
|
||||
def shouldPreserveChunk(self, chunk: Dict[str, Any]) -> bool:
|
||||
"""Determine if a chunk should be preserved based on strategy."""
|
||||
if not self.preserveChunks:
|
||||
return False
|
||||
|
||||
# Check if chunk has error metadata
|
||||
if self.onError == "skip" and chunk.get("metadata", {}).get("error"):
|
||||
return False
|
||||
|
||||
return True
|
||||
# Additional processing options
|
||||
enableParallelProcessing: bool = Field(default=True, description="Enable parallel processing of chunks")
|
||||
maxConcurrentChunks: int = Field(default=5, ge=1, le=20, description="Maximum number of chunks to process concurrently")
|
||||
|
||||
class Config:
|
||||
arbitraryTypesAllowed = True # Allow OperationTypeEnum import
|
||||
|
|
|
|||
|
|
@ -127,7 +127,7 @@ class AiObjects:
|
|||
logger.info(f"Attempting AI call with model: {model.name} (attempt {attempt + 1}/{len(failoverModelList)})")
|
||||
|
||||
# Call the model
|
||||
response = await self._callWithModel(model, prompt, context)
|
||||
response = await self._callWithModel(model, prompt, context, options)
|
||||
|
||||
logger.info(f"✅ AI call successful with model: {model.name}")
|
||||
return response
|
||||
|
|
@ -204,7 +204,7 @@ class AiObjects:
|
|||
|
||||
if partSize <= modelContextBytes:
|
||||
# Part fits - call AI directly
|
||||
response = await self._callWithModel(model, prompt, contentPart.data)
|
||||
response = await self._callWithModel(model, prompt, contentPart.data, options)
|
||||
logger.info(f"✅ Content part processed successfully with model: {model.name}")
|
||||
return response
|
||||
else:
|
||||
|
|
@ -216,7 +216,7 @@ class AiObjects:
|
|||
# Process each chunk
|
||||
chunkResults = []
|
||||
for chunk in chunks:
|
||||
chunkResponse = await self._callWithModel(model, prompt, chunk['data'])
|
||||
chunkResponse = await self._callWithModel(model, prompt, chunk['data'], options)
|
||||
chunkResults.append(chunkResponse)
|
||||
|
||||
# Merge chunk results
|
||||
|
|
@ -393,7 +393,7 @@ class AiObjects:
|
|||
errorCount=1
|
||||
)
|
||||
|
||||
async def _callWithModel(self, model: AiModel, prompt: str, context: str) -> AiCallResponse:
|
||||
async def _callWithModel(self, model: AiModel, prompt: str, context: str, options: AiCallOptions = None) -> AiCallResponse:
|
||||
"""Call a specific model and return the response."""
|
||||
# Calculate input bytes from prompt and context
|
||||
inputBytes = len((prompt + context).encode('utf-8'))
|
||||
|
|
@ -430,7 +430,8 @@ class AiObjects:
|
|||
# Create standardized call object
|
||||
modelCall = AiModelCall(
|
||||
messages=messages,
|
||||
model=model
|
||||
model=model,
|
||||
options=options or {}
|
||||
)
|
||||
|
||||
# Call the model with standardized interface
|
||||
|
|
|
|||
|
|
@ -873,7 +873,7 @@ class ChatObjects:
|
|||
stat = ChatStat(**statData)
|
||||
|
||||
# Create the stat record in the database
|
||||
created = self.db.recordCreate(ChatStat, stat.model_dump())
|
||||
created = self.db.recordCreate(ChatStat, stat)
|
||||
|
||||
# Return the created ChatStat
|
||||
return ChatStat(**created)
|
||||
|
|
@ -937,7 +937,7 @@ class ChatObjects:
|
|||
items.append({
|
||||
"type": "message",
|
||||
"createdAt": msg_timestamp,
|
||||
"item": chat_message.model_dump()
|
||||
"item": chat_message
|
||||
})
|
||||
|
||||
# Get logs
|
||||
|
|
@ -952,7 +952,7 @@ class ChatObjects:
|
|||
items.append({
|
||||
"type": "log",
|
||||
"createdAt": log_timestamp,
|
||||
"item": chat_log.model_dump()
|
||||
"item": chat_log
|
||||
})
|
||||
|
||||
# Get stats list
|
||||
|
|
@ -966,7 +966,7 @@ class ChatObjects:
|
|||
items.append({
|
||||
"type": "stat",
|
||||
"createdAt": stat_timestamp,
|
||||
"item": stat.model_dump()
|
||||
"item": stat
|
||||
})
|
||||
|
||||
# Sort all items by createdAt timestamp for chronological order
|
||||
|
|
|
|||
|
|
@ -48,11 +48,8 @@ async def create_prompt(
|
|||
"""Create a new prompt"""
|
||||
managementInterface = interfaceDbComponentObjects.getInterface(currentUser)
|
||||
|
||||
# Convert Prompt to dict for interface
|
||||
prompt_data = prompt.model_dump()
|
||||
|
||||
# Create prompt
|
||||
newPrompt = managementInterface.createPrompt(prompt_data)
|
||||
newPrompt = managementInterface.createPrompt(prompt)
|
||||
|
||||
return Prompt(**newPrompt)
|
||||
|
||||
|
|
|
|||
|
|
@ -92,11 +92,8 @@ async def create_user(
|
|||
"""Create a new user"""
|
||||
appInterface = interfaceDbAppObjects.getInterface(currentUser)
|
||||
|
||||
# Convert User to dict for interface
|
||||
user_dict = user_data.model_dump()
|
||||
|
||||
# Create user
|
||||
newUser = appInterface.createUser(user_dict)
|
||||
newUser = appInterface.createUser(user_data)
|
||||
|
||||
return newUser
|
||||
|
||||
|
|
@ -119,11 +116,8 @@ async def update_user(
|
|||
detail=f"User with ID {userId} not found"
|
||||
)
|
||||
|
||||
# Convert User to dict for interface
|
||||
update_data = userData.model_dump()
|
||||
|
||||
# Update user
|
||||
updatedUser = appInterface.updateUser(userId, update_data)
|
||||
updatedUser = appInterface.updateUser(userId, userData)
|
||||
|
||||
if not updatedUser:
|
||||
raise HTTPException(
|
||||
|
|
|
|||
|
|
@ -5,7 +5,7 @@ import time
|
|||
from typing import Dict, Any, List, Optional, Tuple, Union
|
||||
from modules.datamodels.datamodelChat import ChatDocument
|
||||
from modules.datamodels.datamodelAi import AiCallRequest, AiCallOptions, OperationTypeEnum, PriorityEnum
|
||||
from modules.datamodels.datamodelExtraction import ChunkResult, ContentExtracted, PartResult
|
||||
from modules.datamodels.datamodelExtraction import ChunkResult, ContentExtracted, PartResult, ExtractionOptions, MergeStrategy
|
||||
from modules.services.serviceExtraction.mainServiceExtraction import ExtractionService
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
|
@ -33,19 +33,6 @@ class SubDocumentProcessing:
|
|||
self._extractionService = ExtractionService(self.services)
|
||||
return self._extractionService
|
||||
|
||||
def _calculateMaxContextBytes(self, options: Optional[AiCallOptions]) -> int:
|
||||
"""Calculate maximum context bytes based on model capabilities and options."""
|
||||
if options and options.maxContextBytes:
|
||||
return options.maxContextBytes
|
||||
|
||||
# Default model capabilities (this should be enhanced with actual model registry)
|
||||
defaultMaxTokens = 4000
|
||||
safetyMargin = options.safetyMargin if options else 0.1
|
||||
|
||||
# Calculate bytes (4 chars per token estimation)
|
||||
maxContextBytes = int(defaultMaxTokens * (1 - safetyMargin) * 4)
|
||||
|
||||
return maxContextBytes
|
||||
|
||||
async def processDocumentsPerChunk(
|
||||
self,
|
||||
|
|
@ -68,22 +55,23 @@ class SubDocumentProcessing:
|
|||
if not documents:
|
||||
return ""
|
||||
|
||||
# Build extraction options WITHOUT chunking parameters
|
||||
extractionOptions: Dict[str, Any] = {
|
||||
"prompt": prompt,
|
||||
"operationType": options.operationType if options else OperationTypeEnum.DATA_EXTRACT,
|
||||
"processDocumentsIndividually": True,
|
||||
# REMOVED: maxSize, textChunkSize, imageChunkSize
|
||||
"mergeStrategy": {
|
||||
"useIntelligentMerging": True,
|
||||
"prompt": prompt,
|
||||
"groupBy": "typeGroup",
|
||||
"orderBy": "id",
|
||||
"mergeType": "concatenate"
|
||||
},
|
||||
}
|
||||
# Build extraction options using Pydantic model
|
||||
mergeStrategy = MergeStrategy(
|
||||
useIntelligentMerging=True,
|
||||
prompt=prompt,
|
||||
groupBy="typeGroup",
|
||||
orderBy="id",
|
||||
mergeType="concatenate"
|
||||
)
|
||||
|
||||
logger.debug(f"Per-chunk extraction options: prompt length={len(extractionOptions.get('prompt', ''))} chars, operationType={extractionOptions.get('operationType')}")
|
||||
extractionOptions = ExtractionOptions(
|
||||
prompt=prompt,
|
||||
operationType=options.operationType if options else OperationTypeEnum.DATA_EXTRACT,
|
||||
processDocumentsIndividually=True,
|
||||
mergeStrategy=mergeStrategy
|
||||
)
|
||||
|
||||
logger.debug(f"Per-chunk extraction options: prompt length={len(extractionOptions.prompt)} chars, operationType={extractionOptions.operationType}")
|
||||
|
||||
try:
|
||||
# Extract content WITHOUT chunking
|
||||
|
|
@ -120,21 +108,23 @@ class SubDocumentProcessing:
|
|||
if not documents:
|
||||
return {"metadata": {"title": "Empty Document"}, "sections": []}
|
||||
|
||||
# Build extraction options WITHOUT chunking parameters
|
||||
extractionOptions: Dict[str, Any] = {
|
||||
"prompt": prompt,
|
||||
"operationType": options.operationType if options else OperationTypeEnum.DATA_EXTRACT,
|
||||
"processDocumentsIndividually": True,
|
||||
"mergeStrategy": {
|
||||
"useIntelligentMerging": True,
|
||||
"prompt": prompt,
|
||||
"groupBy": "typeGroup",
|
||||
"orderBy": "id",
|
||||
"mergeType": "concatenate"
|
||||
},
|
||||
}
|
||||
# Build extraction options using Pydantic model
|
||||
mergeStrategy = MergeStrategy(
|
||||
useIntelligentMerging=True,
|
||||
prompt=prompt,
|
||||
groupBy="typeGroup",
|
||||
orderBy="id",
|
||||
mergeType="concatenate"
|
||||
)
|
||||
|
||||
logger.debug(f"Per-chunk extraction options (JSON mode): prompt length={len(extractionOptions.get('prompt', ''))} chars, operationType={extractionOptions.get('operationType')}")
|
||||
extractionOptions = ExtractionOptions(
|
||||
prompt=prompt,
|
||||
operationType=options.operationType if options else OperationTypeEnum.DATA_EXTRACT,
|
||||
processDocumentsIndividually=True,
|
||||
mergeStrategy=mergeStrategy
|
||||
)
|
||||
|
||||
logger.debug(f"Per-chunk extraction options (JSON mode): prompt length={len(extractionOptions.prompt)} chars, operationType={extractionOptions.operationType}")
|
||||
|
||||
try:
|
||||
# Extract content WITHOUT chunking
|
||||
|
|
@ -205,31 +195,25 @@ class SubDocumentProcessing:
|
|||
if not documents:
|
||||
return {"metadata": {"title": "Empty Document"}, "sections": []}
|
||||
|
||||
# Get model capabilities for size calculation
|
||||
model_capabilities = self._getModelCapabilitiesForContent(custom_prompt, documents, options)
|
||||
# Build extraction options using Pydantic model (model-aware chunking in AI call phase)
|
||||
mergeStrategy = MergeStrategy(
|
||||
useIntelligentMerging=True,
|
||||
prompt=custom_prompt,
|
||||
groupBy="typeGroup",
|
||||
orderBy="id",
|
||||
mergeType="concatenate"
|
||||
)
|
||||
|
||||
# Build extraction options for chunking with intelligent merging
|
||||
extractionOptions: Dict[str, Any] = {
|
||||
"prompt": custom_prompt, # Use the custom prompt instead of default
|
||||
"operationType": options.operationType if options else OperationTypeEnum.DATA_EXTRACT,
|
||||
"processDocumentsIndividually": True, # Process each document separately
|
||||
"maxSize": model_capabilities["maxContextBytes"],
|
||||
"chunkAllowed": True,
|
||||
"textChunkSize": model_capabilities["textChunkSize"],
|
||||
"imageChunkSize": model_capabilities["imageChunkSize"],
|
||||
"imageMaxPixels": 1024 * 1024,
|
||||
"imageQuality": 85,
|
||||
"mergeStrategy": {
|
||||
"useIntelligentMerging": True, # Enable intelligent token-aware merging
|
||||
"capabilities": model_capabilities,
|
||||
"prompt": custom_prompt, # Use the custom prompt
|
||||
"groupBy": "typeGroup",
|
||||
"orderBy": "id",
|
||||
"mergeType": "concatenate"
|
||||
},
|
||||
}
|
||||
extractionOptions = ExtractionOptions(
|
||||
prompt=custom_prompt, # Use the custom prompt instead of default
|
||||
operationType=options.operationType if options else OperationTypeEnum.DATA_EXTRACT,
|
||||
processDocumentsIndividually=True, # Process each document separately
|
||||
imageMaxPixels=1024 * 1024,
|
||||
imageQuality=85,
|
||||
mergeStrategy=mergeStrategy
|
||||
)
|
||||
|
||||
logger.debug(f"Per-chunk extraction options (JSON mode): prompt length={len(extractionOptions.get('prompt', ''))} chars, operationType={extractionOptions.get('operationType')}")
|
||||
logger.debug(f"Per-chunk extraction options (JSON mode): prompt length={len(extractionOptions.prompt)} chars, operationType={extractionOptions.operationType}")
|
||||
|
||||
try:
|
||||
# Extract content with chunking
|
||||
|
|
@ -1042,15 +1026,13 @@ CONTINUATION INSTRUCTIONS:
|
|||
content_parts.append(content_part)
|
||||
|
||||
# Use existing merging strategy from options
|
||||
merge_strategy = {
|
||||
"useIntelligentMerging": True,
|
||||
"groupBy": "documentId", # Group by document
|
||||
"orderBy": "partIndex", # Order by part index
|
||||
"mergeType": "concatenate"
|
||||
}
|
||||
merge_strategy = MergeStrategy(
|
||||
useIntelligentMerging=True,
|
||||
groupBy="documentId", # Group by document
|
||||
orderBy="partIndex", # Order by part index
|
||||
mergeType="concatenate"
|
||||
)
|
||||
|
||||
if options and hasattr(options, 'mergeStrategy'):
|
||||
merge_strategy.update(options.mergeStrategy)
|
||||
|
||||
# Apply existing merging logic using the sophisticated merging system
|
||||
from modules.services.serviceExtraction.subPipeline import _applyMerging
|
||||
|
|
@ -1095,15 +1077,13 @@ CONTINUATION INSTRUCTIONS:
|
|||
content_parts.append(content_part)
|
||||
|
||||
# Use existing merging strategy for JSON mode
|
||||
merge_strategy = {
|
||||
"useIntelligentMerging": True,
|
||||
"groupBy": "documentId", # Group by document
|
||||
"orderBy": "partIndex", # Order by part index
|
||||
"mergeType": "concatenate"
|
||||
}
|
||||
merge_strategy = MergeStrategy(
|
||||
useIntelligentMerging=True,
|
||||
groupBy="documentId", # Group by document
|
||||
orderBy="partIndex", # Order by part index
|
||||
mergeType="concatenate"
|
||||
)
|
||||
|
||||
if options and hasattr(options, 'mergeStrategy'):
|
||||
merge_strategy.update(options.mergeStrategy)
|
||||
|
||||
# Apply existing merging logic using the sophisticated merging system
|
||||
from modules.services.serviceExtraction.subPipeline import _applyMerging
|
||||
|
|
@ -1234,15 +1214,13 @@ CONTINUATION INSTRUCTIONS:
|
|||
content_parts.append(content_part)
|
||||
|
||||
# Use existing merging strategy from options
|
||||
merge_strategy = {
|
||||
"useIntelligentMerging": True,
|
||||
"groupBy": "documentId", # Group by document
|
||||
"orderBy": "chunkIndex", # Order by chunk index
|
||||
"mergeType": "concatenate"
|
||||
}
|
||||
merge_strategy = MergeStrategy(
|
||||
useIntelligentMerging=True,
|
||||
groupBy="documentId", # Group by document
|
||||
orderBy="chunkIndex", # Order by chunk index
|
||||
mergeType="concatenate"
|
||||
)
|
||||
|
||||
if options and hasattr(options, 'mergeStrategy'):
|
||||
merge_strategy.update(options.mergeStrategy)
|
||||
|
||||
# Apply existing merging logic using the sophisticated merging system
|
||||
from modules.services.serviceExtraction.subPipeline import _applyMerging
|
||||
|
|
@ -1297,15 +1275,13 @@ CONTINUATION INSTRUCTIONS:
|
|||
content_parts.append(content_part)
|
||||
|
||||
# Use existing merging strategy for clean mode
|
||||
merge_strategy = {
|
||||
"useIntelligentMerging": True,
|
||||
"groupBy": "documentId", # Group by document
|
||||
"orderBy": "chunkIndex", # Order by chunk index
|
||||
"mergeType": "concatenate"
|
||||
}
|
||||
merge_strategy = MergeStrategy(
|
||||
useIntelligentMerging=True,
|
||||
groupBy="documentId", # Group by document
|
||||
orderBy="chunkIndex", # Order by chunk index
|
||||
mergeType="concatenate"
|
||||
)
|
||||
|
||||
if options and hasattr(options, 'mergeStrategy'):
|
||||
merge_strategy.update(options.mergeStrategy)
|
||||
|
||||
# Apply existing merging logic using the sophisticated merging system
|
||||
from modules.services.serviceExtraction.subPipeline import _applyMerging
|
||||
|
|
@ -1351,15 +1327,13 @@ CONTINUATION INSTRUCTIONS:
|
|||
content_parts.append(content_part)
|
||||
|
||||
# Use existing merging strategy for JSON mode
|
||||
merge_strategy = {
|
||||
"useIntelligentMerging": True,
|
||||
"groupBy": "documentId", # Group by document
|
||||
"orderBy": "chunkIndex", # Order by chunk index
|
||||
"mergeType": "concatenate"
|
||||
}
|
||||
merge_strategy = MergeStrategy(
|
||||
useIntelligentMerging=True,
|
||||
groupBy="documentId", # Group by document
|
||||
orderBy="chunkIndex", # Order by chunk index
|
||||
mergeType="concatenate"
|
||||
)
|
||||
|
||||
if options and hasattr(options, 'mergeStrategy'):
|
||||
merge_strategy.update(options.mergeStrategy)
|
||||
|
||||
# Apply existing merging logic using the sophisticated merging system
|
||||
from modules.services.serviceExtraction.subPipeline import _applyMerging
|
||||
|
|
@ -1455,5 +1429,3 @@ CONTINUATION INSTRUCTIONS:
|
|||
|
||||
logger.info(f"Merged {len(chunkResults)} chunks using existing sophisticated merging system (JSON mode)")
|
||||
return merged_document
|
||||
|
||||
# REMOVED: _getModelCapabilitiesForContent method - no longer needed with model-aware chunking
|
||||
|
|
|
|||
|
|
@ -5,7 +5,7 @@ import time
|
|||
|
||||
from .subRegistry import ExtractorRegistry, ChunkerRegistry
|
||||
from .subPipeline import runExtraction
|
||||
from modules.datamodels.datamodelExtraction import ContentExtracted, ContentPart, MergeStrategy
|
||||
from modules.datamodels.datamodelExtraction import ContentExtracted, ContentPart, MergeStrategy, ExtractionOptions
|
||||
from modules.datamodels.datamodelChat import ChatDocument
|
||||
from modules.datamodels.datamodelAi import AiCallResponse
|
||||
from modules.aicore.aicoreModelRegistry import modelRegistry
|
||||
|
|
@ -20,7 +20,7 @@ class ExtractionService:
|
|||
self._extractorRegistry = ExtractorRegistry()
|
||||
self._chunkerRegistry = ChunkerRegistry()
|
||||
|
||||
def extractContent(self, documents: List[ChatDocument], options: Dict[str, Any]) -> List[ContentExtracted]:
|
||||
def extractContent(self, documents: List[ChatDocument], options: ExtractionOptions) -> List[ContentExtracted]:
|
||||
"""
|
||||
Extract content from a list of ChatDocument objects.
|
||||
|
||||
|
|
@ -31,6 +31,7 @@ class ExtractionService:
|
|||
Returns:
|
||||
List of ContentExtracted objects, one per input document
|
||||
"""
|
||||
|
||||
results: List[ContentExtracted] = []
|
||||
|
||||
# Lazy import to avoid circular deps and heavy init at module import
|
||||
|
|
|
|||
|
|
@ -1,9 +1,9 @@
|
|||
from typing import Any, Dict, List
|
||||
from modules.datamodels.datamodelExtraction import ContentPart
|
||||
from modules.datamodels.datamodelExtraction import ContentPart, MergeStrategy
|
||||
|
||||
|
||||
class DefaultMerger:
|
||||
def merge(self, parts: List[ContentPart], strategy: Dict[str, Any]) -> List[ContentPart]:
|
||||
def merge(self, parts: List[ContentPart], strategy: MergeStrategy) -> List[ContentPart]:
|
||||
"""
|
||||
Default merger that passes through parts unchanged.
|
||||
Used for image, binary, metadata, container typeGroups.
|
||||
|
|
|
|||
|
|
@ -1,10 +1,10 @@
|
|||
from typing import Any, Dict, List
|
||||
from modules.datamodels.datamodelExtraction import ContentPart
|
||||
from modules.datamodels.datamodelExtraction import ContentPart, MergeStrategy
|
||||
from ..subUtils import makeId
|
||||
|
||||
|
||||
class TableMerger:
|
||||
def merge(self, parts: List[ContentPart], strategy: Dict[str, Any]) -> List[ContentPart]:
|
||||
def merge(self, parts: List[ContentPart], strategy: MergeStrategy) -> List[ContentPart]:
|
||||
"""
|
||||
Merge table parts based on strategy.
|
||||
Strategy options:
|
||||
|
|
@ -15,9 +15,9 @@ class TableMerger:
|
|||
if not parts:
|
||||
return parts
|
||||
|
||||
groupBy = strategy.get("groupBy", "parentId")
|
||||
maxSize = strategy.get("maxSize", 0)
|
||||
combineSheets = strategy.get("combineSheets", False)
|
||||
groupBy = strategy.groupBy
|
||||
maxSize = strategy.maxSize or 0
|
||||
combineSheets = strategy.tableMerge.get("combineSheets", False) if strategy.tableMerge else False
|
||||
|
||||
# Group parts
|
||||
groups = self._groupParts(parts, groupBy, combineSheets)
|
||||
|
|
|
|||
|
|
@ -1,10 +1,10 @@
|
|||
from typing import Any, Dict, List
|
||||
from modules.datamodels.datamodelExtraction import ContentPart
|
||||
from modules.datamodels.datamodelExtraction import ContentPart, MergeStrategy
|
||||
from ..subUtils import makeId
|
||||
|
||||
|
||||
class TextMerger:
|
||||
def merge(self, parts: List[ContentPart], strategy: Dict[str, Any]) -> List[ContentPart]:
|
||||
def merge(self, parts: List[ContentPart], strategy: MergeStrategy) -> List[ContentPart]:
|
||||
"""
|
||||
Merge text parts based on strategy.
|
||||
Strategy options:
|
||||
|
|
@ -15,9 +15,9 @@ class TextMerger:
|
|||
if not parts:
|
||||
return parts
|
||||
|
||||
groupBy = strategy.get("groupBy", "parentId")
|
||||
orderBy = strategy.get("orderBy", "label")
|
||||
maxSize = strategy.get("maxSize", 0)
|
||||
groupBy = strategy.groupBy
|
||||
orderBy = strategy.orderBy
|
||||
maxSize = strategy.maxSize or 0
|
||||
|
||||
# Group parts
|
||||
groups = self._groupParts(parts, groupBy)
|
||||
|
|
|
|||
|
|
@ -1,8 +1,7 @@
|
|||
from typing import Any, Dict, List
|
||||
from typing import List
|
||||
import logging
|
||||
import os
|
||||
|
||||
from modules.datamodels.datamodelExtraction import ContentExtracted, ContentPart
|
||||
from modules.datamodels.datamodelExtraction import ContentExtracted, ContentPart, ExtractionOptions, MergeStrategy
|
||||
from .subUtils import makeId
|
||||
from .subRegistry import ExtractorRegistry, ChunkerRegistry
|
||||
from .merging.mergerText import TextMerger
|
||||
|
|
@ -13,13 +12,13 @@ from .subMerger import IntelligentTokenAwareMerger
|
|||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def _mergeParts(parts: List[ContentPart], mergeStrategy: Dict[str, Any]) -> List[ContentPart]:
|
||||
def _mergeParts(parts: List[ContentPart], mergeStrategy: MergeStrategy) -> List[ContentPart]:
|
||||
"""Merge parts based on the provided strategy."""
|
||||
if not parts or not mergeStrategy:
|
||||
return parts
|
||||
|
||||
groupBy = mergeStrategy.get("groupBy", "typeGroup")
|
||||
orderBy = mergeStrategy.get("orderBy", "id")
|
||||
groupBy = mergeStrategy.groupBy
|
||||
orderBy = mergeStrategy.orderBy
|
||||
|
||||
# Group parts by the specified field
|
||||
groups = {}
|
||||
|
|
@ -56,7 +55,8 @@ def _mergeParts(parts: List[ContentPart], mergeStrategy: Dict[str, Any]) -> List
|
|||
return merged_parts
|
||||
|
||||
|
||||
def runExtraction(extractorRegistry: ExtractorRegistry, chunkerRegistry: ChunkerRegistry, documentBytes: bytes, fileName: str, mimeType: str, options: Dict[str, Any]) -> ContentExtracted:
|
||||
def runExtraction(extractorRegistry: ExtractorRegistry, chunkerRegistry: ChunkerRegistry, documentBytes: bytes, fileName: str, mimeType: str, options: ExtractionOptions) -> ContentExtracted:
|
||||
|
||||
extractor = extractorRegistry.resolve(mimeType, fileName)
|
||||
if extractor is None:
|
||||
# fallback: single binary part
|
||||
|
|
@ -71,15 +71,14 @@ def runExtraction(extractorRegistry: ExtractorRegistry, chunkerRegistry: Chunker
|
|||
)
|
||||
return ContentExtracted(id=makeId(), parts=[part])
|
||||
|
||||
parts = extractor.extract(documentBytes, {"fileName": fileName, "mimeType": mimeType, "options": options})
|
||||
parts = extractor.extract(documentBytes, {"fileName": fileName, "mimeType": mimeType})
|
||||
|
||||
# REMOVED: poolAndLimit(parts, chunkerRegistry, options)
|
||||
# REMOVED: Chunking logic - now handled in AI call phase
|
||||
|
||||
# Apply merging strategy if provided (preserve existing logic)
|
||||
mergeStrategy = options.get("mergeStrategy", {})
|
||||
if mergeStrategy:
|
||||
parts = _applyMerging(parts, mergeStrategy)
|
||||
if options.mergeStrategy:
|
||||
parts = _applyMerging(parts, options.mergeStrategy)
|
||||
|
||||
return ContentExtracted(id=makeId(), parts=parts)
|
||||
|
||||
|
|
@ -87,17 +86,17 @@ def runExtraction(extractorRegistry: ExtractorRegistry, chunkerRegistry: Chunker
|
|||
# REMOVED: poolAndLimit function - chunking now handled in AI call phase
|
||||
|
||||
|
||||
def _applyMerging(parts: List[ContentPart], strategy: Dict[str, Any]) -> List[ContentPart]:
|
||||
def _applyMerging(parts: List[ContentPart], strategy: MergeStrategy) -> List[ContentPart]:
|
||||
"""Apply merging strategy to parts with intelligent token-aware merging."""
|
||||
logger.debug(f"_applyMerging called with {len(parts)} parts")
|
||||
|
||||
# Check if intelligent merging is enabled
|
||||
if strategy.get("useIntelligentMerging", False):
|
||||
model_capabilities = strategy.get("capabilities", {})
|
||||
if strategy.useIntelligentMerging:
|
||||
model_capabilities = strategy.capabilities or {}
|
||||
subMerger = IntelligentTokenAwareMerger(model_capabilities)
|
||||
|
||||
# Use intelligent merging for all parts
|
||||
merged = subMerger.merge_chunks_intelligently(parts, strategy.get("prompt", ""))
|
||||
merged = subMerger.merge_chunks_intelligently(parts, strategy.prompt or "")
|
||||
|
||||
# Calculate and log optimization stats
|
||||
stats = subMerger.calculate_optimization_stats(parts, merged)
|
||||
|
|
|
|||
|
|
@ -90,7 +90,7 @@ class ModelSelectionTester:
|
|||
totalScore = sizeRating + processingModeRating + priorityRating
|
||||
|
||||
print(
|
||||
f" {idx:>2}. {m.name} | Q={getattr(m, 'qualityRating', 0)} | S={getattr(m, 'speedRating', 0)} | ${costIn:.4f} | ctx={getattr(m, 'contextLength', 0)} | score={totalScore:.3f}"
|
||||
f" {idx:>2}. {m.displayName} | Q={getattr(m, 'qualityRating', 0)} | S={getattr(m, 'speedRating', 0)} | ${costIn:.4f} | ctx={getattr(m, 'contextLength', 0)} | score={totalScore:.3f}"
|
||||
)
|
||||
print(f" Size: {sizeRating:.3f}, ProcessingMode: {processingModeRating:.3f}, Priority: {priorityRating:.3f}")
|
||||
|
||||
|
|
@ -136,7 +136,7 @@ class ModelSelectionTester:
|
|||
totalScore = sizeRating + processingModeRating + priorityRating
|
||||
|
||||
print(
|
||||
f" {idx:>2}. {m.name} | Q={getattr(m, 'qualityRating', 0)} | S={getattr(m, 'speedRating', 0)} | ${costIn:.4f} | ctx={getattr(m, 'contextLength', 0)} | score={totalScore:.3f}"
|
||||
f" {idx:>2}. {m.displayName} | Q={getattr(m, 'qualityRating', 0)} | S={getattr(m, 'speedRating', 0)} | ${costIn:.4f} | ctx={getattr(m, 'contextLength', 0)} | score={totalScore:.3f}"
|
||||
)
|
||||
print(f" Size: {sizeRating:.3f}, ProcessingMode: {processingModeRating:.3f}, Priority: {priorityRating:.3f}")
|
||||
|
||||
|
|
@ -365,8 +365,8 @@ class ModelSelectionTester:
|
|||
)
|
||||
|
||||
if failoverModelList:
|
||||
print(f" Selected model: {failoverModelList[0].name}")
|
||||
print(f" Fallback models: {[m.name for m in failoverModelList[1:3]]}")
|
||||
print(f" Selected model: {failoverModelList[0].displayName}")
|
||||
print(f" Fallback models: {[m.displayName for m in failoverModelList[1:3]]}")
|
||||
else:
|
||||
print(" No suitable models found")
|
||||
|
||||
|
|
@ -393,8 +393,8 @@ class ModelSelectionTester:
|
|||
)
|
||||
|
||||
if failoverModelList:
|
||||
print(f" Selected model: {failoverModelList[0].name}")
|
||||
print(f" Fallback models: {[m.name for m in failoverModelList[1:3]]}")
|
||||
print(f" Selected model: {failoverModelList[0].displayName}")
|
||||
print(f" Fallback models: {[m.displayName for m in failoverModelList[1:3]]}")
|
||||
else:
|
||||
print(" No suitable models found")
|
||||
|
||||
|
|
@ -421,8 +421,8 @@ class ModelSelectionTester:
|
|||
)
|
||||
|
||||
if failoverModelList:
|
||||
print(f" Selected model: {failoverModelList[0].name}")
|
||||
print(f" Fallback models: {[m.name for m in failoverModelList[1:3]]}")
|
||||
print(f" Selected model: {failoverModelList[0].displayName}")
|
||||
print(f" Fallback models: {[m.displayName for m in failoverModelList[1:3]]}")
|
||||
else:
|
||||
print(" No suitable models found")
|
||||
|
||||
|
|
@ -449,8 +449,8 @@ class ModelSelectionTester:
|
|||
)
|
||||
|
||||
if failoverModelList:
|
||||
print(f" Selected model: {failoverModelList[0].name}")
|
||||
print(f" Fallback models: {[m.name for m in failoverModelList[1:3]]}")
|
||||
print(f" Selected model: {failoverModelList[0].displayName}")
|
||||
print(f" Fallback models: {[m.displayName for m in failoverModelList[1:3]]}")
|
||||
else:
|
||||
print(" No suitable models found")
|
||||
|
||||
|
|
@ -479,15 +479,15 @@ class ModelSelectionTester:
|
|||
print(f" {connector_type}: {len(models)} models")
|
||||
for model in models:
|
||||
capabilities = getattr(model, 'capabilities', [])
|
||||
print(f" - {model.name}: {capabilities}")
|
||||
print(f" - {model.displayName}: {capabilities}")
|
||||
|
||||
# Show operation type support
|
||||
print(f"\nOperation type support:")
|
||||
for op_type in OperationTypeEnum:
|
||||
supported_models = [m for m in availableModels if hasattr(m, 'operationTypes') and op_type in m.operationTypes]
|
||||
supported_models = [m for m in availableModels if hasattr(m, 'operationTypes') and any(ot.operationType == op_type for ot in m.operationTypes)]
|
||||
print(f" {op_type.name}: {len(supported_models)} models")
|
||||
if supported_models:
|
||||
model_names = [m.name for m in supported_models[:3]] # Show first 3 models
|
||||
model_names = [m.displayName for m in supported_models[:3]] # Show first 3 models
|
||||
print(f" Models: {', '.join(model_names)}")
|
||||
|
||||
|
||||
|
|
|
|||
Loading…
Reference in a new issue