Clean up Pydantic v2 usage: unnecessary Pydantic-to-dict conversions removed, unnecessary unions removed and replaced with clean classes
This commit is contained in:
parent
4f7bba5f33
commit
8523da7fe2
15 changed files with 188 additions and 319 deletions
|
|
@ -50,7 +50,7 @@ class AiAnthropic(BaseConnectorAi):
|
||||||
connectorType="anthropic",
|
connectorType="anthropic",
|
||||||
apiUrl="https://api.anthropic.com/v1/messages",
|
apiUrl="https://api.anthropic.com/v1/messages",
|
||||||
temperature=0.2,
|
temperature=0.2,
|
||||||
maxTokens=200000,
|
maxTokens=8192,
|
||||||
contextLength=200000,
|
contextLength=200000,
|
||||||
costPer1kTokensInput=0.015,
|
costPer1kTokensInput=0.015,
|
||||||
costPer1kTokensOutput=0.075,
|
costPer1kTokensOutput=0.075,
|
||||||
|
|
@ -75,7 +75,7 @@ class AiAnthropic(BaseConnectorAi):
|
||||||
connectorType="anthropic",
|
connectorType="anthropic",
|
||||||
apiUrl="https://api.anthropic.com/v1/messages",
|
apiUrl="https://api.anthropic.com/v1/messages",
|
||||||
temperature=0.2,
|
temperature=0.2,
|
||||||
maxTokens=200000,
|
maxTokens=8192,
|
||||||
contextLength=200000,
|
contextLength=200000,
|
||||||
costPer1kTokensInput=0.015,
|
costPer1kTokensInput=0.015,
|
||||||
costPer1kTokensOutput=0.075,
|
costPer1kTokensOutput=0.075,
|
||||||
|
|
|
||||||
|
|
@ -720,22 +720,9 @@ class DatabaseConnector:
|
||||||
logger.info(f"Initial ID {initialId} for table {table} registered")
|
logger.info(f"Initial ID {initialId} for table {table} registered")
|
||||||
return success
|
return success
|
||||||
else:
|
else:
|
||||||
# Check if the existing initial ID still exists in the table
|
# Table already has an initial ID registered
|
||||||
existingInitialId = systemData[table]
|
logger.debug(f"Table {table} already has initial ID {systemData[table]}")
|
||||||
records = self.getRecordset(
|
return True
|
||||||
model_class, recordFilter={"id": existingInitialId}
|
|
||||||
)
|
|
||||||
if not records:
|
|
||||||
# The initial record no longer exists, update to the new one
|
|
||||||
systemData[table] = initialId
|
|
||||||
success = self._saveSystemTable(systemData)
|
|
||||||
if success:
|
|
||||||
logger.info(
|
|
||||||
f"Initial ID updated from {existingInitialId} to {initialId} for table {table}"
|
|
||||||
)
|
|
||||||
return success
|
|
||||||
else:
|
|
||||||
return True
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"Error registering the initial ID for table {table}: {e}")
|
logger.error(f"Error registering the initial ID for table {table}: {e}")
|
||||||
return False
|
return False
|
||||||
|
|
|
||||||
|
|
@ -135,7 +135,6 @@ class AiCallOptions(BaseModel):
|
||||||
compressPrompt: bool = Field(default=True, description="Whether to compress the prompt")
|
compressPrompt: bool = Field(default=True, description="Whether to compress the prompt")
|
||||||
compressContext: bool = Field(default=True, description="If False: process each chunk; If True: summarize and work on summary")
|
compressContext: bool = Field(default=True, description="If False: process each chunk; If True: summarize and work on summary")
|
||||||
processDocumentsIndividually: bool = Field(default=True, description="If True, process each document separately; else pool docs")
|
processDocumentsIndividually: bool = Field(default=True, description="If True, process each document separately; else pool docs")
|
||||||
maxContextBytes: Optional[int] = Field(default=None, description="Hard cap for extracted context size passed to the model")
|
|
||||||
maxCost: Optional[float] = Field(default=None, description="Max cost budget")
|
maxCost: Optional[float] = Field(default=None, description="Max cost budget")
|
||||||
maxProcessingTime: Optional[int] = Field(default=None, description="Max processing time in seconds")
|
maxProcessingTime: Optional[int] = Field(default=None, description="Max processing time in seconds")
|
||||||
processingMode: ProcessingModeEnum = Field(default=ProcessingModeEnum.BASIC, description="Processing mode")
|
processingMode: ProcessingModeEnum = Field(default=ProcessingModeEnum.BASIC, description="Processing mode")
|
||||||
|
|
@ -145,7 +144,6 @@ class AiCallOptions(BaseModel):
|
||||||
|
|
||||||
# Model generation parameters
|
# Model generation parameters
|
||||||
temperature: Optional[float] = Field(default=None, ge=0.0, le=2.0, description="Temperature for response generation (0.0-2.0, lower = more consistent)")
|
temperature: Optional[float] = Field(default=None, ge=0.0, le=2.0, description="Temperature for response generation (0.0-2.0, lower = more consistent)")
|
||||||
maxTokens: Optional[int] = Field(default=None, ge=1, le=32000, description="Maximum tokens in response")
|
|
||||||
maxParts: Optional[int] = Field(default=1000, ge=1, le=1000, description="Maximum number of continuation parts to fetch")
|
maxParts: Optional[int] = Field(default=1000, ge=1, le=1000, description="Maximum number of continuation parts to fetch")
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -170,32 +168,6 @@ class AiCallResponse(BaseModel):
|
||||||
errorCount: int = Field(default=0, description="0 for success, 1+ for errors")
|
errorCount: int = Field(default=0, description="0 for success, 1+ for errors")
|
||||||
|
|
||||||
|
|
||||||
class EnhancedAiCallOptions(AiCallOptions):
|
|
||||||
"""Enhanced options for improved document processing with chunk mapping."""
|
|
||||||
|
|
||||||
# Parallel processing
|
|
||||||
enableParallelProcessing: bool = Field(
|
|
||||||
default=True,
|
|
||||||
description="Enable parallel processing of chunks"
|
|
||||||
)
|
|
||||||
maxConcurrentChunks: int = Field(
|
|
||||||
default=5,
|
|
||||||
ge=1,
|
|
||||||
le=20,
|
|
||||||
description="Maximum number of chunks to process concurrently"
|
|
||||||
)
|
|
||||||
|
|
||||||
# Chunk mapping
|
|
||||||
preserveChunkMetadata: bool = Field(
|
|
||||||
default=True,
|
|
||||||
description="Preserve chunk metadata during processing"
|
|
||||||
)
|
|
||||||
chunkSeparator: str = Field(
|
|
||||||
default="\n\n---\n\n",
|
|
||||||
description="Separator between chunks in merged output"
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
class AiModelCall(BaseModel):
|
class AiModelCall(BaseModel):
|
||||||
"""Standardized input for AI model calls."""
|
"""Standardized input for AI model calls."""
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -1,6 +1,9 @@
|
||||||
from typing import Any, Dict, List, Optional, Literal
|
from typing import Any, Dict, List, Optional, Literal, TYPE_CHECKING
|
||||||
from pydantic import BaseModel, Field
|
from pydantic import BaseModel, Field
|
||||||
|
|
||||||
|
if TYPE_CHECKING:
|
||||||
|
from modules.datamodels.datamodelAi import OperationTypeEnum
|
||||||
|
|
||||||
|
|
||||||
class ContentPart(BaseModel):
|
class ContentPart(BaseModel):
|
||||||
id: str = Field(description="Unique content part identifier")
|
id: str = Field(description="Unique content part identifier")
|
||||||
|
|
@ -40,106 +43,49 @@ class PartResult(BaseModel):
|
||||||
|
|
||||||
class MergeStrategy(BaseModel):
|
class MergeStrategy(BaseModel):
|
||||||
"""Strategy configuration for merging content parts and AI results."""
|
"""Strategy configuration for merging content parts and AI results."""
|
||||||
|
groupBy: str = Field(default="typeGroup", description="Field to group parts by (typeGroup, parentId, label, etc.)")
|
||||||
|
orderBy: str = Field(default="id", description="Field to order parts within groups (id, order, pageIndex, etc.)")
|
||||||
|
mergeType: Literal["concatenate", "hierarchical", "intelligent"] = Field(default="concatenate", description="How to merge content within groups")
|
||||||
|
maxSize: Optional[int] = Field(default=None, description="Maximum size for merged content in bytes")
|
||||||
|
textMerge: Optional[Dict[str, Any]] = Field(default=None, description="Text-specific merge settings (separator, formatting, etc.)")
|
||||||
|
tableMerge: Optional[Dict[str, Any]] = Field(default=None, description="Table-specific merge settings (header handling, etc.)")
|
||||||
|
structureMerge: Optional[Dict[str, Any]] = Field(default=None, description="Structure-specific merge settings (hierarchy, etc.)")
|
||||||
|
aiResultMerge: Optional[Dict[str, Any]] = Field(default=None, description="AI result merging settings (prompt, context, etc.)")
|
||||||
|
preserveChunks: bool = Field(default=False, description="Whether to preserve individual chunks or merge them")
|
||||||
|
chunkSeparator: str = Field(default="\n\n---\n\n", description="Separator between chunks when merging")
|
||||||
|
preserveMetadata: bool = Field(default=True, description="Whether to preserve metadata from original parts")
|
||||||
|
metadataFields: Optional[List[str]] = Field(default=None, description="Specific metadata fields to preserve (None = all)")
|
||||||
|
onError: Literal["skip", "include", "fail"] = Field(default="skip", description="How to handle errors during merging")
|
||||||
|
validateContent: bool = Field(default=True, description="Whether to validate content before merging")
|
||||||
|
useIntelligentMerging: bool = Field(default=False, description="Whether to use intelligent token-aware merging")
|
||||||
|
prompt: Optional[str] = Field(default=None, description="Prompt for intelligent merging")
|
||||||
|
capabilities: Optional[Dict[str, Any]] = Field(default=None, description="Model capabilities for intelligent merging")
|
||||||
|
|
||||||
# Grouping configuration
|
|
||||||
groupBy: str = Field(
|
|
||||||
default="typeGroup",
|
|
||||||
description="Field to group parts by (typeGroup, parentId, label, etc.)"
|
|
||||||
)
|
|
||||||
|
|
||||||
# Ordering configuration
|
class ExtractionOptions(BaseModel):
|
||||||
orderBy: str = Field(
|
"""Options for document extraction and processing with clear data structures."""
|
||||||
default="id",
|
|
||||||
description="Field to order parts within groups (id, order, pageIndex, etc.)"
|
|
||||||
)
|
|
||||||
|
|
||||||
# Merge behavior
|
# Core extraction parameters
|
||||||
mergeType: Literal["concatenate", "hierarchical", "intelligent"] = Field(
|
prompt: str = Field(description="Extraction prompt for AI processing")
|
||||||
default="concatenate",
|
operationType: 'OperationTypeEnum' = Field(description="Type of operation for AI processing")
|
||||||
description="How to merge content within groups"
|
processDocumentsIndividually: bool = Field(default=True, description="Process each document separately")
|
||||||
)
|
|
||||||
|
|
||||||
# Size limits
|
# Image processing parameters
|
||||||
maxSize: Optional[int] = Field(
|
imageMaxPixels: int = Field(default=1024 * 1024, ge=1, description="Maximum pixels for image processing")
|
||||||
default=None,
|
imageQuality: int = Field(default=85, ge=1, le=100, description="Image quality (1-100)")
|
||||||
description="Maximum size for merged content in bytes"
|
|
||||||
)
|
|
||||||
|
|
||||||
# Type-specific merge settings
|
# Merging strategy
|
||||||
textMerge: Optional[Dict[str, Any]] = Field(
|
mergeStrategy: MergeStrategy = Field(description="Strategy for merging extraction results")
|
||||||
default=None,
|
|
||||||
description="Text-specific merge settings (separator, formatting, etc.)"
|
|
||||||
)
|
|
||||||
|
|
||||||
tableMerge: Optional[Dict[str, Any]] = Field(
|
# Optional chunking parameters (for backward compatibility)
|
||||||
default=None,
|
chunkAllowed: Optional[bool] = Field(default=None, description="Whether chunking is allowed")
|
||||||
description="Table-specific merge settings (header handling, etc.)"
|
maxSize: Optional[int] = Field(default=None, description="Maximum size for processing")
|
||||||
)
|
textChunkSize: Optional[int] = Field(default=None, description="Size for text chunks")
|
||||||
|
imageChunkSize: Optional[int] = Field(default=None, description="Size for image chunks")
|
||||||
|
|
||||||
structureMerge: Optional[Dict[str, Any]] = Field(
|
# Additional processing options
|
||||||
default=None,
|
enableParallelProcessing: bool = Field(default=True, description="Enable parallel processing of chunks")
|
||||||
description="Structure-specific merge settings (hierarchy, etc.)"
|
maxConcurrentChunks: int = Field(default=5, ge=1, le=20, description="Maximum number of chunks to process concurrently")
|
||||||
)
|
|
||||||
|
|
||||||
# AI result merging
|
|
||||||
aiResultMerge: Optional[Dict[str, Any]] = Field(
|
|
||||||
default=None,
|
|
||||||
description="AI result merging settings (prompt, context, etc.)"
|
|
||||||
)
|
|
||||||
|
|
||||||
# Chunk handling
|
|
||||||
preserveChunks: bool = Field(
|
|
||||||
default=False,
|
|
||||||
description="Whether to preserve individual chunks or merge them"
|
|
||||||
)
|
|
||||||
|
|
||||||
chunkSeparator: str = Field(
|
|
||||||
default="\n\n---\n\n",
|
|
||||||
description="Separator between chunks when merging"
|
|
||||||
)
|
|
||||||
|
|
||||||
# Metadata handling
|
|
||||||
preserveMetadata: bool = Field(
|
|
||||||
default=True,
|
|
||||||
description="Whether to preserve metadata from original parts"
|
|
||||||
)
|
|
||||||
|
|
||||||
metadataFields: Optional[List[str]] = Field(
|
|
||||||
default=None,
|
|
||||||
description="Specific metadata fields to preserve (None = all)"
|
|
||||||
)
|
|
||||||
|
|
||||||
# Error handling
|
|
||||||
onError: Literal["skip", "include", "fail"] = Field(
|
|
||||||
default="skip",
|
|
||||||
description="How to handle errors during merging"
|
|
||||||
)
|
|
||||||
|
|
||||||
# Validation
|
|
||||||
validateContent: bool = Field(
|
|
||||||
default=True,
|
|
||||||
description="Whether to validate content before merging"
|
|
||||||
)
|
|
||||||
|
|
||||||
def getTypeSpecificSettings(self, typeGroup: str) -> Dict[str, Any]:
|
|
||||||
"""Get type-specific merge settings for a content type."""
|
|
||||||
if typeGroup == "text" and self.textMerge:
|
|
||||||
return self.textMerge
|
|
||||||
elif typeGroup == "table" and self.tableMerge:
|
|
||||||
return self.tableMerge
|
|
||||||
elif typeGroup == "structure" and self.structureMerge:
|
|
||||||
return self.structureMerge
|
|
||||||
else:
|
|
||||||
return {}
|
|
||||||
|
|
||||||
def shouldPreserveChunk(self, chunk: Dict[str, Any]) -> bool:
|
|
||||||
"""Determine if a chunk should be preserved based on strategy."""
|
|
||||||
if not self.preserveChunks:
|
|
||||||
return False
|
|
||||||
|
|
||||||
# Check if chunk has error metadata
|
|
||||||
if self.onError == "skip" and chunk.get("metadata", {}).get("error"):
|
|
||||||
return False
|
|
||||||
|
|
||||||
return True
|
|
||||||
|
|
||||||
|
class Config:
|
||||||
|
arbitraryTypesAllowed = True # Allow OperationTypeEnum import
|
||||||
|
|
|
||||||
|
|
@ -127,7 +127,7 @@ class AiObjects:
|
||||||
logger.info(f"Attempting AI call with model: {model.name} (attempt {attempt + 1}/{len(failoverModelList)})")
|
logger.info(f"Attempting AI call with model: {model.name} (attempt {attempt + 1}/{len(failoverModelList)})")
|
||||||
|
|
||||||
# Call the model
|
# Call the model
|
||||||
response = await self._callWithModel(model, prompt, context)
|
response = await self._callWithModel(model, prompt, context, options)
|
||||||
|
|
||||||
logger.info(f"✅ AI call successful with model: {model.name}")
|
logger.info(f"✅ AI call successful with model: {model.name}")
|
||||||
return response
|
return response
|
||||||
|
|
@ -204,7 +204,7 @@ class AiObjects:
|
||||||
|
|
||||||
if partSize <= modelContextBytes:
|
if partSize <= modelContextBytes:
|
||||||
# Part fits - call AI directly
|
# Part fits - call AI directly
|
||||||
response = await self._callWithModel(model, prompt, contentPart.data)
|
response = await self._callWithModel(model, prompt, contentPart.data, options)
|
||||||
logger.info(f"✅ Content part processed successfully with model: {model.name}")
|
logger.info(f"✅ Content part processed successfully with model: {model.name}")
|
||||||
return response
|
return response
|
||||||
else:
|
else:
|
||||||
|
|
@ -216,7 +216,7 @@ class AiObjects:
|
||||||
# Process each chunk
|
# Process each chunk
|
||||||
chunkResults = []
|
chunkResults = []
|
||||||
for chunk in chunks:
|
for chunk in chunks:
|
||||||
chunkResponse = await self._callWithModel(model, prompt, chunk['data'])
|
chunkResponse = await self._callWithModel(model, prompt, chunk['data'], options)
|
||||||
chunkResults.append(chunkResponse)
|
chunkResults.append(chunkResponse)
|
||||||
|
|
||||||
# Merge chunk results
|
# Merge chunk results
|
||||||
|
|
@ -393,7 +393,7 @@ class AiObjects:
|
||||||
errorCount=1
|
errorCount=1
|
||||||
)
|
)
|
||||||
|
|
||||||
async def _callWithModel(self, model: AiModel, prompt: str, context: str) -> AiCallResponse:
|
async def _callWithModel(self, model: AiModel, prompt: str, context: str, options: AiCallOptions = None) -> AiCallResponse:
|
||||||
"""Call a specific model and return the response."""
|
"""Call a specific model and return the response."""
|
||||||
# Calculate input bytes from prompt and context
|
# Calculate input bytes from prompt and context
|
||||||
inputBytes = len((prompt + context).encode('utf-8'))
|
inputBytes = len((prompt + context).encode('utf-8'))
|
||||||
|
|
@ -430,7 +430,8 @@ class AiObjects:
|
||||||
# Create standardized call object
|
# Create standardized call object
|
||||||
modelCall = AiModelCall(
|
modelCall = AiModelCall(
|
||||||
messages=messages,
|
messages=messages,
|
||||||
model=model
|
model=model,
|
||||||
|
options=options or {}
|
||||||
)
|
)
|
||||||
|
|
||||||
# Call the model with standardized interface
|
# Call the model with standardized interface
|
||||||
|
|
|
||||||
|
|
@ -873,7 +873,7 @@ class ChatObjects:
|
||||||
stat = ChatStat(**statData)
|
stat = ChatStat(**statData)
|
||||||
|
|
||||||
# Create the stat record in the database
|
# Create the stat record in the database
|
||||||
created = self.db.recordCreate(ChatStat, stat.model_dump())
|
created = self.db.recordCreate(ChatStat, stat)
|
||||||
|
|
||||||
# Return the created ChatStat
|
# Return the created ChatStat
|
||||||
return ChatStat(**created)
|
return ChatStat(**created)
|
||||||
|
|
@ -937,7 +937,7 @@ class ChatObjects:
|
||||||
items.append({
|
items.append({
|
||||||
"type": "message",
|
"type": "message",
|
||||||
"createdAt": msg_timestamp,
|
"createdAt": msg_timestamp,
|
||||||
"item": chat_message.model_dump()
|
"item": chat_message
|
||||||
})
|
})
|
||||||
|
|
||||||
# Get logs
|
# Get logs
|
||||||
|
|
@ -952,7 +952,7 @@ class ChatObjects:
|
||||||
items.append({
|
items.append({
|
||||||
"type": "log",
|
"type": "log",
|
||||||
"createdAt": log_timestamp,
|
"createdAt": log_timestamp,
|
||||||
"item": chat_log.model_dump()
|
"item": chat_log
|
||||||
})
|
})
|
||||||
|
|
||||||
# Get stats list
|
# Get stats list
|
||||||
|
|
@ -966,7 +966,7 @@ class ChatObjects:
|
||||||
items.append({
|
items.append({
|
||||||
"type": "stat",
|
"type": "stat",
|
||||||
"createdAt": stat_timestamp,
|
"createdAt": stat_timestamp,
|
||||||
"item": stat.model_dump()
|
"item": stat
|
||||||
})
|
})
|
||||||
|
|
||||||
# Sort all items by createdAt timestamp for chronological order
|
# Sort all items by createdAt timestamp for chronological order
|
||||||
|
|
|
||||||
|
|
@ -48,11 +48,8 @@ async def create_prompt(
|
||||||
"""Create a new prompt"""
|
"""Create a new prompt"""
|
||||||
managementInterface = interfaceDbComponentObjects.getInterface(currentUser)
|
managementInterface = interfaceDbComponentObjects.getInterface(currentUser)
|
||||||
|
|
||||||
# Convert Prompt to dict for interface
|
|
||||||
prompt_data = prompt.model_dump()
|
|
||||||
|
|
||||||
# Create prompt
|
# Create prompt
|
||||||
newPrompt = managementInterface.createPrompt(prompt_data)
|
newPrompt = managementInterface.createPrompt(prompt)
|
||||||
|
|
||||||
return Prompt(**newPrompt)
|
return Prompt(**newPrompt)
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -92,11 +92,8 @@ async def create_user(
|
||||||
"""Create a new user"""
|
"""Create a new user"""
|
||||||
appInterface = interfaceDbAppObjects.getInterface(currentUser)
|
appInterface = interfaceDbAppObjects.getInterface(currentUser)
|
||||||
|
|
||||||
# Convert User to dict for interface
|
|
||||||
user_dict = user_data.model_dump()
|
|
||||||
|
|
||||||
# Create user
|
# Create user
|
||||||
newUser = appInterface.createUser(user_dict)
|
newUser = appInterface.createUser(user_data)
|
||||||
|
|
||||||
return newUser
|
return newUser
|
||||||
|
|
||||||
|
|
@ -119,11 +116,8 @@ async def update_user(
|
||||||
detail=f"User with ID {userId} not found"
|
detail=f"User with ID {userId} not found"
|
||||||
)
|
)
|
||||||
|
|
||||||
# Convert User to dict for interface
|
|
||||||
update_data = userData.model_dump()
|
|
||||||
|
|
||||||
# Update user
|
# Update user
|
||||||
updatedUser = appInterface.updateUser(userId, update_data)
|
updatedUser = appInterface.updateUser(userId, userData)
|
||||||
|
|
||||||
if not updatedUser:
|
if not updatedUser:
|
||||||
raise HTTPException(
|
raise HTTPException(
|
||||||
|
|
|
||||||
|
|
@ -5,7 +5,7 @@ import time
|
||||||
from typing import Dict, Any, List, Optional, Tuple, Union
|
from typing import Dict, Any, List, Optional, Tuple, Union
|
||||||
from modules.datamodels.datamodelChat import ChatDocument
|
from modules.datamodels.datamodelChat import ChatDocument
|
||||||
from modules.datamodels.datamodelAi import AiCallRequest, AiCallOptions, OperationTypeEnum, PriorityEnum
|
from modules.datamodels.datamodelAi import AiCallRequest, AiCallOptions, OperationTypeEnum, PriorityEnum
|
||||||
from modules.datamodels.datamodelExtraction import ChunkResult, ContentExtracted, PartResult
|
from modules.datamodels.datamodelExtraction import ChunkResult, ContentExtracted, PartResult, ExtractionOptions, MergeStrategy
|
||||||
from modules.services.serviceExtraction.mainServiceExtraction import ExtractionService
|
from modules.services.serviceExtraction.mainServiceExtraction import ExtractionService
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
@ -33,19 +33,6 @@ class SubDocumentProcessing:
|
||||||
self._extractionService = ExtractionService(self.services)
|
self._extractionService = ExtractionService(self.services)
|
||||||
return self._extractionService
|
return self._extractionService
|
||||||
|
|
||||||
def _calculateMaxContextBytes(self, options: Optional[AiCallOptions]) -> int:
|
|
||||||
"""Calculate maximum context bytes based on model capabilities and options."""
|
|
||||||
if options and options.maxContextBytes:
|
|
||||||
return options.maxContextBytes
|
|
||||||
|
|
||||||
# Default model capabilities (this should be enhanced with actual model registry)
|
|
||||||
defaultMaxTokens = 4000
|
|
||||||
safetyMargin = options.safetyMargin if options else 0.1
|
|
||||||
|
|
||||||
# Calculate bytes (4 chars per token estimation)
|
|
||||||
maxContextBytes = int(defaultMaxTokens * (1 - safetyMargin) * 4)
|
|
||||||
|
|
||||||
return maxContextBytes
|
|
||||||
|
|
||||||
async def processDocumentsPerChunk(
|
async def processDocumentsPerChunk(
|
||||||
self,
|
self,
|
||||||
|
|
@ -68,22 +55,23 @@ class SubDocumentProcessing:
|
||||||
if not documents:
|
if not documents:
|
||||||
return ""
|
return ""
|
||||||
|
|
||||||
# Build extraction options WITHOUT chunking parameters
|
# Build extraction options using Pydantic model
|
||||||
extractionOptions: Dict[str, Any] = {
|
mergeStrategy = MergeStrategy(
|
||||||
"prompt": prompt,
|
useIntelligentMerging=True,
|
||||||
"operationType": options.operationType if options else OperationTypeEnum.DATA_EXTRACT,
|
prompt=prompt,
|
||||||
"processDocumentsIndividually": True,
|
groupBy="typeGroup",
|
||||||
# REMOVED: maxSize, textChunkSize, imageChunkSize
|
orderBy="id",
|
||||||
"mergeStrategy": {
|
mergeType="concatenate"
|
||||||
"useIntelligentMerging": True,
|
)
|
||||||
"prompt": prompt,
|
|
||||||
"groupBy": "typeGroup",
|
|
||||||
"orderBy": "id",
|
|
||||||
"mergeType": "concatenate"
|
|
||||||
},
|
|
||||||
}
|
|
||||||
|
|
||||||
logger.debug(f"Per-chunk extraction options: prompt length={len(extractionOptions.get('prompt', ''))} chars, operationType={extractionOptions.get('operationType')}")
|
extractionOptions = ExtractionOptions(
|
||||||
|
prompt=prompt,
|
||||||
|
operationType=options.operationType if options else OperationTypeEnum.DATA_EXTRACT,
|
||||||
|
processDocumentsIndividually=True,
|
||||||
|
mergeStrategy=mergeStrategy
|
||||||
|
)
|
||||||
|
|
||||||
|
logger.debug(f"Per-chunk extraction options: prompt length={len(extractionOptions.prompt)} chars, operationType={extractionOptions.operationType}")
|
||||||
|
|
||||||
try:
|
try:
|
||||||
# Extract content WITHOUT chunking
|
# Extract content WITHOUT chunking
|
||||||
|
|
@ -120,21 +108,23 @@ class SubDocumentProcessing:
|
||||||
if not documents:
|
if not documents:
|
||||||
return {"metadata": {"title": "Empty Document"}, "sections": []}
|
return {"metadata": {"title": "Empty Document"}, "sections": []}
|
||||||
|
|
||||||
# Build extraction options WITHOUT chunking parameters
|
# Build extraction options using Pydantic model
|
||||||
extractionOptions: Dict[str, Any] = {
|
mergeStrategy = MergeStrategy(
|
||||||
"prompt": prompt,
|
useIntelligentMerging=True,
|
||||||
"operationType": options.operationType if options else OperationTypeEnum.DATA_EXTRACT,
|
prompt=prompt,
|
||||||
"processDocumentsIndividually": True,
|
groupBy="typeGroup",
|
||||||
"mergeStrategy": {
|
orderBy="id",
|
||||||
"useIntelligentMerging": True,
|
mergeType="concatenate"
|
||||||
"prompt": prompt,
|
)
|
||||||
"groupBy": "typeGroup",
|
|
||||||
"orderBy": "id",
|
|
||||||
"mergeType": "concatenate"
|
|
||||||
},
|
|
||||||
}
|
|
||||||
|
|
||||||
logger.debug(f"Per-chunk extraction options (JSON mode): prompt length={len(extractionOptions.get('prompt', ''))} chars, operationType={extractionOptions.get('operationType')}")
|
extractionOptions = ExtractionOptions(
|
||||||
|
prompt=prompt,
|
||||||
|
operationType=options.operationType if options else OperationTypeEnum.DATA_EXTRACT,
|
||||||
|
processDocumentsIndividually=True,
|
||||||
|
mergeStrategy=mergeStrategy
|
||||||
|
)
|
||||||
|
|
||||||
|
logger.debug(f"Per-chunk extraction options (JSON mode): prompt length={len(extractionOptions.prompt)} chars, operationType={extractionOptions.operationType}")
|
||||||
|
|
||||||
try:
|
try:
|
||||||
# Extract content WITHOUT chunking
|
# Extract content WITHOUT chunking
|
||||||
|
|
@ -205,31 +195,25 @@ class SubDocumentProcessing:
|
||||||
if not documents:
|
if not documents:
|
||||||
return {"metadata": {"title": "Empty Document"}, "sections": []}
|
return {"metadata": {"title": "Empty Document"}, "sections": []}
|
||||||
|
|
||||||
# Get model capabilities for size calculation
|
# Build extraction options using Pydantic model (model-aware chunking in AI call phase)
|
||||||
model_capabilities = self._getModelCapabilitiesForContent(custom_prompt, documents, options)
|
mergeStrategy = MergeStrategy(
|
||||||
|
useIntelligentMerging=True,
|
||||||
|
prompt=custom_prompt,
|
||||||
|
groupBy="typeGroup",
|
||||||
|
orderBy="id",
|
||||||
|
mergeType="concatenate"
|
||||||
|
)
|
||||||
|
|
||||||
# Build extraction options for chunking with intelligent merging
|
extractionOptions = ExtractionOptions(
|
||||||
extractionOptions: Dict[str, Any] = {
|
prompt=custom_prompt, # Use the custom prompt instead of default
|
||||||
"prompt": custom_prompt, # Use the custom prompt instead of default
|
operationType=options.operationType if options else OperationTypeEnum.DATA_EXTRACT,
|
||||||
"operationType": options.operationType if options else OperationTypeEnum.DATA_EXTRACT,
|
processDocumentsIndividually=True, # Process each document separately
|
||||||
"processDocumentsIndividually": True, # Process each document separately
|
imageMaxPixels=1024 * 1024,
|
||||||
"maxSize": model_capabilities["maxContextBytes"],
|
imageQuality=85,
|
||||||
"chunkAllowed": True,
|
mergeStrategy=mergeStrategy
|
||||||
"textChunkSize": model_capabilities["textChunkSize"],
|
)
|
||||||
"imageChunkSize": model_capabilities["imageChunkSize"],
|
|
||||||
"imageMaxPixels": 1024 * 1024,
|
|
||||||
"imageQuality": 85,
|
|
||||||
"mergeStrategy": {
|
|
||||||
"useIntelligentMerging": True, # Enable intelligent token-aware merging
|
|
||||||
"capabilities": model_capabilities,
|
|
||||||
"prompt": custom_prompt, # Use the custom prompt
|
|
||||||
"groupBy": "typeGroup",
|
|
||||||
"orderBy": "id",
|
|
||||||
"mergeType": "concatenate"
|
|
||||||
},
|
|
||||||
}
|
|
||||||
|
|
||||||
logger.debug(f"Per-chunk extraction options (JSON mode): prompt length={len(extractionOptions.get('prompt', ''))} chars, operationType={extractionOptions.get('operationType')}")
|
logger.debug(f"Per-chunk extraction options (JSON mode): prompt length={len(extractionOptions.prompt)} chars, operationType={extractionOptions.operationType}")
|
||||||
|
|
||||||
try:
|
try:
|
||||||
# Extract content with chunking
|
# Extract content with chunking
|
||||||
|
|
@ -1042,15 +1026,13 @@ CONTINUATION INSTRUCTIONS:
|
||||||
content_parts.append(content_part)
|
content_parts.append(content_part)
|
||||||
|
|
||||||
# Use existing merging strategy from options
|
# Use existing merging strategy from options
|
||||||
merge_strategy = {
|
merge_strategy = MergeStrategy(
|
||||||
"useIntelligentMerging": True,
|
useIntelligentMerging=True,
|
||||||
"groupBy": "documentId", # Group by document
|
groupBy="documentId", # Group by document
|
||||||
"orderBy": "partIndex", # Order by part index
|
orderBy="partIndex", # Order by part index
|
||||||
"mergeType": "concatenate"
|
mergeType="concatenate"
|
||||||
}
|
)
|
||||||
|
|
||||||
if options and hasattr(options, 'mergeStrategy'):
|
|
||||||
merge_strategy.update(options.mergeStrategy)
|
|
||||||
|
|
||||||
# Apply existing merging logic using the sophisticated merging system
|
# Apply existing merging logic using the sophisticated merging system
|
||||||
from modules.services.serviceExtraction.subPipeline import _applyMerging
|
from modules.services.serviceExtraction.subPipeline import _applyMerging
|
||||||
|
|
@ -1095,15 +1077,13 @@ CONTINUATION INSTRUCTIONS:
|
||||||
content_parts.append(content_part)
|
content_parts.append(content_part)
|
||||||
|
|
||||||
# Use existing merging strategy for JSON mode
|
# Use existing merging strategy for JSON mode
|
||||||
merge_strategy = {
|
merge_strategy = MergeStrategy(
|
||||||
"useIntelligentMerging": True,
|
useIntelligentMerging=True,
|
||||||
"groupBy": "documentId", # Group by document
|
groupBy="documentId", # Group by document
|
||||||
"orderBy": "partIndex", # Order by part index
|
orderBy="partIndex", # Order by part index
|
||||||
"mergeType": "concatenate"
|
mergeType="concatenate"
|
||||||
}
|
)
|
||||||
|
|
||||||
if options and hasattr(options, 'mergeStrategy'):
|
|
||||||
merge_strategy.update(options.mergeStrategy)
|
|
||||||
|
|
||||||
# Apply existing merging logic using the sophisticated merging system
|
# Apply existing merging logic using the sophisticated merging system
|
||||||
from modules.services.serviceExtraction.subPipeline import _applyMerging
|
from modules.services.serviceExtraction.subPipeline import _applyMerging
|
||||||
|
|
@ -1234,15 +1214,13 @@ CONTINUATION INSTRUCTIONS:
|
||||||
content_parts.append(content_part)
|
content_parts.append(content_part)
|
||||||
|
|
||||||
# Use existing merging strategy from options
|
# Use existing merging strategy from options
|
||||||
merge_strategy = {
|
merge_strategy = MergeStrategy(
|
||||||
"useIntelligentMerging": True,
|
useIntelligentMerging=True,
|
||||||
"groupBy": "documentId", # Group by document
|
groupBy="documentId", # Group by document
|
||||||
"orderBy": "chunkIndex", # Order by chunk index
|
orderBy="chunkIndex", # Order by chunk index
|
||||||
"mergeType": "concatenate"
|
mergeType="concatenate"
|
||||||
}
|
)
|
||||||
|
|
||||||
if options and hasattr(options, 'mergeStrategy'):
|
|
||||||
merge_strategy.update(options.mergeStrategy)
|
|
||||||
|
|
||||||
# Apply existing merging logic using the sophisticated merging system
|
# Apply existing merging logic using the sophisticated merging system
|
||||||
from modules.services.serviceExtraction.subPipeline import _applyMerging
|
from modules.services.serviceExtraction.subPipeline import _applyMerging
|
||||||
|
|
@ -1297,15 +1275,13 @@ CONTINUATION INSTRUCTIONS:
|
||||||
content_parts.append(content_part)
|
content_parts.append(content_part)
|
||||||
|
|
||||||
# Use existing merging strategy for clean mode
|
# Use existing merging strategy for clean mode
|
||||||
merge_strategy = {
|
merge_strategy = MergeStrategy(
|
||||||
"useIntelligentMerging": True,
|
useIntelligentMerging=True,
|
||||||
"groupBy": "documentId", # Group by document
|
groupBy="documentId", # Group by document
|
||||||
"orderBy": "chunkIndex", # Order by chunk index
|
orderBy="chunkIndex", # Order by chunk index
|
||||||
"mergeType": "concatenate"
|
mergeType="concatenate"
|
||||||
}
|
)
|
||||||
|
|
||||||
if options and hasattr(options, 'mergeStrategy'):
|
|
||||||
merge_strategy.update(options.mergeStrategy)
|
|
||||||
|
|
||||||
# Apply existing merging logic using the sophisticated merging system
|
# Apply existing merging logic using the sophisticated merging system
|
||||||
from modules.services.serviceExtraction.subPipeline import _applyMerging
|
from modules.services.serviceExtraction.subPipeline import _applyMerging
|
||||||
|
|
@ -1351,15 +1327,13 @@ CONTINUATION INSTRUCTIONS:
|
||||||
content_parts.append(content_part)
|
content_parts.append(content_part)
|
||||||
|
|
||||||
# Use existing merging strategy for JSON mode
|
# Use existing merging strategy for JSON mode
|
||||||
merge_strategy = {
|
merge_strategy = MergeStrategy(
|
||||||
"useIntelligentMerging": True,
|
useIntelligentMerging=True,
|
||||||
"groupBy": "documentId", # Group by document
|
groupBy="documentId", # Group by document
|
||||||
"orderBy": "chunkIndex", # Order by chunk index
|
orderBy="chunkIndex", # Order by chunk index
|
||||||
"mergeType": "concatenate"
|
mergeType="concatenate"
|
||||||
}
|
)
|
||||||
|
|
||||||
if options and hasattr(options, 'mergeStrategy'):
|
|
||||||
merge_strategy.update(options.mergeStrategy)
|
|
||||||
|
|
||||||
# Apply existing merging logic using the sophisticated merging system
|
# Apply existing merging logic using the sophisticated merging system
|
||||||
from modules.services.serviceExtraction.subPipeline import _applyMerging
|
from modules.services.serviceExtraction.subPipeline import _applyMerging
|
||||||
|
|
@ -1455,5 +1429,3 @@ CONTINUATION INSTRUCTIONS:
|
||||||
|
|
||||||
logger.info(f"Merged {len(chunkResults)} chunks using existing sophisticated merging system (JSON mode)")
|
logger.info(f"Merged {len(chunkResults)} chunks using existing sophisticated merging system (JSON mode)")
|
||||||
return merged_document
|
return merged_document
|
||||||
|
|
||||||
# REMOVED: _getModelCapabilitiesForContent method - no longer needed with model-aware chunking
|
|
||||||
|
|
|
||||||
|
|
@ -5,7 +5,7 @@ import time
|
||||||
|
|
||||||
from .subRegistry import ExtractorRegistry, ChunkerRegistry
|
from .subRegistry import ExtractorRegistry, ChunkerRegistry
|
||||||
from .subPipeline import runExtraction
|
from .subPipeline import runExtraction
|
||||||
from modules.datamodels.datamodelExtraction import ContentExtracted, ContentPart, MergeStrategy
|
from modules.datamodels.datamodelExtraction import ContentExtracted, ContentPart, MergeStrategy, ExtractionOptions
|
||||||
from modules.datamodels.datamodelChat import ChatDocument
|
from modules.datamodels.datamodelChat import ChatDocument
|
||||||
from modules.datamodels.datamodelAi import AiCallResponse
|
from modules.datamodels.datamodelAi import AiCallResponse
|
||||||
from modules.aicore.aicoreModelRegistry import modelRegistry
|
from modules.aicore.aicoreModelRegistry import modelRegistry
|
||||||
|
|
@ -20,7 +20,7 @@ class ExtractionService:
|
||||||
self._extractorRegistry = ExtractorRegistry()
|
self._extractorRegistry = ExtractorRegistry()
|
||||||
self._chunkerRegistry = ChunkerRegistry()
|
self._chunkerRegistry = ChunkerRegistry()
|
||||||
|
|
||||||
def extractContent(self, documents: List[ChatDocument], options: Dict[str, Any]) -> List[ContentExtracted]:
|
def extractContent(self, documents: List[ChatDocument], options: ExtractionOptions) -> List[ContentExtracted]:
|
||||||
"""
|
"""
|
||||||
Extract content from a list of ChatDocument objects.
|
Extract content from a list of ChatDocument objects.
|
||||||
|
|
||||||
|
|
@ -31,6 +31,7 @@ class ExtractionService:
|
||||||
Returns:
|
Returns:
|
||||||
List of ContentExtracted objects, one per input document
|
List of ContentExtracted objects, one per input document
|
||||||
"""
|
"""
|
||||||
|
|
||||||
results: List[ContentExtracted] = []
|
results: List[ContentExtracted] = []
|
||||||
|
|
||||||
# Lazy import to avoid circular deps and heavy init at module import
|
# Lazy import to avoid circular deps and heavy init at module import
|
||||||
|
|
|
||||||
|
|
@ -1,9 +1,9 @@
|
||||||
from typing import Any, Dict, List
|
from typing import Any, Dict, List
|
||||||
from modules.datamodels.datamodelExtraction import ContentPart
|
from modules.datamodels.datamodelExtraction import ContentPart, MergeStrategy
|
||||||
|
|
||||||
|
|
||||||
class DefaultMerger:
|
class DefaultMerger:
|
||||||
def merge(self, parts: List[ContentPart], strategy: Dict[str, Any]) -> List[ContentPart]:
|
def merge(self, parts: List[ContentPart], strategy: MergeStrategy) -> List[ContentPart]:
|
||||||
"""
|
"""
|
||||||
Default merger that passes through parts unchanged.
|
Default merger that passes through parts unchanged.
|
||||||
Used for image, binary, metadata, container typeGroups.
|
Used for image, binary, metadata, container typeGroups.
|
||||||
|
|
|
||||||
|
|
@ -1,10 +1,10 @@
|
||||||
from typing import Any, Dict, List
|
from typing import Any, Dict, List
|
||||||
from modules.datamodels.datamodelExtraction import ContentPart
|
from modules.datamodels.datamodelExtraction import ContentPart, MergeStrategy
|
||||||
from ..subUtils import makeId
|
from ..subUtils import makeId
|
||||||
|
|
||||||
|
|
||||||
class TableMerger:
|
class TableMerger:
|
||||||
def merge(self, parts: List[ContentPart], strategy: Dict[str, Any]) -> List[ContentPart]:
|
def merge(self, parts: List[ContentPart], strategy: MergeStrategy) -> List[ContentPart]:
|
||||||
"""
|
"""
|
||||||
Merge table parts based on strategy.
|
Merge table parts based on strategy.
|
||||||
Strategy options:
|
Strategy options:
|
||||||
|
|
@ -15,9 +15,9 @@ class TableMerger:
|
||||||
if not parts:
|
if not parts:
|
||||||
return parts
|
return parts
|
||||||
|
|
||||||
groupBy = strategy.get("groupBy", "parentId")
|
groupBy = strategy.groupBy
|
||||||
maxSize = strategy.get("maxSize", 0)
|
maxSize = strategy.maxSize or 0
|
||||||
combineSheets = strategy.get("combineSheets", False)
|
combineSheets = strategy.tableMerge.get("combineSheets", False) if strategy.tableMerge else False
|
||||||
|
|
||||||
# Group parts
|
# Group parts
|
||||||
groups = self._groupParts(parts, groupBy, combineSheets)
|
groups = self._groupParts(parts, groupBy, combineSheets)
|
||||||
|
|
|
||||||
|
|
@ -1,10 +1,10 @@
|
||||||
from typing import Any, Dict, List
|
from typing import Any, Dict, List
|
||||||
from modules.datamodels.datamodelExtraction import ContentPart
|
from modules.datamodels.datamodelExtraction import ContentPart, MergeStrategy
|
||||||
from ..subUtils import makeId
|
from ..subUtils import makeId
|
||||||
|
|
||||||
|
|
||||||
class TextMerger:
|
class TextMerger:
|
||||||
def merge(self, parts: List[ContentPart], strategy: Dict[str, Any]) -> List[ContentPart]:
|
def merge(self, parts: List[ContentPart], strategy: MergeStrategy) -> List[ContentPart]:
|
||||||
"""
|
"""
|
||||||
Merge text parts based on strategy.
|
Merge text parts based on strategy.
|
||||||
Strategy options:
|
Strategy options:
|
||||||
|
|
@ -15,9 +15,9 @@ class TextMerger:
|
||||||
if not parts:
|
if not parts:
|
||||||
return parts
|
return parts
|
||||||
|
|
||||||
groupBy = strategy.get("groupBy", "parentId")
|
groupBy = strategy.groupBy
|
||||||
orderBy = strategy.get("orderBy", "label")
|
orderBy = strategy.orderBy
|
||||||
maxSize = strategy.get("maxSize", 0)
|
maxSize = strategy.maxSize or 0
|
||||||
|
|
||||||
# Group parts
|
# Group parts
|
||||||
groups = self._groupParts(parts, groupBy)
|
groups = self._groupParts(parts, groupBy)
|
||||||
|
|
|
||||||
|
|
@ -1,8 +1,7 @@
|
||||||
from typing import Any, Dict, List
|
from typing import List
|
||||||
import logging
|
import logging
|
||||||
import os
|
|
||||||
|
|
||||||
from modules.datamodels.datamodelExtraction import ContentExtracted, ContentPart
|
from modules.datamodels.datamodelExtraction import ContentExtracted, ContentPart, ExtractionOptions, MergeStrategy
|
||||||
from .subUtils import makeId
|
from .subUtils import makeId
|
||||||
from .subRegistry import ExtractorRegistry, ChunkerRegistry
|
from .subRegistry import ExtractorRegistry, ChunkerRegistry
|
||||||
from .merging.mergerText import TextMerger
|
from .merging.mergerText import TextMerger
|
||||||
|
|
@ -13,13 +12,13 @@ from .subMerger import IntelligentTokenAwareMerger
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
def _mergeParts(parts: List[ContentPart], mergeStrategy: Dict[str, Any]) -> List[ContentPart]:
|
def _mergeParts(parts: List[ContentPart], mergeStrategy: MergeStrategy) -> List[ContentPart]:
|
||||||
"""Merge parts based on the provided strategy."""
|
"""Merge parts based on the provided strategy."""
|
||||||
if not parts or not mergeStrategy:
|
if not parts or not mergeStrategy:
|
||||||
return parts
|
return parts
|
||||||
|
|
||||||
groupBy = mergeStrategy.get("groupBy", "typeGroup")
|
groupBy = mergeStrategy.groupBy
|
||||||
orderBy = mergeStrategy.get("orderBy", "id")
|
orderBy = mergeStrategy.orderBy
|
||||||
|
|
||||||
# Group parts by the specified field
|
# Group parts by the specified field
|
||||||
groups = {}
|
groups = {}
|
||||||
|
|
@ -56,7 +55,8 @@ def _mergeParts(parts: List[ContentPart], mergeStrategy: Dict[str, Any]) -> List
|
||||||
return merged_parts
|
return merged_parts
|
||||||
|
|
||||||
|
|
||||||
def runExtraction(extractorRegistry: ExtractorRegistry, chunkerRegistry: ChunkerRegistry, documentBytes: bytes, fileName: str, mimeType: str, options: Dict[str, Any]) -> ContentExtracted:
|
def runExtraction(extractorRegistry: ExtractorRegistry, chunkerRegistry: ChunkerRegistry, documentBytes: bytes, fileName: str, mimeType: str, options: ExtractionOptions) -> ContentExtracted:
|
||||||
|
|
||||||
extractor = extractorRegistry.resolve(mimeType, fileName)
|
extractor = extractorRegistry.resolve(mimeType, fileName)
|
||||||
if extractor is None:
|
if extractor is None:
|
||||||
# fallback: single binary part
|
# fallback: single binary part
|
||||||
|
|
@ -71,15 +71,14 @@ def runExtraction(extractorRegistry: ExtractorRegistry, chunkerRegistry: Chunker
|
||||||
)
|
)
|
||||||
return ContentExtracted(id=makeId(), parts=[part])
|
return ContentExtracted(id=makeId(), parts=[part])
|
||||||
|
|
||||||
parts = extractor.extract(documentBytes, {"fileName": fileName, "mimeType": mimeType, "options": options})
|
parts = extractor.extract(documentBytes, {"fileName": fileName, "mimeType": mimeType})
|
||||||
|
|
||||||
# REMOVED: poolAndLimit(parts, chunkerRegistry, options)
|
# REMOVED: poolAndLimit(parts, chunkerRegistry, options)
|
||||||
# REMOVED: Chunking logic - now handled in AI call phase
|
# REMOVED: Chunking logic - now handled in AI call phase
|
||||||
|
|
||||||
# Apply merging strategy if provided (preserve existing logic)
|
# Apply merging strategy if provided (preserve existing logic)
|
||||||
mergeStrategy = options.get("mergeStrategy", {})
|
if options.mergeStrategy:
|
||||||
if mergeStrategy:
|
parts = _applyMerging(parts, options.mergeStrategy)
|
||||||
parts = _applyMerging(parts, mergeStrategy)
|
|
||||||
|
|
||||||
return ContentExtracted(id=makeId(), parts=parts)
|
return ContentExtracted(id=makeId(), parts=parts)
|
||||||
|
|
||||||
|
|
@ -87,17 +86,17 @@ def runExtraction(extractorRegistry: ExtractorRegistry, chunkerRegistry: Chunker
|
||||||
# REMOVED: poolAndLimit function - chunking now handled in AI call phase
|
# REMOVED: poolAndLimit function - chunking now handled in AI call phase
|
||||||
|
|
||||||
|
|
||||||
def _applyMerging(parts: List[ContentPart], strategy: Dict[str, Any]) -> List[ContentPart]:
|
def _applyMerging(parts: List[ContentPart], strategy: MergeStrategy) -> List[ContentPart]:
|
||||||
"""Apply merging strategy to parts with intelligent token-aware merging."""
|
"""Apply merging strategy to parts with intelligent token-aware merging."""
|
||||||
logger.debug(f"_applyMerging called with {len(parts)} parts")
|
logger.debug(f"_applyMerging called with {len(parts)} parts")
|
||||||
|
|
||||||
# Check if intelligent merging is enabled
|
# Check if intelligent merging is enabled
|
||||||
if strategy.get("useIntelligentMerging", False):
|
if strategy.useIntelligentMerging:
|
||||||
model_capabilities = strategy.get("capabilities", {})
|
model_capabilities = strategy.capabilities or {}
|
||||||
subMerger = IntelligentTokenAwareMerger(model_capabilities)
|
subMerger = IntelligentTokenAwareMerger(model_capabilities)
|
||||||
|
|
||||||
# Use intelligent merging for all parts
|
# Use intelligent merging for all parts
|
||||||
merged = subMerger.merge_chunks_intelligently(parts, strategy.get("prompt", ""))
|
merged = subMerger.merge_chunks_intelligently(parts, strategy.prompt or "")
|
||||||
|
|
||||||
# Calculate and log optimization stats
|
# Calculate and log optimization stats
|
||||||
stats = subMerger.calculate_optimization_stats(parts, merged)
|
stats = subMerger.calculate_optimization_stats(parts, merged)
|
||||||
|
|
|
||||||
|
|
@ -90,7 +90,7 @@ class ModelSelectionTester:
|
||||||
totalScore = sizeRating + processingModeRating + priorityRating
|
totalScore = sizeRating + processingModeRating + priorityRating
|
||||||
|
|
||||||
print(
|
print(
|
||||||
f" {idx:>2}. {m.name} | Q={getattr(m, 'qualityRating', 0)} | S={getattr(m, 'speedRating', 0)} | ${costIn:.4f} | ctx={getattr(m, 'contextLength', 0)} | score={totalScore:.3f}"
|
f" {idx:>2}. {m.displayName} | Q={getattr(m, 'qualityRating', 0)} | S={getattr(m, 'speedRating', 0)} | ${costIn:.4f} | ctx={getattr(m, 'contextLength', 0)} | score={totalScore:.3f}"
|
||||||
)
|
)
|
||||||
print(f" Size: {sizeRating:.3f}, ProcessingMode: {processingModeRating:.3f}, Priority: {priorityRating:.3f}")
|
print(f" Size: {sizeRating:.3f}, ProcessingMode: {processingModeRating:.3f}, Priority: {priorityRating:.3f}")
|
||||||
|
|
||||||
|
|
@ -136,7 +136,7 @@ class ModelSelectionTester:
|
||||||
totalScore = sizeRating + processingModeRating + priorityRating
|
totalScore = sizeRating + processingModeRating + priorityRating
|
||||||
|
|
||||||
print(
|
print(
|
||||||
f" {idx:>2}. {m.name} | Q={getattr(m, 'qualityRating', 0)} | S={getattr(m, 'speedRating', 0)} | ${costIn:.4f} | ctx={getattr(m, 'contextLength', 0)} | score={totalScore:.3f}"
|
f" {idx:>2}. {m.displayName} | Q={getattr(m, 'qualityRating', 0)} | S={getattr(m, 'speedRating', 0)} | ${costIn:.4f} | ctx={getattr(m, 'contextLength', 0)} | score={totalScore:.3f}"
|
||||||
)
|
)
|
||||||
print(f" Size: {sizeRating:.3f}, ProcessingMode: {processingModeRating:.3f}, Priority: {priorityRating:.3f}")
|
print(f" Size: {sizeRating:.3f}, ProcessingMode: {processingModeRating:.3f}, Priority: {priorityRating:.3f}")
|
||||||
|
|
||||||
|
|
@ -365,8 +365,8 @@ class ModelSelectionTester:
|
||||||
)
|
)
|
||||||
|
|
||||||
if failoverModelList:
|
if failoverModelList:
|
||||||
print(f" Selected model: {failoverModelList[0].name}")
|
print(f" Selected model: {failoverModelList[0].displayName}")
|
||||||
print(f" Fallback models: {[m.name for m in failoverModelList[1:3]]}")
|
print(f" Fallback models: {[m.displayName for m in failoverModelList[1:3]]}")
|
||||||
else:
|
else:
|
||||||
print(" No suitable models found")
|
print(" No suitable models found")
|
||||||
|
|
||||||
|
|
@ -393,8 +393,8 @@ class ModelSelectionTester:
|
||||||
)
|
)
|
||||||
|
|
||||||
if failoverModelList:
|
if failoverModelList:
|
||||||
print(f" Selected model: {failoverModelList[0].name}")
|
print(f" Selected model: {failoverModelList[0].displayName}")
|
||||||
print(f" Fallback models: {[m.name for m in failoverModelList[1:3]]}")
|
print(f" Fallback models: {[m.displayName for m in failoverModelList[1:3]]}")
|
||||||
else:
|
else:
|
||||||
print(" No suitable models found")
|
print(" No suitable models found")
|
||||||
|
|
||||||
|
|
@ -421,8 +421,8 @@ class ModelSelectionTester:
|
||||||
)
|
)
|
||||||
|
|
||||||
if failoverModelList:
|
if failoverModelList:
|
||||||
print(f" Selected model: {failoverModelList[0].name}")
|
print(f" Selected model: {failoverModelList[0].displayName}")
|
||||||
print(f" Fallback models: {[m.name for m in failoverModelList[1:3]]}")
|
print(f" Fallback models: {[m.displayName for m in failoverModelList[1:3]]}")
|
||||||
else:
|
else:
|
||||||
print(" No suitable models found")
|
print(" No suitable models found")
|
||||||
|
|
||||||
|
|
@ -449,8 +449,8 @@ class ModelSelectionTester:
|
||||||
)
|
)
|
||||||
|
|
||||||
if failoverModelList:
|
if failoverModelList:
|
||||||
print(f" Selected model: {failoverModelList[0].name}")
|
print(f" Selected model: {failoverModelList[0].displayName}")
|
||||||
print(f" Fallback models: {[m.name for m in failoverModelList[1:3]]}")
|
print(f" Fallback models: {[m.displayName for m in failoverModelList[1:3]]}")
|
||||||
else:
|
else:
|
||||||
print(" No suitable models found")
|
print(" No suitable models found")
|
||||||
|
|
||||||
|
|
@ -479,15 +479,15 @@ class ModelSelectionTester:
|
||||||
print(f" {connector_type}: {len(models)} models")
|
print(f" {connector_type}: {len(models)} models")
|
||||||
for model in models:
|
for model in models:
|
||||||
capabilities = getattr(model, 'capabilities', [])
|
capabilities = getattr(model, 'capabilities', [])
|
||||||
print(f" - {model.name}: {capabilities}")
|
print(f" - {model.displayName}: {capabilities}")
|
||||||
|
|
||||||
# Show operation type support
|
# Show operation type support
|
||||||
print(f"\nOperation type support:")
|
print(f"\nOperation type support:")
|
||||||
for op_type in OperationTypeEnum:
|
for op_type in OperationTypeEnum:
|
||||||
supported_models = [m for m in availableModels if hasattr(m, 'operationTypes') and op_type in m.operationTypes]
|
supported_models = [m for m in availableModels if hasattr(m, 'operationTypes') and any(ot.operationType == op_type for ot in m.operationTypes)]
|
||||||
print(f" {op_type.name}: {len(supported_models)} models")
|
print(f" {op_type.name}: {len(supported_models)} models")
|
||||||
if supported_models:
|
if supported_models:
|
||||||
model_names = [m.name for m in supported_models[:3]] # Show first 3 models
|
model_names = [m.displayName for m in supported_models[:3]] # Show first 3 models
|
||||||
print(f" Models: {', '.join(model_names)}")
|
print(f" Models: {', '.join(model_names)}")
|
||||||
|
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue