gateway/modules/services/serviceExtraction/mainServiceExtraction.py
2025-12-28 23:34:32 +01:00

1360 lines
66 KiB
Python

# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
from typing import Any, Dict, List, Optional, Union
import uuid
import logging
import time
import asyncio
import base64
import json
from .subRegistry import ExtractorRegistry, ChunkerRegistry
from .subPipeline import runExtraction
from modules.datamodels.datamodelExtraction import ContentExtracted, ContentPart, MergeStrategy, ExtractionOptions, PartResult, DocumentIntent
from modules.datamodels.datamodelChat import ChatDocument
from modules.datamodels.datamodelAi import AiCallResponse, AiCallRequest, AiCallOptions, OperationTypeEnum, AiModelCall
from modules.aicore.aicoreModelRegistry import modelRegistry
from modules.aicore.aicoreModelSelector import modelSelector
logger = logging.getLogger(__name__)
class ExtractionService:
def __init__(self, services: Optional[Any] = None):
self.services = services
self._extractorRegistry = ExtractorRegistry()
self._chunkerRegistry = ChunkerRegistry()
# Ensure connectors are registered
discovered = modelRegistry.discoverConnectors()
for connector in discovered:
modelRegistry.registerConnector(connector)
# Verify required internal model is available (used for pricing in extractContent)
modelDisplayName = "Internal Document Extractor"
model = modelRegistry.getModel(modelDisplayName)
if model is None or model.calculatePriceUsd is None:
raise RuntimeError(f"FATAL: Required internal model '{modelDisplayName}' is not available. Check connector registration.")
def extractContent(
self,
documents: List[ChatDocument],
options: ExtractionOptions,
operationId: Optional[str] = None,
parentOperationId: Optional[str] = None
) -> List[ContentExtracted]:
"""
Extract content from a list of ChatDocument objects.
Args:
documents: List of ChatDocument objects to extract content from
options: Extraction options including maxSize, chunkAllowed, mergeStrategy, etc.
operationId: Optional operation ID for progress logging (parent operation)
parentOperationId: Optional parent operation ID for hierarchical logging
Returns:
List of ContentExtracted objects, one per input document
"""
results: List[ContentExtracted] = []
# Lazy import to avoid circular deps and heavy init at module import
from modules.interfaces.interfaceDbComponentObjects import getInterface
dbInterface = getInterface()
totalDocs = len(documents)
for i, doc in enumerate(documents):
logger.info(f"=== DOCUMENT {i + 1}/{totalDocs}: {doc.fileName} ===")
logger.info(f"Initial MIME type: {doc.mimeType}")
# Create child operation for this document if parent operationId is provided
docOperationId = None
if operationId:
workflowId = self.services.workflow.id if self.services.workflow else f"no-workflow-{int(time.time())}"
docOperationId = f"{operationId}_doc_{i}"
# Use parentOperationId if provided, otherwise use operationId as parent
parentId = parentOperationId if parentOperationId else operationId
self.services.chat.progressLogStart(
docOperationId,
"Extracting Document",
f"Document {i + 1}/{totalDocs}",
doc.fileName[:50] + "..." if len(doc.fileName) > 50 else doc.fileName,
parentOperationId=parentId # Correct parent reference for ChatLog hierarchy
)
# Start timing for this document
startTime = time.time()
try:
if docOperationId:
self.services.chat.progressLogUpdate(docOperationId, 0.1, "Loading document data")
# Resolve raw bytes for this document using interface
documentBytes = dbInterface.getFileData(doc.fileId)
if not documentBytes:
if docOperationId:
self.services.chat.progressLogFinish(docOperationId, False)
raise ValueError(f"No file data found for fileId={doc.fileId}")
if docOperationId:
self.services.chat.progressLogUpdate(docOperationId, 0.2, "Running extraction pipeline")
# Convert ChatDocument to the format expected by runExtraction
documentData = {
"id": doc.id,
"bytes": documentBytes,
"fileName": doc.fileName,
"mimeType": doc.mimeType
}
ec = runExtraction(
extractorRegistry=self._extractorRegistry,
chunkerRegistry=self._chunkerRegistry,
documentBytes=documentData["bytes"],
fileName=documentData["fileName"],
mimeType=documentData["mimeType"],
options=options
)
if docOperationId:
self.services.chat.progressLogUpdate(docOperationId, 0.7, f"Extracted {len(ec.parts)} parts")
# Log content parts metadata
logger.debug(f"Content parts: {len(ec.parts)}")
for j, part in enumerate(ec.parts):
logger.debug(f" Part {j + 1}/{len(ec.parts)}: {part.typeGroup} ({part.mimeType}) - {len(part.data) if part.data else 0} chars")
if part.metadata:
logger.debug(f" Metadata: {part.metadata}")
# Attach complete metadata to parts according to ContentPart Metadaten-Schema
for p in ec.parts:
# Ensure metadata dict exists
if not p.metadata:
p.metadata = {}
# Required metadata fields (from concept)
if "documentId" not in p.metadata:
p.metadata["documentId"] = documentData["id"] or str(uuid.uuid4())
if "documentMimeType" not in p.metadata:
p.metadata["documentMimeType"] = documentData["mimeType"]
if "originalFileName" not in p.metadata:
p.metadata["originalFileName"] = documentData["fileName"]
# ContentFormat: Set based on typeGroup and mimeType
# Default to "extracted" for text content, but can be overridden by caller
if "contentFormat" not in p.metadata:
# Default: extracted text content
p.metadata["contentFormat"] = "extracted"
# Intent: Default to "extract" for extracted content
if "intent" not in p.metadata:
p.metadata["intent"] = "extract"
# ExtractionPrompt: Use from options if available
if "extractionPrompt" not in p.metadata and options and options.prompt:
p.metadata["extractionPrompt"] = options.prompt
# UsageHint: Provide default hint
if "usageHint" not in p.metadata:
p.metadata["usageHint"] = f"Use extracted content from {documentData['fileName']}"
# SourceAction: Mark as from extraction service
if "sourceAction" not in p.metadata:
p.metadata["sourceAction"] = "extraction.extractContent"
# Log chunking information
chunkedParts = [p for p in ec.parts if p.metadata.get("chunk", False)]
if chunkedParts:
logger.debug(f"=== CHUNKING RESULTS ===")
logger.debug(f"Total parts: {len(ec.parts)}")
logger.debug(f"Chunked parts: {len(chunkedParts)}")
for chunk in chunkedParts:
logger.debug(f" Chunk: {chunk.label} - {len(chunk.data)} chars (parent: {chunk.parentId})")
else:
logger.debug(f"No chunking needed - {len(ec.parts)} parts fit within size limits")
if docOperationId:
self.services.chat.progressLogUpdate(docOperationId, 0.9, f"Processing complete: {len(ec.parts)} parts extracted")
# Calculate timing and emit stats
endTime = time.time()
processingTime = endTime - startTime
bytesSent = len(documentBytes)
bytesReceived = sum(len(part.data) if part.data else 0 for part in ec.parts)
# Emit stats for extraction operation
# Use internal extraction model for pricing
modelDisplayName = "Internal Document Extractor"
model = modelRegistry.getModel(modelDisplayName)
# Hard fail if model is missing; caller must ensure connectors are registered
if model is None or model.calculatePriceUsd is None:
if docOperationId:
self.services.chat.progressLogFinish(docOperationId, False)
raise RuntimeError(f"Pricing model not available: {modelDisplayName}")
priceUsd = model.calculatePriceUsd(processingTime, bytesSent, bytesReceived)
# Create AiCallResponse with real calculation
# Use model.name for the response (API identifier), not displayName
aiResponse = AiCallResponse(
content="", # No content for extraction stats needed
modelName=model.name,
priceUsd=priceUsd,
processingTime=processingTime,
bytesSent=bytesSent,
bytesReceived=bytesReceived,
errorCount=0
)
self.services.chat.storeWorkflowStat(
self.services.workflow,
aiResponse,
f"extraction.process.{doc.mimeType}"
)
# Write extraction results to debug file
try:
from modules.shared.debugLogger import writeDebugFile
# json is already imported at module level
# Create summary of extraction results for debug
extractionSummary = {
"documentName": doc.fileName,
"documentMimeType": doc.mimeType,
"partsCount": len(ec.parts),
"parts": []
}
for part in ec.parts:
partSummary = {
"typeGroup": part.typeGroup,
"mimeType": part.mimeType,
"label": part.label,
"dataLength": len(part.data) if part.data else 0,
"metadata": part.metadata
}
# Include data preview for small parts (first 500 chars)
if part.data and len(part.data) <= 500:
partSummary["dataPreview"] = part.data[:500]
elif part.data:
partSummary["dataPreview"] = f"[Large data: {len(part.data)} chars - truncated]"
extractionSummary["parts"].append(partSummary)
writeDebugFile(json.dumps(extractionSummary, indent=2, ensure_ascii=False), f"extraction_result_{doc.fileName}.txt")
except Exception as e:
logger.debug(f"Failed to write extraction debug file: {str(e)}")
results.append(ec)
# Finish document operation successfully
if docOperationId:
self.services.chat.progressLogFinish(docOperationId, True)
except Exception as e:
logger.error(f"Error extracting content from document {i + 1}/{totalDocs} ({doc.fileName}): {str(e)}")
if docOperationId:
try:
self.services.chat.progressLogFinish(docOperationId, False)
except:
pass # Don't fail on progress logging errors
# Continue with next document instead of failing completely
# This allows parallel processing to continue even if one document fails
continue
return results
def mergeAiResults(
self,
extractedContent: List[ContentExtracted],
aiResults: List[str],
strategy: MergeStrategy
) -> ContentExtracted:
"""
Merge AI results from chunked content back into a single ContentExtracted.
Args:
extractedContent: List of ContentExtracted objects that were processed
aiResults: List of AI response strings, one per chunk
strategy: Merge strategy configuration (dict or MergeStrategy object)
Returns:
Single ContentExtracted with merged AI results
"""
logger.debug(f"=== MERGING AI RESULTS ===")
logger.debug(f"Extracted content: {len(extractedContent)} documents")
logger.debug(f"AI results: {len(aiResults)} responses")
logger.debug(f"Merge strategy: {strategy.mergeType}")
mergeStrategy = strategy
# Collect all parts from all extracted content
allParts: List[ContentPart] = []
for ec in extractedContent:
allParts.extend(ec.parts)
logger.debug(f"Total original parts: {len(allParts)}")
# Create AI result parts
aiResultParts: List[ContentPart] = []
for i, aiResult in enumerate(aiResults):
aiPart = ContentPart(
id=f"ai_result_{i}",
parentId=None, # Will be set based on strategy
label="ai_result",
typeGroup="text",
mimeType="text/plain",
data=aiResult,
metadata={
"aiResult": True,
"order": i,
"size": len(aiResult.encode('utf-8'))
}
)
aiResultParts.append(aiPart)
logger.debug(f"Created {len(aiResultParts)} AI result parts")
# Apply merging strategy
if mergeStrategy.mergeType == "concatenate":
mergedParts = self._mergeConcatenate(allParts, aiResultParts, mergeStrategy)
elif mergeStrategy.mergeType == "hierarchical":
mergedParts = self._mergeHierarchical(allParts, aiResultParts, mergeStrategy)
elif mergeStrategy.mergeType == "intelligent":
mergedParts = self._mergeIntelligent(allParts, aiResultParts, mergeStrategy)
else:
# Default to concatenate
mergedParts = self._mergeConcatenate(allParts, aiResultParts, mergeStrategy)
# Create final ContentExtracted
mergedContent = ContentExtracted(
id=f"merged_{uuid.uuid4()}",
parts=mergedParts
)
logger.debug(f"=== MERGE COMPLETED ===")
logger.debug(f"Final merged parts: {len(mergedParts)}")
logger.debug(f"Merged content ID: {mergedContent.id}")
return mergedContent
def _mergeConcatenate(
self,
originalParts: List[ContentPart],
aiResultParts: List[ContentPart],
strategy: MergeStrategy
) -> List[ContentPart]:
"""Merge parts by simple concatenation."""
mergedParts = []
# Add original parts (filtered if needed)
for part in originalParts:
if strategy.preserveChunks or not part.metadata.get("chunk", False):
mergedParts.append(part)
# Add AI results
if aiResultParts:
# Group AI results by parentId if available
aiResultsByParent = {}
for aiPart in aiResultParts:
parentId = aiPart.parentId or "root"
if parentId not in aiResultsByParent:
aiResultsByParent[parentId] = []
aiResultsByParent[parentId].append(aiPart)
# Merge AI results for each parent
for parentId, aiParts in aiResultsByParent.items():
if len(aiParts) == 1:
mergedParts.append(aiParts[0])
else:
# Concatenate multiple AI results for same parent
combinedData = strategy.chunkSeparator.join([p.data for p in aiParts])
combinedPart = ContentPart(
id=f"merged_ai_{parentId}",
parentId=parentId if parentId != "root" else None,
label="merged_ai_result",
typeGroup="text",
mimeType="text/plain",
data=combinedData,
metadata={
"aiResult": True,
"merged": True,
"sourceCount": len(aiParts),
"size": len(combinedData.encode('utf-8'))
}
)
mergedParts.append(combinedPart)
return mergedParts
def _mergeHierarchical(
self,
originalParts: List[ContentPart],
aiResultParts: List[ContentPart],
strategy: MergeStrategy
) -> List[ContentPart]:
"""Merge parts hierarchically based on parentId relationships."""
# Group parts by parentId
partsByParent = {}
for part in originalParts:
parentId = part.parentId or "root"
if parentId not in partsByParent:
partsByParent[parentId] = []
partsByParent[parentId].append(part)
# Group AI results by parentId
aiResultsByParent = {}
for aiPart in aiResultParts:
parentId = aiPart.parentId or "root"
if parentId not in aiResultsByParent:
aiResultsByParent[parentId] = []
aiResultsByParent[parentId].append(aiPart)
mergedParts = []
# Process each parent group
for parentId in set(list(partsByParent.keys()) + list(aiResultsByParent.keys())):
originalGroup = partsByParent.get(parentId, [])
aiGroup = aiResultsByParent.get(parentId, [])
# Add original parts
mergedParts.extend(originalGroup)
# Add AI results for this parent
if aiGroup:
if len(aiGroup) == 1:
mergedParts.append(aiGroup[0])
else:
# Merge multiple AI results
combinedData = strategy.chunkSeparator.join([p.data for p in aiGroup])
combinedPart = ContentPart(
id=f"hierarchical_ai_{parentId}",
parentId=parentId if parentId != "root" else None,
label="hierarchical_ai_result",
typeGroup="text",
mimeType="text/plain",
data=combinedData,
metadata={
"aiResult": True,
"hierarchical": True,
"sourceCount": len(aiGroup),
"size": len(combinedData.encode('utf-8'))
}
)
mergedParts.append(combinedPart)
return mergedParts
def _mergeIntelligent(
self,
originalParts: List[ContentPart],
aiResultParts: List[ContentPart],
strategy: MergeStrategy
) -> List[ContentPart]:
"""Merge parts using intelligent strategies based on content type."""
mergedParts = []
# Group by typeGroup for intelligent merging
partsByType = {}
for part in originalParts:
typeGroup = part.typeGroup
if typeGroup not in partsByType:
partsByType[typeGroup] = []
partsByType[typeGroup].append(part)
# Process each type group
for typeGroup, parts in partsByType.items():
if typeGroup == "text":
mergedParts.extend(self._mergeTextIntelligent(parts, aiResultParts, strategy))
elif typeGroup == "table":
mergedParts.extend(self._mergeTableIntelligent(parts, aiResultParts, strategy))
elif typeGroup == "structure":
mergedParts.extend(self._mergeStructureIntelligent(parts, aiResultParts, strategy))
else:
# Default handling for other types
mergedParts.extend(parts)
# Add any remaining AI results that weren't merged
for aiPart in aiResultParts:
if not any(p.id == aiPart.id for p in mergedParts):
mergedParts.append(aiPart)
return mergedParts
def _mergeTextIntelligent(
self,
textParts: List[ContentPart],
aiResultParts: List[ContentPart],
strategy: MergeStrategy
) -> List[ContentPart]:
"""Intelligent merging for text content."""
# For now, use concatenate strategy
# This could be enhanced with semantic analysis, summarization, etc.
return self._mergeConcatenate(textParts, aiResultParts, strategy)
def _mergeTableIntelligent(
self,
tableParts: List[ContentPart],
aiResultParts: List[ContentPart],
strategy: MergeStrategy
) -> List[ContentPart]:
"""Intelligent merging for table content."""
# For now, use concatenate strategy
# This could be enhanced with table merging logic
return self._mergeConcatenate(tableParts, aiResultParts, strategy)
def _mergeStructureIntelligent(
self,
structureParts: List[ContentPart],
aiResultParts: List[ContentPart],
strategy: MergeStrategy
) -> List[ContentPart]:
"""Intelligent merging for structured content."""
# For now, use concatenate strategy
# This could be enhanced with structure-aware merging
return self._mergeConcatenate(structureParts, aiResultParts, strategy)
async def processDocumentsPerChunk(
self,
documents: List[ChatDocument],
prompt: str,
aiObjects: Any,
options: Optional[AiCallOptions] = None,
operationId: Optional[str] = None,
parentOperationId: Optional[str] = None
) -> str:
"""
Process documents with model-aware chunking and merge results.
NEW: Uses model-aware chunking in AI call phase instead of extraction phase.
Args:
documents: List of ChatDocument objects to process
prompt: AI prompt for processing
aiObjects: AiObjects instance for making AI calls
options: AI call options
operationId: Optional operation ID for progress tracking
parentOperationId: Optional parent operation ID for hierarchical logging
Returns:
Merged AI results as string with preserved document structure
"""
if not documents:
return ""
# Create operationId if not provided
if not operationId:
workflowId = self.services.workflow.id if self.services.workflow else f"no-workflow-{int(time.time())}"
operationId = f"ai_text_extract_{workflowId}_{int(time.time())}"
self.services.chat.progressLogStart(
operationId,
"AI Text Extract",
"Document Processing",
f"Processing {len(documents)} documents",
parentOperationId=parentOperationId # Use parentOperationId if provided
)
try:
# Build extraction options using Pydantic model
mergeStrategy = MergeStrategy(
useIntelligentMerging=True,
prompt=prompt,
groupBy="typeGroup",
orderBy="id",
mergeType="concatenate"
)
extractionOptions = ExtractionOptions(
prompt=prompt,
processDocumentsIndividually=True,
mergeStrategy=mergeStrategy
)
logger.debug(f"Per-chunk extraction options: prompt length={len(extractionOptions.prompt)} chars")
# Extract content WITHOUT chunking
if operationId:
self.services.chat.progressLogUpdate(operationId, 0.1, f"Extracting content from {len(documents)} documents")
# Pass operationId as parentOperationId for hierarchical logging
# Correct hierarchy: parentOperationId -> operationId -> docOperationId
extractionResult = self.extractContent(documents, extractionOptions, operationId=operationId, parentOperationId=operationId)
if not isinstance(extractionResult, list):
if operationId:
self.services.chat.progressLogFinish(operationId, False)
return "[Error: No extraction results]"
# Process parts (not chunks) with model-aware AI calls
if operationId:
self.services.chat.progressLogUpdate(operationId, 0.3, f"Processing {len(extractionResult)} extracted content parts")
# Use operationId as parentOperationId for child operations
# Correct hierarchy: parentOperationId -> operationId -> partOperationId
processParentOperationId = operationId
partResults = await self._processPartsWithMapping(extractionResult, prompt, aiObjects, options, operationId, processParentOperationId)
# Merge results using existing merging system
if operationId:
self.services.chat.progressLogUpdate(operationId, 0.9, f"Merging {len(partResults)} part results")
mergedContent = self.mergePartResults(partResults, options)
# Save merged extraction content to debug
self.services.utils.writeDebugFile(mergedContent or '', "extraction_merged_text")
if operationId:
self.services.chat.progressLogFinish(operationId, True)
return mergedContent
except Exception as e:
logger.error(f"Error in processDocumentsPerChunk: {str(e)}")
if operationId:
self.services.chat.progressLogFinish(operationId, False)
raise
async def _processPartsWithMapping(
self,
extractionResult: List[ContentExtracted],
prompt: str,
aiObjects: Any,
options: Optional[AiCallOptions] = None,
operationId: Optional[str] = None,
parentOperationId: Optional[str] = None
) -> List[PartResult]:
"""Process content parts with model-aware chunking and proper mapping."""
# Collect all parts that need processing
partsToProcess = []
partIndex = 0
for ec in extractionResult:
for part in ec.parts:
if part.typeGroup in ("text", "table", "structure", "image", "container", "binary"):
# Skip empty container parts
if part.typeGroup == "container" and (not part.data or len(part.data.strip()) == 0):
logger.debug(f"Skipping empty container part: mimeType={part.mimeType}")
continue
partsToProcess.append({
'part': part,
'part_index': partIndex,
'document_id': ec.id
})
partIndex += 1
logger.info(f"Processing {len(partsToProcess)} parts with model-aware chunking")
totalParts = len(partsToProcess)
# Process parts in parallel
processedCount = [0] # Use list to allow modification in nested function
async def processSinglePart(partInfo: Dict) -> PartResult:
part = partInfo['part']
part_index = partInfo['part_index']
documentId = partInfo['document_id']
start_time = time.time()
# Create separate operation for each part with parent reference
partOperationId = None
if operationId:
workflowId = self.services.workflow.id if self.services.workflow else f"no-workflow-{int(time.time())}"
partOperationId = f"{operationId}_part_{part_index}"
self.services.chat.progressLogStart(
partOperationId,
"Content Processing",
f"Part {part_index + 1}",
f"Type: {part.typeGroup}",
parentOperationId=parentOperationId
)
try:
# Create AI call request with content part
request = AiCallRequest(
prompt=prompt,
context="", # Context is in the content part
options=options,
contentParts=[part] # Pass as list for unified processing
)
# Update progress - initiating
if partOperationId:
self.services.chat.progressLogUpdate(partOperationId, 0.3, "Initiating")
# Call AI with model-aware chunking (no progress callback - handled by parent operation)
response = await aiObjects.call(request)
# Update progress - completed
if partOperationId:
self.services.chat.progressLogUpdate(partOperationId, 0.9, "Completed")
self.services.chat.progressLogFinish(partOperationId, True)
processing_time = time.time() - start_time
return PartResult(
originalPart=part,
aiResult=response.content,
partIndex=part_index,
documentId=documentId,
processingTime=processing_time,
metadata={
"success": True,
"partSize": len(part.data) if part.data else 0,
"resultSize": len(response.content),
"typeGroup": part.typeGroup,
"modelName": response.modelName,
"priceUsd": response.priceUsd
}
)
except Exception as e:
processing_time = time.time() - start_time
logger.warning(f"Error processing part {part_index}: {str(e)}")
return PartResult(
originalPart=part,
aiResult=f"[Error processing part: {str(e)}]",
partIndex=part_index,
documentId=documentId,
processingTime=processing_time,
metadata={
"success": False,
"error": str(e),
"partSize": len(part.data) if part.data else 0,
"typeGroup": part.typeGroup
}
)
# Process parts with concurrency control
maxConcurrent = 5
if options and hasattr(options, 'maxConcurrentParts'):
maxConcurrent = options.maxConcurrentParts
semaphore = asyncio.Semaphore(maxConcurrent)
async def processWithSemaphore(partInfo):
async with semaphore:
return await processSinglePart(partInfo)
tasks = [processWithSemaphore(part_info) for part_info in partsToProcess]
partResults = await asyncio.gather(*tasks, return_exceptions=True)
# Handle exceptions
processedResults = []
for i, result in enumerate(partResults):
if isinstance(result, Exception):
part_info = partsToProcess[i]
processedResults.append(PartResult(
originalPart=part_info['part'],
aiResult=f"[Error in parallel processing: {str(result)}]",
partIndex=part_info['part_index'],
documentId=part_info['document_id'],
processingTime=0.0,
metadata={"success": False, "error": str(result)}
))
elif result is not None:
processedResults.append(result)
logger.info(f"Completed processing {len(processedResults)} parts")
return processedResults
def _convertToContentParts(
self, partResults: Union[List[PartResult], List[AiCallResponse]]
) -> List[ContentPart]:
"""Convert part results to ContentParts (internal helper for consolidation).
Handles both PartResult (from extraction workflow) and AiCallResponse (from content parts processing).
"""
content_parts = []
if not partResults:
return content_parts
# Detect input type and convert accordingly
if isinstance(partResults[0], PartResult):
# Existing logic for PartResult (from processDocumentsPerChunk)
# Phase 7: Add originalIndex for explicit ordering
for i, part_result in enumerate(partResults):
content_part = ContentPart(
id=part_result.originalPart.id,
parentId=part_result.originalPart.parentId,
label=part_result.originalPart.label,
typeGroup=part_result.originalPart.typeGroup, # Use original typeGroup
mimeType=part_result.originalPart.mimeType,
data=part_result.aiResult, # Use AI result as data
metadata={
**part_result.originalPart.metadata,
"aiResult": True,
"originalIndex": i, # Phase 7: Explicit order index
"partIndex": part_result.partIndex,
"processingOrder": i, # Phase 7: Processing order
"documentId": part_result.documentId,
"processingTime": part_result.processingTime,
"success": part_result.metadata.get("success", False)
}
)
content_parts.append(content_part)
elif isinstance(partResults[0], AiCallResponse):
# Logic from interfaceAiObjects (from content parts processing)
# Phase 7: Add originalIndex for explicit ordering
for i, result in enumerate(partResults):
if result.content:
content_part = ContentPart(
id=str(uuid.uuid4()),
parentId=None,
label=f"ai_result_{i}",
typeGroup="text", # Default to text for AI results
mimeType="text/plain",
data=result.content,
metadata={
"aiResult": True,
"originalIndex": i, # Phase 7: Explicit order index
"processingOrder": i, # Phase 7: Processing order
"modelName": result.modelName,
"priceUsd": result.priceUsd,
"processingTime": result.processingTime,
"bytesSent": result.bytesSent,
"bytesReceived": result.bytesReceived
}
)
content_parts.append(content_part)
return content_parts
def mergePartResults(
self,
partResults: Union[List[PartResult], List[AiCallResponse]],
options: Optional[AiCallOptions] = None
) -> str:
"""Unified merge for both PartResult and AiCallResponse.
Consolidated from both interfaceAiObjects.py and existing serviceExtraction method.
"""
if not partResults:
return ""
# Convert to ContentParts using unified helper
content_parts = self._convertToContentParts(partResults)
# Determine merge strategy based on input type
if isinstance(partResults[0], PartResult):
# Phase 7: Use originalIndex for explicit ordering
# Use strategy for extraction workflow (group by document, order by originalIndex)
merge_strategy = MergeStrategy(
useIntelligentMerging=True,
groupBy="documentId", # Group by document
orderBy="originalIndex", # Phase 7: Order by originalIndex instead of partIndex
mergeType="concatenate"
)
else:
# Default strategy for content parts workflow
merge_strategy = MergeStrategy(
useIntelligentMerging=True,
groupBy="typeGroup",
orderBy="id",
mergeType="concatenate"
)
# Apply merging
merged_parts = applyMerging(content_parts, merge_strategy)
# Phase 6: Enhanced format with metadata preservation
# CRITICAL: Don't add SOURCE markers for internal use - metadata is already preserved in ContentPart objects
# SOURCE markers should ONLY be added when content is returned directly to user for display/debugging
# For extraction content used in generation pipelines, metadata is in ContentPart.metadata, not in text markers
# Check if this is a generation response by looking at operationType or content structure
isGenerationResponse = False
if options and hasattr(options, 'operationType'):
# Generation responses use DATA_GENERATE operation type
from modules.datamodels.datamodelAi import OperationTypeEnum
isGenerationResponse = options.operationType == OperationTypeEnum.DATA_GENERATE
# Also check if content looks like JSON (starts with { or [)
if not isGenerationResponse and merged_parts:
firstPartData = merged_parts[0].data if merged_parts[0].data else ""
if isinstance(firstPartData, str) and firstPartData.strip().startswith(('{', '[')):
# Check if it's a complete JSON structure (not extracted content)
# Generation responses are complete JSON, extraction responses are text content
try:
# json is already imported at module level
json.loads(firstPartData.strip())
# If it parses as JSON and has "documents" key, it's likely a generation response
parsed = json.loads(firstPartData.strip())
if isinstance(parsed, dict) and "documents" in parsed:
isGenerationResponse = True
except:
pass
# ROOT CAUSE FIX: Never add SOURCE markers - metadata is preserved in ContentPart.metadata
# SOURCE markers pollute content and cause issues when content is used in generation pipelines
# If traceability is needed, use ContentPart.metadata fields (documentId, documentMimeType, label, etc.)
content_sections = []
for part in merged_parts:
# Always return clean content without SOURCE markers
# Metadata is available in ContentPart.metadata for traceability
content_sections.append(part.data if part.data else "")
final_content = "\n\n".join(content_sections)
logger.info(f"Merged {len(partResults)} parts using unified merging system with metadata preservation (generationResponse={isGenerationResponse})")
return final_content.strip()
async def chunkContentPartForAi(self, contentPart, model, options, prompt: str = "") -> List[Dict[str, Any]]:
"""Chunk a content part based on model capabilities, accounting for prompt, system message overhead, and maxTokens output.
Moved from interfaceAiObjects.py - model-aware chunking for AI processing.
Complementary to existing size-based chunking in extraction pipeline.
"""
# Calculate model-specific chunk sizes
modelContextTokens = model.contextLength # Total context in tokens
modelMaxOutputTokens = model.maxTokens # Maximum output tokens
# CRITICAL: Use same conservative token factor as in processContentPartWithFallback
# Real-world observation: Our calculation says 94k tokens, but API says 217k tokens (2.3x difference!)
TOKEN_SAFETY_FACTOR = 2.2 # Conservative: accounts for JSON tokenization and API overhead
# Reserve tokens for:
# 1. Prompt (user message) - use conservative factor
promptSize = len(prompt.encode('utf-8')) if prompt else 0
promptTokens = promptSize / TOKEN_SAFETY_FACTOR
# 2. System message wrapper ("Context from documents:\n")
systemMessageTokens = 10 # ~40 bytes = 10 tokens
# 3. Max output tokens (model will reserve space for completion)
outputTokens = modelMaxOutputTokens
# 4. JSON structure and message overhead (~100 tokens)
messageOverheadTokens = 100
# Total reserved tokens = input overhead + output reservation
totalReservedTokens = promptTokens + systemMessageTokens + messageOverheadTokens + outputTokens
# Available tokens for content = context length - reserved tokens
# Use 60% of available (same conservative margin as in processContentPartWithFallback)
availableContentTokens = int((modelContextTokens - totalReservedTokens) * 0.60)
# Ensure we have at least some space
if availableContentTokens < 100:
logger.warning(f"Very limited space for content: {availableContentTokens} tokens available. Model: {model.name}, contextLength: {modelContextTokens}, maxTokens: {modelMaxOutputTokens}, prompt: {promptTokens:.0f} tokens")
availableContentTokens = max(100, int(modelContextTokens * 0.1)) # Fallback to 10% of context
# Convert tokens to bytes using conservative factor (reverse: bytes = tokens * factor)
availableContentBytes = int(availableContentTokens * TOKEN_SAFETY_FACTOR)
logger.info(f"Chunking calculation for {model.name}: contextLength={modelContextTokens} tokens, maxTokens={modelMaxOutputTokens} tokens, prompt={promptTokens:.0f} tokens est., reserved={totalReservedTokens:.0f} tokens est., available={availableContentTokens} tokens est. ({availableContentBytes} bytes), factor={TOKEN_SAFETY_FACTOR}")
# Use 50% of available content bytes for text chunks (very conservative to ensure chunks fit)
# This ensures that even with token counting inaccuracies, chunks will fit
textChunkSize = int(availableContentBytes * 0.5)
structureChunkSize = int(availableContentBytes * 0.5) # CRITICAL: Also set for StructureChunker (JSON content)
tableChunkSize = int(availableContentBytes * 0.5) # Also set for TableChunker
imageChunkSize = int(availableContentBytes * 0.6) # 60% for image chunks
# Build chunking options - include ALL chunk size options for different chunkers
chunkingOptions = {
"textChunkSize": textChunkSize,
"structureChunkSize": structureChunkSize, # CRITICAL: Required for StructureChunker (JSON)
"tableChunkSize": tableChunkSize, # Required for TableChunker
"imageChunkSize": imageChunkSize,
"maxSize": availableContentBytes,
"chunkAllowed": True
}
logger.info(f"Chunking options: textChunkSize={textChunkSize} bytes, structureChunkSize={structureChunkSize} bytes, tableChunkSize={tableChunkSize} bytes, imageChunkSize={imageChunkSize} bytes, contentPartSize={len(contentPart.data.encode('utf-8')) if contentPart.data else 0} bytes")
# Get appropriate chunker (uses existing ChunkerRegistry ✅)
chunker = self._chunkerRegistry.resolve(contentPart.typeGroup)
if not chunker:
logger.warning(f"No chunker found for typeGroup: {contentPart.typeGroup}")
return []
# Chunk the content part
try:
contentSize = len(contentPart.data.encode('utf-8')) if contentPart.data else 0
logger.info(f"Chunking {contentPart.typeGroup} part: contentSize={contentSize} bytes, textChunkSize={textChunkSize} bytes, structureChunkSize={structureChunkSize} bytes")
chunks = chunker.chunk(contentPart, chunkingOptions)
logger.info(f"Created {len(chunks)} chunks for {contentPart.typeGroup} part (contentSize={contentSize} bytes)")
if chunks:
for i, chunk in enumerate(chunks):
chunkSize = len(chunk.get('data', '').encode('utf-8')) if chunk.get('data') else 0
logger.info(f" Chunk {i+1}/{len(chunks)}: {chunkSize} bytes")
return chunks
except Exception as e:
logger.error(f"Chunking failed for {contentPart.typeGroup}: {str(e)}")
return []
async def processContentPartWithFallback(self, contentPart, prompt: str, options, failoverModelList, aiObjects, progressCallback=None) -> AiCallResponse:
"""Process a single content part with model-aware chunking and fallback.
Moved from interfaceAiObjects.py - orchestrates chunking and merging.
Calls aiObjects._callWithModel() for actual AI calls.
"""
lastError = None
# Check if this is an image - Vision models need special handling
isImage = (contentPart.typeGroup == "image") or (contentPart.mimeType and contentPart.mimeType.startswith("image/"))
# Determine the correct operation type based on content type
actualOperationType = options.operationType
if isImage:
actualOperationType = OperationTypeEnum.IMAGE_ANALYSE
# Get vision-capable models for images
availableModels = modelRegistry.getAvailableModels()
visionFailoverList = modelSelector.getFailoverModelList(prompt, "", AiCallOptions(operationType=actualOperationType), availableModels)
if visionFailoverList:
logger.debug(f"Using {len(visionFailoverList)} vision-capable models for image processing")
failoverModelList = visionFailoverList
for attempt, model in enumerate(failoverModelList):
try:
logger.info(f"Processing content part with model: {model.name} (attempt {attempt + 1}/{len(failoverModelList)})")
# Special handling for images with Vision models
if isImage and hasattr(model, 'functionCall'):
try:
if not contentPart.data:
raise ValueError("Image content part has no data")
mimeType = contentPart.mimeType or "image/jpeg"
if not mimeType.startswith("image/"):
raise ValueError(f"Invalid mimeType for image: {mimeType}")
# Prepare base64 data
if isinstance(contentPart.data, str):
try:
base64.b64decode(contentPart.data, validate=True)
base64Data = contentPart.data
except Exception as e:
raise ValueError(f"Invalid base64 data in contentPart: {str(e)}")
elif isinstance(contentPart.data, bytes):
base64Data = base64.b64encode(contentPart.data).decode('utf-8')
else:
raise ValueError(f"Unsupported data type for image: {type(contentPart.data)}")
imageDataUrl = f"data:{mimeType};base64,{base64Data}"
modelCall = AiModelCall(
messages=[
{
"role": "user",
"content": [
{"type": "text", "text": prompt or ""},
{
"type": "image_url",
"image_url": {"url": imageDataUrl}
}
]
}
],
model=model,
options=AiCallOptions(operationType=actualOperationType)
)
modelResponse = await model.functionCall(modelCall)
if not modelResponse.success:
raise ValueError(f"Model call failed: {modelResponse.error}")
logger.info(f"✅ Image content part processed successfully with model: {model.name}")
processingTime = getattr(modelResponse, 'processingTime', None) or 0.0
return AiCallResponse(
content=modelResponse.content,
modelName=model.name,
priceUsd=0.0,
processingTime=processingTime,
bytesSent=0,
bytesReceived=0,
errorCount=0
)
except Exception as e:
lastError = e
logger.warning(f"❌ Image processing failed with model {model.name}: {str(e)}")
if attempt < len(failoverModelList) - 1:
logger.info(f"🔄 Trying next fallback model for image processing...")
continue
else:
logger.error(f"💥 All {len(failoverModelList)} models failed for image processing")
raise
# For non-image parts, check if part fits in model context
partSize = len(contentPart.data.encode('utf-8')) if contentPart.data else 0
modelContextTokens = model.contextLength
modelMaxOutputTokens = model.maxTokens
promptTokens = len(prompt.encode('utf-8')) / 4 if prompt else 0
systemMessageTokens = 10
outputTokens = modelMaxOutputTokens
messageOverheadTokens = 100
totalReservedTokens = promptTokens + systemMessageTokens + messageOverheadTokens + outputTokens
availableContentTokens = int((modelContextTokens - totalReservedTokens) * 0.8)
if availableContentTokens < 100:
availableContentTokens = max(100, int(modelContextTokens * 0.1))
availableContentBytes = availableContentTokens * 4
# Also check prompt size - prompt + content together must fit
promptSize = len(prompt.encode('utf-8')) if prompt else 0
# CRITICAL: Token counting approximation is VERY inaccurate for JSON/content
# Real-world observation: Our calculation says 94k tokens, but API says 217k tokens (2.3x difference!)
# This happens because:
# 1. JSON/structured content tokenizes differently (more tokens per byte)
# 2. API has message structure overhead (system prompts, message wrappers)
# 3. Tokenizer differences between our approximation and actual API tokenizer
# Use conservative factor: 1 token ≈ 2.2 bytes (instead of 4) to account for these differences
TOKEN_SAFETY_FACTOR = 2.2 # Conservative: accounts for JSON tokenization and API overhead
promptTokens = promptSize / TOKEN_SAFETY_FACTOR
contentTokens = partSize / TOKEN_SAFETY_FACTOR
totalTokens = promptTokens + contentTokens
# CRITICAL: Use very conservative margin (60%) because:
# 1. Token counting approximation is inaccurate - real tokens can be 2-3x more
# 2. API has additional overhead (message structure, system prompts, etc.)
# 3. Anthropic API is strict about the 200k limit
# 4. We've seen cases where our calculation says "fits" but API says "too long"
maxTotalTokens = int(modelContextTokens * 0.60)
logger.info(f"Size check for {model.name}: partSize={partSize} bytes ({contentTokens:.0f} tokens est.), promptSize={promptSize} bytes ({promptTokens:.0f} tokens est.), total={totalTokens:.0f} tokens est., modelContext={modelContextTokens} tokens, maxTotal={maxTotalTokens} tokens (60% margin, conservative factor={TOKEN_SAFETY_FACTOR})")
# CRITICAL: Always check totalTokens first - if prompt + content exceeds limit, MUST chunk
# Token counting approximation may differ significantly from API, so use very conservative margin
if totalTokens > maxTotalTokens:
logger.warning(f"⚠️ Total tokens ({totalTokens:.0f} est.) exceed model limit ({maxTotalTokens}), chunking required. Prompt: {promptTokens:.0f} tokens est., Content: {contentTokens:.0f} tokens est.")
elif partSize > availableContentBytes:
logger.warning(f"⚠️ Content part ({contentTokens:.0f} tokens est.) exceeds available space ({availableContentBytes/TOKEN_SAFETY_FACTOR:.0f} tokens est.), chunking required")
# If either condition fails, chunk the content
# CRITICAL: IMAGE_GENERATE operations should NOT use chunking - they generate images from prompts, not process content chunks
if (totalTokens > maxTotalTokens or partSize > availableContentBytes) and options.operationType != OperationTypeEnum.IMAGE_GENERATE:
# Part too large or total exceeds limit - chunk it (but not for image generation)
chunks = await self.chunkContentPartForAi(contentPart, model, options, prompt)
if not chunks:
raise ValueError(f"Failed to chunk content part for model {model.name}")
logger.info(f"Starting to process {len(chunks)} chunks with model {model.name}")
if progressCallback:
progressCallback(0.0, f"Starting to process {len(chunks)} chunks")
chunkResults = []
for idx, chunk in enumerate(chunks):
chunkNum = idx + 1
chunkData = chunk.get('data', '')
logger.info(f"Processing chunk {chunkNum}/{len(chunks)} with model {model.name}")
if progressCallback:
progressCallback(chunkNum / len(chunks), f"Processing chunk {chunkNum}/{len(chunks)}")
try:
chunkResponse = await aiObjects._callWithModel(model, prompt, chunkData, options)
chunkResults.append(chunkResponse)
except Exception as chunkError:
logger.error(f"Error processing chunk {chunkNum}/{len(chunks)}: {str(chunkError)}")
# Continue with other chunks even if one fails
continue
# Merge chunk results
if not chunkResults:
raise ValueError(f"All chunks failed for content part")
mergedContent = self.mergePartResults(chunkResults, options)
return AiCallResponse(
content=mergedContent,
modelName=model.name,
priceUsd=sum(r.priceUsd for r in chunkResults),
processingTime=sum(r.processingTime for r in chunkResults),
bytesSent=sum(r.bytesSent for r in chunkResults),
bytesReceived=sum(r.bytesReceived for r in chunkResults),
errorCount=sum(r.errorCount for r in chunkResults)
)
else:
# Part fits - call AI directly via aiObjects interface
logger.info(f"✅ Content part fits within model limits, processing directly")
response = await aiObjects._callWithModel(model, prompt, contentPart.data, options)
logger.info(f"✅ Content part processed successfully with model: {model.name}")
return response
chunks = await self.chunkContentPartForAi(contentPart, model, options, prompt)
if not chunks:
raise ValueError(f"Failed to chunk content part for model {model.name}")
logger.info(f"Starting to process {len(chunks)} chunks with model {model.name}")
if progressCallback:
progressCallback(0.0, f"Starting to process {len(chunks)} chunks")
chunkResults = []
for idx, chunk in enumerate(chunks):
chunkNum = idx + 1
chunkData = chunk.get('data', '')
logger.info(f"Processing chunk {chunkNum}/{len(chunks)} with model {model.name}")
if progressCallback:
progressCallback(chunkNum / len(chunks), f"Processing chunk {chunkNum}/{len(chunks)}")
try:
chunkResponse = await aiObjects._callWithModel(model, prompt, chunkData, options)
chunkResults.append(chunkResponse)
logger.info(f"✅ Chunk {chunkNum}/{len(chunks)} processed successfully")
if progressCallback:
progressCallback(chunkNum / len(chunks), f"Chunk {chunkNum}/{len(chunks)} processed")
except Exception as e:
logger.error(f"❌ Error processing chunk {chunkNum}/{len(chunks)}: {str(e)}")
raise
# Merge chunk results using unified mergePartResults
mergedContent = self.mergePartResults(chunkResults, options)
logger.info(f"✅ Content part chunked and processed with model: {model.name} ({len(chunks)} chunks)")
return AiCallResponse(
content=mergedContent,
modelName=model.name,
priceUsd=sum(r.priceUsd for r in chunkResults),
processingTime=sum(r.processingTime for r in chunkResults),
bytesSent=sum(r.bytesSent for r in chunkResults),
bytesReceived=sum(r.bytesReceived for r in chunkResults),
errorCount=sum(r.errorCount for r in chunkResults)
)
except Exception as e:
lastError = e
error_msg = str(e) if str(e) else f"{type(e).__name__}"
logger.warning(f"❌ Model {model.name} failed for content part: {error_msg}", exc_info=True)
if attempt < len(failoverModelList) - 1:
logger.info(f"🔄 Trying next failover model...")
continue
else:
logger.error(f"💥 All {len(failoverModelList)} models failed for content part")
break
# All models failed
return self._createErrorResponse(f"All models failed: {str(lastError)}", 0, 0)
def _createErrorResponse(self, errorMsg: str, inputBytes: int, outputBytes: int) -> AiCallResponse:
"""Create an error response."""
return AiCallResponse(
content=errorMsg,
modelName="error",
priceUsd=0.0,
processingTime=0.0,
bytesSent=inputBytes,
bytesReceived=outputBytes,
errorCount=1
)
async def processContentPartsWithAi(
self,
request: AiCallRequest,
aiObjects, # Pass interface for AI calls
progressCallback=None
) -> AiCallResponse:
"""Process content parts with model-aware chunking and AI calls.
Moved from interfaceAiObjects.callWithContentParts() - entry point for content parts processing.
"""
prompt = request.prompt
options = request.options
contentParts = request.contentParts
# Get failover models
availableModels = modelRegistry.getAvailableModels()
failoverModelList = modelSelector.getFailoverModelList(prompt, "", options, availableModels)
if not failoverModelList:
return self._createErrorResponse("No suitable models found", 0, 0)
# Process each content part
allResults = []
for contentPart in contentParts:
partResult = await self.processContentPartWithFallback(
contentPart, prompt, options, failoverModelList, aiObjects, progressCallback
)
allResults.append(partResult)
# Merge all results using unified mergePartResults
mergedContent = self.mergePartResults(allResults)
return AiCallResponse(
content=mergedContent,
modelName="multiple",
priceUsd=sum(r.priceUsd for r in allResults),
processingTime=sum(r.processingTime for r in allResults),
bytesSent=sum(r.bytesSent for r in allResults),
bytesReceived=sum(r.bytesReceived for r in allResults),
errorCount=sum(r.errorCount for r in allResults)
)
# Module-level function for use by subPipeline and ExtractionService
def applyMerging(parts: List[ContentPart], strategy: MergeStrategy) -> List[ContentPart]:
"""Apply merging strategy to parts with intelligent token-aware merging.
Moved from interfaceAiObjects.py to resolve dependency violations.
Can be used as module-level function or called from ExtractionService methods.
"""
logger.debug(f"applyMerging called with {len(parts)} parts")
# Import merging dependencies (now local imports ✅)
from .merging.mergerText import TextMerger
from .merging.mergerTable import TableMerger
from .merging.mergerDefault import DefaultMerger
from .subMerger import IntelligentTokenAwareMerger
# Check if intelligent merging is enabled
if strategy.useIntelligentMerging:
modelCapabilities = strategy.capabilities or {}
subMerger = IntelligentTokenAwareMerger(modelCapabilities)
# Use intelligent merging for all parts
merged = subMerger.mergeChunksIntelligently(parts, strategy.prompt or "")
# Calculate and log optimization stats
stats = subMerger.calculateOptimizationStats(parts, merged)
logger.info(f"🧠 Intelligent merging stats: {stats}")
logger.debug(f"Intelligent merging: {stats['original_ai_calls']}{stats['optimized_ai_calls']} calls ({stats['reduction_percent']}% reduction)")
return merged
# Fallback to traditional merging
textMerger = TextMerger()
tableMerger = TableMerger()
defaultMerger = DefaultMerger()
# Group by typeGroup
textParts = [p for p in parts if p.typeGroup == "text"]
tableParts = [p for p in parts if p.typeGroup == "table"]
structureParts = [p for p in parts if p.typeGroup == "structure"]
otherParts = [p for p in parts if p.typeGroup not in ("text", "table", "structure")]
logger.debug(f"Grouped - text: {len(textParts)}, table: {len(tableParts)}, structure: {len(structureParts)}, other: {len(otherParts)}")
merged: List[ContentPart] = []
if textParts:
textMerged = textMerger.merge(textParts, strategy)
logger.debug(f"TextMerger merged {len(textParts)} parts into {len(textMerged)} parts")
merged.extend(textMerged)
if tableParts:
tableMerged = tableMerger.merge(tableParts, strategy)
logger.debug(f"TableMerger merged {len(tableParts)} parts into {len(tableMerged)} parts")
merged.extend(tableMerged)
if structureParts:
# For now, treat structure like text
structureMerged = textMerger.merge(structureParts, strategy)
logger.debug(f"StructureMerger merged {len(structureParts)} parts into {len(structureMerged)} parts")
merged.extend(structureMerged)
if otherParts:
otherMerged = defaultMerger.merge(otherParts, strategy)
logger.debug(f"DefaultMerger merged {len(otherParts)} parts into {len(otherMerged)} parts")
merged.extend(otherMerged)
logger.debug(f"applyMerging returning {len(merged)} parts")
return merged