gateway/modules/services/serviceExtraction/mainServiceExtraction.py
2025-11-19 09:52:34 +01:00

683 lines
28 KiB
Python

from typing import Any, Dict, List, Optional, Union
import uuid
import logging
import time
import asyncio
from .subRegistry import ExtractorRegistry, ChunkerRegistry
from .subPipeline import runExtraction
from modules.datamodels.datamodelExtraction import ContentExtracted, ContentPart, MergeStrategy, ExtractionOptions, PartResult
from modules.datamodels.datamodelChat import ChatDocument
from modules.datamodels.datamodelAi import AiCallResponse, AiCallRequest, AiCallOptions
from modules.aicore.aicoreModelRegistry import modelRegistry
logger = logging.getLogger(__name__)
class ExtractionService:
def __init__(self, services: Optional[Any] = None):
self.services = services
self._extractorRegistry = ExtractorRegistry()
self._chunkerRegistry = ChunkerRegistry()
# Ensure connectors are registered
discovered = modelRegistry.discoverConnectors()
for connector in discovered:
modelRegistry.registerConnector(connector)
# Verify required internal model is available (used for pricing in extractContent)
modelDisplayName = "Internal Document Extractor"
model = modelRegistry.getModel(modelDisplayName)
if model is None or model.calculatePriceUsd is None:
raise RuntimeError(f"FATAL: Required internal model '{modelDisplayName}' is not available. Check connector registration.")
def extractContent(self, documents: List[ChatDocument], options: ExtractionOptions) -> List[ContentExtracted]:
"""
Extract content from a list of ChatDocument objects.
Args:
documents: List of ChatDocument objects to extract content from
options: Extraction options including maxSize, chunkAllowed, mergeStrategy, etc.
Returns:
List of ContentExtracted objects, one per input document
"""
results: List[ContentExtracted] = []
# Lazy import to avoid circular deps and heavy init at module import
from modules.interfaces.interfaceDbComponentObjects import getInterface
dbInterface = getInterface()
for i, doc in enumerate(documents):
logger.info(f"=== DOCUMENT {i}: {doc.fileName} ===")
logger.info(f"Initial MIME type: {doc.mimeType}")
# Start timing for this document
startTime = time.time()
# Resolve raw bytes for this document using interface
documentBytes = dbInterface.getFileData(doc.fileId)
if not documentBytes:
raise ValueError(f"No file data found for fileId={doc.fileId}")
# Convert ChatDocument to the format expected by runExtraction
documentData = {
"id": doc.id,
"bytes": documentBytes,
"fileName": doc.fileName,
"mimeType": doc.mimeType
}
ec = runExtraction(
extractorRegistry=self._extractorRegistry,
chunkerRegistry=self._chunkerRegistry,
documentBytes=documentData["bytes"],
fileName=documentData["fileName"],
mimeType=documentData["mimeType"],
options=options
)
# Log content parts metadata
logger.debug(f"Content parts: {len(ec.parts)}")
for j, part in enumerate(ec.parts):
logger.debug(f" Part {j}: {part.typeGroup} ({part.mimeType}) - {len(part.data) if part.data else 0} chars")
if part.metadata:
logger.debug(f" Metadata: {part.metadata}")
# Attach document id and MIME type to parts if missing
for p in ec.parts:
if "documentId" not in p.metadata:
p.metadata["documentId"] = documentData["id"] or str(uuid.uuid4())
if "documentMimeType" not in p.metadata:
p.metadata["documentMimeType"] = documentData["mimeType"]
# Log chunking information
chunkedParts = [p for p in ec.parts if p.metadata.get("chunk", False)]
if chunkedParts:
logger.debug(f"=== CHUNKING RESULTS ===")
logger.debug(f"Total parts: {len(ec.parts)}")
logger.debug(f"Chunked parts: {len(chunkedParts)}")
for chunk in chunkedParts:
logger.debug(f" Chunk: {chunk.label} - {len(chunk.data)} chars (parent: {chunk.parentId})")
else:
logger.debug(f"No chunking needed - {len(ec.parts)} parts fit within size limits")
# Calculate timing and emit stats
endTime = time.time()
processingTime = endTime - startTime
bytesSent = len(documentBytes)
bytesReceived = sum(len(part.data) if part.data else 0 for part in ec.parts)
# Emit stats for extraction operation
# Use internal extraction model for pricing
modelDisplayName = "Internal Document Extractor"
model = modelRegistry.getModel(modelDisplayName)
# Hard fail if model is missing; caller must ensure connectors are registered
if model is None or model.calculatePriceUsd is None:
raise RuntimeError(f"Pricing model not available: {modelDisplayName}")
priceUsd = model.calculatePriceUsd(processingTime, bytesSent, bytesReceived)
# Create AiCallResponse with real calculation
# Use model.name for the response (API identifier), not displayName
aiResponse = AiCallResponse(
content="", # No content for extraction stats needed
modelName=model.name,
priceUsd=priceUsd,
processingTime=processingTime,
bytesSent=bytesSent,
bytesReceived=bytesReceived,
errorCount=0
)
self.services.chat.storeWorkflowStat(
self.services.workflow,
aiResponse,
f"extraction.process.{doc.mimeType}"
)
results.append(ec)
return results
def mergeAiResults(
self,
extractedContent: List[ContentExtracted],
aiResults: List[str],
strategy: MergeStrategy
) -> ContentExtracted:
"""
Merge AI results from chunked content back into a single ContentExtracted.
Args:
extractedContent: List of ContentExtracted objects that were processed
aiResults: List of AI response strings, one per chunk
strategy: Merge strategy configuration (dict or MergeStrategy object)
Returns:
Single ContentExtracted with merged AI results
"""
logger.debug(f"=== MERGING AI RESULTS ===")
logger.debug(f"Extracted content: {len(extractedContent)} documents")
logger.debug(f"AI results: {len(aiResults)} responses")
logger.debug(f"Merge strategy: {strategy.mergeType}")
mergeStrategy = strategy
# Collect all parts from all extracted content
allParts: List[ContentPart] = []
for ec in extractedContent:
allParts.extend(ec.parts)
logger.debug(f"Total original parts: {len(allParts)}")
# Create AI result parts
aiResultParts: List[ContentPart] = []
for i, aiResult in enumerate(aiResults):
aiPart = ContentPart(
id=f"ai_result_{i}",
parentId=None, # Will be set based on strategy
label="ai_result",
typeGroup="text",
mimeType="text/plain",
data=aiResult,
metadata={
"aiResult": True,
"order": i,
"size": len(aiResult.encode('utf-8'))
}
)
aiResultParts.append(aiPart)
logger.debug(f"Created {len(aiResultParts)} AI result parts")
# Apply merging strategy
if mergeStrategy.mergeType == "concatenate":
mergedParts = self._mergeConcatenate(allParts, aiResultParts, mergeStrategy)
elif mergeStrategy.mergeType == "hierarchical":
mergedParts = self._mergeHierarchical(allParts, aiResultParts, mergeStrategy)
elif mergeStrategy.mergeType == "intelligent":
mergedParts = self._mergeIntelligent(allParts, aiResultParts, mergeStrategy)
else:
# Default to concatenate
mergedParts = self._mergeConcatenate(allParts, aiResultParts, mergeStrategy)
# Create final ContentExtracted
mergedContent = ContentExtracted(
id=f"merged_{uuid.uuid4()}",
parts=mergedParts
)
logger.debug(f"=== MERGE COMPLETED ===")
logger.debug(f"Final merged parts: {len(mergedParts)}")
logger.debug(f"Merged content ID: {mergedContent.id}")
return mergedContent
def _mergeConcatenate(
self,
originalParts: List[ContentPart],
aiResultParts: List[ContentPart],
strategy: MergeStrategy
) -> List[ContentPart]:
"""Merge parts by simple concatenation."""
mergedParts = []
# Add original parts (filtered if needed)
for part in originalParts:
if strategy.preserveChunks or not part.metadata.get("chunk", False):
mergedParts.append(part)
# Add AI results
if aiResultParts:
# Group AI results by parentId if available
aiResultsByParent = {}
for aiPart in aiResultParts:
parentId = aiPart.parentId or "root"
if parentId not in aiResultsByParent:
aiResultsByParent[parentId] = []
aiResultsByParent[parentId].append(aiPart)
# Merge AI results for each parent
for parentId, aiParts in aiResultsByParent.items():
if len(aiParts) == 1:
mergedParts.append(aiParts[0])
else:
# Concatenate multiple AI results for same parent
combinedData = strategy.chunkSeparator.join([p.data for p in aiParts])
combinedPart = ContentPart(
id=f"merged_ai_{parentId}",
parentId=parentId if parentId != "root" else None,
label="merged_ai_result",
typeGroup="text",
mimeType="text/plain",
data=combinedData,
metadata={
"aiResult": True,
"merged": True,
"sourceCount": len(aiParts),
"size": len(combinedData.encode('utf-8'))
}
)
mergedParts.append(combinedPart)
return mergedParts
def _mergeHierarchical(
self,
originalParts: List[ContentPart],
aiResultParts: List[ContentPart],
strategy: MergeStrategy
) -> List[ContentPart]:
"""Merge parts hierarchically based on parentId relationships."""
# Group parts by parentId
partsByParent = {}
for part in originalParts:
parentId = part.parentId or "root"
if parentId not in partsByParent:
partsByParent[parentId] = []
partsByParent[parentId].append(part)
# Group AI results by parentId
aiResultsByParent = {}
for aiPart in aiResultParts:
parentId = aiPart.parentId or "root"
if parentId not in aiResultsByParent:
aiResultsByParent[parentId] = []
aiResultsByParent[parentId].append(aiPart)
mergedParts = []
# Process each parent group
for parentId in set(list(partsByParent.keys()) + list(aiResultsByParent.keys())):
originalGroup = partsByParent.get(parentId, [])
aiGroup = aiResultsByParent.get(parentId, [])
# Add original parts
mergedParts.extend(originalGroup)
# Add AI results for this parent
if aiGroup:
if len(aiGroup) == 1:
mergedParts.append(aiGroup[0])
else:
# Merge multiple AI results
combinedData = strategy.chunkSeparator.join([p.data for p in aiGroup])
combinedPart = ContentPart(
id=f"hierarchical_ai_{parentId}",
parentId=parentId if parentId != "root" else None,
label="hierarchical_ai_result",
typeGroup="text",
mimeType="text/plain",
data=combinedData,
metadata={
"aiResult": True,
"hierarchical": True,
"sourceCount": len(aiGroup),
"size": len(combinedData.encode('utf-8'))
}
)
mergedParts.append(combinedPart)
return mergedParts
def _mergeIntelligent(
self,
originalParts: List[ContentPart],
aiResultParts: List[ContentPart],
strategy: MergeStrategy
) -> List[ContentPart]:
"""Merge parts using intelligent strategies based on content type."""
mergedParts = []
# Group by typeGroup for intelligent merging
partsByType = {}
for part in originalParts:
typeGroup = part.typeGroup
if typeGroup not in partsByType:
partsByType[typeGroup] = []
partsByType[typeGroup].append(part)
# Process each type group
for typeGroup, parts in partsByType.items():
if typeGroup == "text":
mergedParts.extend(self._mergeTextIntelligent(parts, aiResultParts, strategy))
elif typeGroup == "table":
mergedParts.extend(self._mergeTableIntelligent(parts, aiResultParts, strategy))
elif typeGroup == "structure":
mergedParts.extend(self._mergeStructureIntelligent(parts, aiResultParts, strategy))
else:
# Default handling for other types
mergedParts.extend(parts)
# Add any remaining AI results that weren't merged
for aiPart in aiResultParts:
if not any(p.id == aiPart.id for p in mergedParts):
mergedParts.append(aiPart)
return mergedParts
def _mergeTextIntelligent(
self,
textParts: List[ContentPart],
aiResultParts: List[ContentPart],
strategy: MergeStrategy
) -> List[ContentPart]:
"""Intelligent merging for text content."""
# For now, use concatenate strategy
# This could be enhanced with semantic analysis, summarization, etc.
return self._mergeConcatenate(textParts, aiResultParts, strategy)
def _mergeTableIntelligent(
self,
tableParts: List[ContentPart],
aiResultParts: List[ContentPart],
strategy: MergeStrategy
) -> List[ContentPart]:
"""Intelligent merging for table content."""
# For now, use concatenate strategy
# This could be enhanced with table merging logic
return self._mergeConcatenate(tableParts, aiResultParts, strategy)
def _mergeStructureIntelligent(
self,
structureParts: List[ContentPart],
aiResultParts: List[ContentPart],
strategy: MergeStrategy
) -> List[ContentPart]:
"""Intelligent merging for structured content."""
# For now, use concatenate strategy
# This could be enhanced with structure-aware merging
return self._mergeConcatenate(structureParts, aiResultParts, strategy)
async def processDocumentsPerChunk(
self,
documents: List[ChatDocument],
prompt: str,
aiObjects: Any,
options: Optional[AiCallOptions] = None,
operationId: Optional[str] = None
) -> str:
"""
Process documents with model-aware chunking and merge results.
NEW: Uses model-aware chunking in AI call phase instead of extraction phase.
Args:
documents: List of ChatDocument objects to process
prompt: AI prompt for processing
aiObjects: AiObjects instance for making AI calls
options: AI call options
operationId: Optional operation ID for progress tracking
Returns:
Merged AI results as string with preserved document structure
"""
if not documents:
return ""
# Create operationId if not provided
if not operationId:
workflowId = self.services.workflow.id if self.services.workflow else f"no-workflow-{int(time.time())}"
operationId = f"ai_text_extract_{workflowId}_{int(time.time())}"
self.services.chat.progressLogStart(
operationId,
"AI Text Extract",
"Document Processing",
f"Processing {len(documents)} documents"
)
try:
# Build extraction options using Pydantic model
mergeStrategy = MergeStrategy(
useIntelligentMerging=True,
prompt=prompt,
groupBy="typeGroup",
orderBy="id",
mergeType="concatenate"
)
extractionOptions = ExtractionOptions(
prompt=prompt,
processDocumentsIndividually=True,
mergeStrategy=mergeStrategy
)
logger.debug(f"Per-chunk extraction options: prompt length={len(extractionOptions.prompt)} chars")
# Extract content WITHOUT chunking
if operationId:
self.services.chat.progressLogUpdate(operationId, 0.1, f"Extracting content from {len(documents)} documents")
extractionResult = self.extractContent(documents, extractionOptions)
if not isinstance(extractionResult, list):
if operationId:
self.services.chat.progressLogFinish(operationId, False)
return "[Error: No extraction results]"
# Process parts (not chunks) with model-aware AI calls
if operationId:
self.services.chat.progressLogUpdate(operationId, 0.3, f"Processing {len(extractionResult)} extracted content parts")
# Get parent log ID for part operations
parentLogId = None
if operationId:
parentLogId = self.services.chat.getOperationLogId(operationId)
partResults = await self._processPartsWithMapping(extractionResult, prompt, aiObjects, options, operationId, parentLogId)
# Merge results using existing merging system
if operationId:
self.services.chat.progressLogUpdate(operationId, 0.9, f"Merging {len(partResults)} part results")
mergedContent = self._mergePartResults(partResults, options)
# Save merged extraction content to debug
self.services.utils.writeDebugFile(mergedContent or '', "extraction_merged_text")
if operationId:
self.services.chat.progressLogFinish(operationId, True)
return mergedContent
except Exception as e:
logger.error(f"Error in processDocumentsPerChunk: {str(e)}")
if operationId:
self.services.chat.progressLogFinish(operationId, False)
raise
async def _processPartsWithMapping(
self,
extractionResult: List[ContentExtracted],
prompt: str,
aiObjects: Any,
options: Optional[AiCallOptions] = None,
operationId: Optional[str] = None,
parentLogId: Optional[str] = None
) -> List[PartResult]:
"""Process content parts with model-aware chunking and proper mapping."""
# Collect all parts that need processing
partsToProcess = []
partIndex = 0
for ec in extractionResult:
for part in ec.parts:
if part.typeGroup in ("text", "table", "structure", "image", "container", "binary"):
# Skip empty container parts
if part.typeGroup == "container" and (not part.data or len(part.data.strip()) == 0):
logger.debug(f"Skipping empty container part: mimeType={part.mimeType}")
continue
partsToProcess.append({
'part': part,
'part_index': partIndex,
'document_id': ec.id
})
partIndex += 1
logger.info(f"Processing {len(partsToProcess)} parts with model-aware chunking")
totalParts = len(partsToProcess)
# Process parts in parallel
processedCount = [0] # Use list to allow modification in nested function
async def processSinglePart(partInfo: Dict) -> PartResult:
part = partInfo['part']
part_index = partInfo['part_index']
documentId = partInfo['document_id']
start_time = time.time()
# Create separate operation for each part with parent reference
partOperationId = None
if operationId:
workflowId = self.services.workflow.id if self.services.workflow else f"no-workflow-{int(time.time())}"
partOperationId = f"{operationId}_part_{part_index}"
self.services.chat.progressLogStart(
partOperationId,
"Content Processing",
f"Part {part_index + 1}",
f"Type: {part.typeGroup}",
parentId=parentLogId
)
try:
# Create AI call request with content part
request = AiCallRequest(
prompt=prompt,
context="", # Context is in the content part
options=options,
contentParts=[part] # Pass as list for unified processing
)
# Update progress - initiating
if partOperationId:
self.services.chat.progressLogUpdate(partOperationId, 0.3, "Initiating")
# Call AI with model-aware chunking (no progress callback - handled by parent operation)
response = await aiObjects.call(request)
# Update progress - completed
if partOperationId:
self.services.chat.progressLogUpdate(partOperationId, 0.9, "Completed")
self.services.chat.progressLogFinish(partOperationId, True)
processing_time = time.time() - start_time
return PartResult(
originalPart=part,
aiResult=response.content,
partIndex=part_index,
documentId=documentId,
processingTime=processing_time,
metadata={
"success": True,
"partSize": len(part.data) if part.data else 0,
"resultSize": len(response.content),
"typeGroup": part.typeGroup,
"modelName": response.modelName,
"priceUsd": response.priceUsd
}
)
except Exception as e:
processing_time = time.time() - start_time
logger.warning(f"Error processing part {part_index}: {str(e)}")
return PartResult(
originalPart=part,
aiResult=f"[Error processing part: {str(e)}]",
partIndex=part_index,
documentId=documentId,
processingTime=processing_time,
metadata={
"success": False,
"error": str(e),
"partSize": len(part.data) if part.data else 0,
"typeGroup": part.typeGroup
}
)
# Process parts with concurrency control
maxConcurrent = 5
if options and hasattr(options, 'maxConcurrentParts'):
maxConcurrent = options.maxConcurrentParts
semaphore = asyncio.Semaphore(maxConcurrent)
async def processWithSemaphore(partInfo):
async with semaphore:
return await processSinglePart(partInfo)
tasks = [processWithSemaphore(part_info) for part_info in partsToProcess]
partResults = await asyncio.gather(*tasks, return_exceptions=True)
# Handle exceptions
processedResults = []
for i, result in enumerate(partResults):
if isinstance(result, Exception):
part_info = partsToProcess[i]
processedResults.append(PartResult(
originalPart=part_info['part'],
aiResult=f"[Error in parallel processing: {str(result)}]",
partIndex=part_info['part_index'],
documentId=part_info['document_id'],
processingTime=0.0,
metadata={"success": False, "error": str(result)}
))
elif result is not None:
processedResults.append(result)
logger.info(f"Completed processing {len(processedResults)} parts")
return processedResults
def _mergePartResults(
self,
partResults: List[PartResult],
options: Optional[AiCallOptions] = None
) -> str:
"""Merge part results using existing sophisticated merging system."""
if not partResults:
return ""
# Convert PartResults back to ContentParts for existing merger system
content_parts = []
for part_result in partResults:
# Create ContentPart from PartResult with proper typeGroup
content_part = ContentPart(
id=part_result.originalPart.id,
parentId=part_result.originalPart.parentId,
label=part_result.originalPart.label,
typeGroup=part_result.originalPart.typeGroup, # Use original typeGroup
mimeType=part_result.originalPart.mimeType,
data=part_result.aiResult, # Use AI result as data
metadata={
**part_result.originalPart.metadata,
"aiResult": True,
"partIndex": part_result.partIndex,
"documentId": part_result.documentId,
"processingTime": part_result.processingTime,
"success": part_result.metadata.get("success", False)
}
)
content_parts.append(content_part)
# Use existing merging strategy from options
merge_strategy = MergeStrategy(
useIntelligentMerging=True,
groupBy="documentId", # Group by document
orderBy="partIndex", # Order by part index
mergeType="concatenate"
)
# Apply existing merging logic using the sophisticated merging system
from modules.interfaces.interfaceAiObjects import applyMerging
merged_parts = applyMerging(content_parts, merge_strategy)
# Convert merged parts back to final string
final_content = "\n\n".join([part.data for part in merged_parts])
logger.info(f"Merged {len(partResults)} parts using existing sophisticated merging system")
return final_content.strip()