# gateway/modules/services/serviceExtraction/mainServiceExtraction.py

from typing import Any, Dict, List, Optional, Union
import uuid
import logging
import time
import asyncio
from .subRegistry import ExtractorRegistry, ChunkerRegistry
from .subPipeline import runExtraction, _applyMerging
from modules.datamodels.datamodelExtraction import ContentExtracted, ContentPart, MergeStrategy, ExtractionOptions, PartResult
from modules.datamodels.datamodelChat import ChatDocument
from modules.datamodels.datamodelAi import AiCallResponse, AiCallRequest, AiCallOptions, OperationTypeEnum
from modules.aicore.aicoreModelRegistry import modelRegistry
logger = logging.getLogger(__name__)

class ExtractionService:
    def __init__(self, services: Optional[Any] = None):
        self.services = services
        self._extractorRegistry = ExtractorRegistry()
        self._chunkerRegistry = ChunkerRegistry()
        # Ensure AI connectors are discovered so pricing models are available
        try:
            # If the internal model is missing, trigger discovery and registration
            if modelRegistry.getModel("internal-extractor") is None:
                discovered = modelRegistry.discoverConnectors()
                for connector in discovered:
                    modelRegistry.registerConnector(connector)
        except Exception:
            # Swallow discovery errors here; real failures surface during use.
            # Init should stay fast and otherwise side-effect free.
            pass

    def extractContent(self, documents: List[ChatDocument], options: ExtractionOptions) -> List[ContentExtracted]:
        """
        Extract content from a list of ChatDocument objects.

        Args:
            documents: List of ChatDocument objects to extract content from
            options: Extraction options including maxSize, chunkAllowed, mergeStrategy, etc.

        Returns:
            List of ContentExtracted objects, one per input document
        """
        results: List[ContentExtracted] = []
        # Lazy import to avoid circular deps and heavy init at module import
        from modules.interfaces.interfaceDbComponentObjects import getInterface
        dbInterface = getInterface()
        for i, doc in enumerate(documents):
            logger.info(f"=== DOCUMENT {i}: {doc.fileName} ===")
            logger.info(f"Initial MIME type: {doc.mimeType}")
            # Start timing for this document
            startTime = time.time()
            # Resolve raw bytes for this document using the DB interface
            documentBytes = dbInterface.getFileData(doc.fileId)
            if not documentBytes:
                raise ValueError(f"No file data found for fileId={doc.fileId}")
            # Convert ChatDocument to the format expected by runExtraction
            documentData = {
                "id": doc.id,
                "bytes": documentBytes,
                "fileName": doc.fileName,
                "mimeType": doc.mimeType
            }
            ec = runExtraction(
                extractorRegistry=self._extractorRegistry,
                chunkerRegistry=self._chunkerRegistry,
                documentBytes=documentData["bytes"],
                fileName=documentData["fileName"],
                mimeType=documentData["mimeType"],
                options=options
            )
            # Log content parts metadata
            logger.debug(f"Content parts: {len(ec.parts)}")
            for j, part in enumerate(ec.parts):
                logger.debug(f"  Part {j}: {part.typeGroup} ({part.mimeType}) - {len(part.data) if part.data else 0} chars")
                if part.metadata:
                    logger.debug(f"    Metadata: {part.metadata}")
            # Attach document id and MIME type to parts if missing
            for p in ec.parts:
                if "documentId" not in p.metadata:
                    p.metadata["documentId"] = documentData["id"] or str(uuid.uuid4())
                if "documentMimeType" not in p.metadata:
                    p.metadata["documentMimeType"] = documentData["mimeType"]
            # Log chunking information
            chunkedParts = [p for p in ec.parts if p.metadata.get("chunk", False)]
            if chunkedParts:
                logger.debug("=== CHUNKING RESULTS ===")
                logger.debug(f"Total parts: {len(ec.parts)}")
                logger.debug(f"Chunked parts: {len(chunkedParts)}")
                for chunk in chunkedParts:
                    logger.debug(f"  Chunk: {chunk.label} - {len(chunk.data)} chars (parent: {chunk.parentId})")
            else:
                logger.debug(f"No chunking needed - {len(ec.parts)} parts fit within size limits")
            # Calculate timing and emit stats
            endTime = time.time()
            processingTime = endTime - startTime
            bytesSent = len(documentBytes)
            bytesReceived = sum(len(part.data) if part.data else 0 for part in ec.parts)
            # Emit stats for the extraction operation, priced via the internal extraction model
            modelName = "internal-extractor"
            model = modelRegistry.getModel(modelName)
            # Hard fail if the model is missing; the caller must ensure connectors are registered
            if model is None or model.calculatePriceUsd is None:
                raise RuntimeError(f"Pricing model not available: {modelName}")
            priceUsd = model.calculatePriceUsd(processingTime, bytesSent, bytesReceived)
            # Create an AiCallResponse carrying the real cost calculation
            aiResponse = AiCallResponse(
                content="",  # No content needed for extraction stats
                modelName=modelName,
                priceUsd=priceUsd,
                processingTime=processingTime,
                bytesSent=bytesSent,
                bytesReceived=bytesReceived,
                errorCount=0
            )
            self.services.workflow.storeWorkflowStat(
                self.services.currentWorkflow,
                aiResponse,
                f"extraction.process.{doc.mimeType}"
            )
            results.append(ec)
        return results
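
    # A minimal usage sketch (assumption: ChatDocument, ExtractionOptions and
    # MergeStrategy accept the field names accessed above, and `appServices` is
    # illustrative; adjust to the real pydantic signatures):
    #
    #   service = ExtractionService(services=appServices)
    #   options = ExtractionOptions(
    #       prompt="Extract all invoice line items",
    #       operationType=OperationTypeEnum.DATA_EXTRACT,
    #       processDocumentsIndividually=True,
    #       mergeStrategy=MergeStrategy(mergeType="concatenate"),
    #   )
    #   extracted = service.extractContent(documents, options)  # one ContentExtracted per document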

    def mergeAiResults(
        self,
        extractedContent: List[ContentExtracted],
        aiResults: List[str],
        strategy: MergeStrategy
    ) -> ContentExtracted:
        """
        Merge AI results from chunked content back into a single ContentExtracted.

        Args:
            extractedContent: List of ContentExtracted objects that were processed
            aiResults: List of AI response strings, one per chunk
            strategy: Merge strategy configuration (MergeStrategy object)

        Returns:
            Single ContentExtracted with merged AI results
        """
        logger.debug("=== MERGING AI RESULTS ===")
        logger.debug(f"Extracted content: {len(extractedContent)} documents")
        logger.debug(f"AI results: {len(aiResults)} responses")
        logger.debug(f"Merge strategy: {strategy.mergeType}")
        # Collect all parts from all extracted content
        allParts: List[ContentPart] = []
        for ec in extractedContent:
            allParts.extend(ec.parts)
        logger.debug(f"Total original parts: {len(allParts)}")
        # Create AI result parts
        aiResultParts: List[ContentPart] = []
        for i, aiResult in enumerate(aiResults):
            aiPart = ContentPart(
                id=f"ai_result_{i}",
                parentId=None,  # Will be set based on strategy
                label="ai_result",
                typeGroup="text",
                mimeType="text/plain",
                data=aiResult,
                metadata={
                    "aiResult": True,
                    "order": i,
                    "size": len(aiResult.encode('utf-8'))
                }
            )
            aiResultParts.append(aiPart)
        logger.debug(f"Created {len(aiResultParts)} AI result parts")
        # Apply the merging strategy
        if strategy.mergeType == "concatenate":
            mergedParts = self._mergeConcatenate(allParts, aiResultParts, strategy)
        elif strategy.mergeType == "hierarchical":
            mergedParts = self._mergeHierarchical(allParts, aiResultParts, strategy)
        elif strategy.mergeType == "intelligent":
            mergedParts = self._mergeIntelligent(allParts, aiResultParts, strategy)
        else:
            # Default to concatenate
            mergedParts = self._mergeConcatenate(allParts, aiResultParts, strategy)
        # Create the final ContentExtracted
        mergedContent = ContentExtracted(
            id=f"merged_{uuid.uuid4()}",
            parts=mergedParts
        )
        logger.debug("=== MERGE COMPLETED ===")
        logger.debug(f"Final merged parts: {len(mergedParts)}")
        logger.debug(f"Merged content ID: {mergedContent.id}")
        return mergedContent
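
    # A hedged call sketch (mergeType selects one of the three branches above,
    # with "concatenate" as the fallback; field values are illustrative):
    #
    #   merged = service.mergeAiResults(
    #       extractedContent=extracted,
    #       aiResults=["summary of chunk 0", "summary of chunk 1"],
    #       strategy=MergeStrategy(mergeType="hierarchical", chunkSeparator="\n\n"),
    #   )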

    def _mergeConcatenate(
        self,
        originalParts: List[ContentPart],
        aiResultParts: List[ContentPart],
        strategy: MergeStrategy
    ) -> List[ContentPart]:
        """Merge parts by simple concatenation."""
        mergedParts = []
        # Add original parts (filtered if needed)
        for part in originalParts:
            if strategy.preserveChunks or not part.metadata.get("chunk", False):
                mergedParts.append(part)
        # Add AI results
        if aiResultParts:
            # Group AI results by parentId if available
            aiResultsByParent = {}
            for aiPart in aiResultParts:
                parentId = aiPart.parentId or "root"
                aiResultsByParent.setdefault(parentId, []).append(aiPart)
            # Merge AI results for each parent
            for parentId, aiParts in aiResultsByParent.items():
                if len(aiParts) == 1:
                    mergedParts.append(aiParts[0])
                else:
                    # Concatenate multiple AI results for the same parent
                    combinedData = strategy.chunkSeparator.join([p.data for p in aiParts])
                    combinedPart = ContentPart(
                        id=f"merged_ai_{parentId}",
                        parentId=parentId if parentId != "root" else None,
                        label="merged_ai_result",
                        typeGroup="text",
                        mimeType="text/plain",
                        data=combinedData,
                        metadata={
                            "aiResult": True,
                            "merged": True,
                            "sourceCount": len(aiParts),
                            "size": len(combinedData.encode('utf-8'))
                        }
                    )
                    mergedParts.append(combinedPart)
        return mergedParts
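
    # Worked example: two AI results sharing parentId "p1", with chunkSeparator
    # "\n---\n", collapse into one part whose data is "result A\n---\nresult B"
    # and whose metadata includes {"merged": True, "sourceCount": 2}; a lone
    # result for a parent is appended unchanged.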

    def _mergeHierarchical(
        self,
        originalParts: List[ContentPart],
        aiResultParts: List[ContentPart],
        strategy: MergeStrategy
    ) -> List[ContentPart]:
        """Merge parts hierarchically based on parentId relationships."""
        # Group original parts by parentId
        partsByParent = {}
        for part in originalParts:
            parentId = part.parentId or "root"
            partsByParent.setdefault(parentId, []).append(part)
        # Group AI results by parentId
        aiResultsByParent = {}
        for aiPart in aiResultParts:
            parentId = aiPart.parentId or "root"
            aiResultsByParent.setdefault(parentId, []).append(aiPart)
        mergedParts = []
        # Process each parent group
        for parentId in set(partsByParent) | set(aiResultsByParent):
            originalGroup = partsByParent.get(parentId, [])
            aiGroup = aiResultsByParent.get(parentId, [])
            # Add original parts
            mergedParts.extend(originalGroup)
            # Add AI results for this parent
            if aiGroup:
                if len(aiGroup) == 1:
                    mergedParts.append(aiGroup[0])
                else:
                    # Merge multiple AI results
                    combinedData = strategy.chunkSeparator.join([p.data for p in aiGroup])
                    combinedPart = ContentPart(
                        id=f"hierarchical_ai_{parentId}",
                        parentId=parentId if parentId != "root" else None,
                        label="hierarchical_ai_result",
                        typeGroup="text",
                        mimeType="text/plain",
                        data=combinedData,
                        metadata={
                            "aiResult": True,
                            "hierarchical": True,
                            "sourceCount": len(aiGroup),
                            "size": len(combinedData.encode('utf-8'))
                        }
                    )
                    mergedParts.append(combinedPart)
        return mergedParts

    def _mergeIntelligent(
        self,
        originalParts: List[ContentPart],
        aiResultParts: List[ContentPart],
        strategy: MergeStrategy
    ) -> List[ContentPart]:
        """Merge parts using intelligent strategies based on content type."""
        mergedParts = []
        # Group by typeGroup for intelligent merging
        partsByType = {}
        for part in originalParts:
            partsByType.setdefault(part.typeGroup, []).append(part)
        # Process each type group
        for typeGroup, parts in partsByType.items():
            if typeGroup == "text":
                mergedParts.extend(self._mergeTextIntelligent(parts, aiResultParts, strategy))
            elif typeGroup == "table":
                mergedParts.extend(self._mergeTableIntelligent(parts, aiResultParts, strategy))
            elif typeGroup == "structure":
                mergedParts.extend(self._mergeStructureIntelligent(parts, aiResultParts, strategy))
            else:
                # Default handling for other types
                mergedParts.extend(parts)
        # Add any remaining AI results that weren't merged
        for aiPart in aiResultParts:
            if not any(p.id == aiPart.id for p in mergedParts):
                mergedParts.append(aiPart)
        return mergedParts

    def _mergeTextIntelligent(
        self,
        textParts: List[ContentPart],
        aiResultParts: List[ContentPart],
        strategy: MergeStrategy
    ) -> List[ContentPart]:
        """Intelligent merging for text content."""
        # For now, use the concatenate strategy.
        # This could be enhanced with semantic analysis, summarization, etc.
        return self._mergeConcatenate(textParts, aiResultParts, strategy)

    def _mergeTableIntelligent(
        self,
        tableParts: List[ContentPart],
        aiResultParts: List[ContentPart],
        strategy: MergeStrategy
    ) -> List[ContentPart]:
        """Intelligent merging for table content."""
        # For now, use the concatenate strategy.
        # This could be enhanced with table merging logic.
        return self._mergeConcatenate(tableParts, aiResultParts, strategy)

    def _mergeStructureIntelligent(
        self,
        structureParts: List[ContentPart],
        aiResultParts: List[ContentPart],
        strategy: MergeStrategy
    ) -> List[ContentPart]:
        """Intelligent merging for structured content."""
        # For now, use the concatenate strategy.
        # This could be enhanced with structure-aware merging.
        return self._mergeConcatenate(structureParts, aiResultParts, strategy)

    async def processDocumentsPerChunk(
        self,
        documents: List[ChatDocument],
        prompt: str,
        aiObjects: Any,
        options: Optional[AiCallOptions] = None,
        operationId: Optional[str] = None
    ) -> str:
        """
        Process documents with model-aware chunking and merge the results.

        NEW: chunking is model-aware and happens in the AI call phase instead
        of the extraction phase.

        Args:
            documents: List of ChatDocument objects to process
            prompt: AI prompt for processing
            aiObjects: AiObjects instance for making AI calls
            options: AI call options
            operationId: Optional operation ID for progress tracking

        Returns:
            Merged AI results as a string with preserved document structure
        """
        if not documents:
            return ""
        # Create an operationId if not provided
        if not operationId:
            workflowId = self.services.currentWorkflow.id if self.services.currentWorkflow else f"no-workflow-{int(time.time())}"
            operationId = f"ai_text_extract_{workflowId}_{int(time.time())}"
        self.services.workflow.progressLogStart(
            operationId,
            "AI Text Extract",
            "Document Processing",
            f"Processing {len(documents)} documents"
        )
        try:
            # Build extraction options using the Pydantic models
            mergeStrategy = MergeStrategy(
                useIntelligentMerging=True,
                prompt=prompt,
                groupBy="typeGroup",
                orderBy="id",
                mergeType="concatenate"
            )
            extractionOptions = ExtractionOptions(
                prompt=prompt,
                operationType=options.operationType if options else OperationTypeEnum.DATA_EXTRACT,
                processDocumentsIndividually=True,
                mergeStrategy=mergeStrategy
            )
            logger.debug(f"Per-chunk extraction options: prompt length={len(extractionOptions.prompt)} chars, operationType={extractionOptions.operationType}")
            # Extract content WITHOUT chunking
            if operationId:
                self.services.workflow.progressLogUpdate(operationId, 0.1, f"Extracting content from {len(documents)} documents")
            extractionResult = self.extractContent(documents, extractionOptions)
            if not isinstance(extractionResult, list):
                if operationId:
                    self.services.workflow.progressLogFinish(operationId, False)
                return "[Error: No extraction results]"
            # Process parts (not chunks) with model-aware AI calls
            if operationId:
                self.services.workflow.progressLogUpdate(operationId, 0.3, f"Processing {len(extractionResult)} extracted content parts")
            partResults = await self._processPartsWithMapping(extractionResult, prompt, aiObjects, options, operationId)
            # Merge the results using the existing merging system
            if operationId:
                self.services.workflow.progressLogUpdate(operationId, 0.9, f"Merging {len(partResults)} part results")
            mergedContent = self._mergePartResults(partResults, options)
            # Save the merged extraction content for debugging
            self.services.utils.writeDebugFile(mergedContent or '', "extraction_merged_text")
            if operationId:
                self.services.workflow.progressLogFinish(operationId, True)
            return mergedContent
        except Exception as e:
            logger.error(f"Error in processDocumentsPerChunk: {str(e)}")
            if operationId:
                self.services.workflow.progressLogFinish(operationId, False)
            raise
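
    # A minimal async usage sketch (assumption: `aiObjects` exposes the awaitable
    # `call(AiCallRequest)` used below; `appServices` and `appServices.ai` are
    # illustrative names):
    #
    #   import asyncio
    #
    #   async def runExtract():
    #       service = ExtractionService(services=appServices)
    #       return await service.processDocumentsPerChunk(
    #           documents=documents,
    #           prompt="Extract key facts from each section",
    #           aiObjects=appServices.ai,
    #       )
    #
    #   mergedText = asyncio.run(runExtract())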

    async def _processPartsWithMapping(
        self,
        extractionResult: List[ContentExtracted],
        prompt: str,
        aiObjects: Any,
        options: Optional[AiCallOptions] = None,
        operationId: Optional[str] = None
    ) -> List[PartResult]:
        """Process content parts with model-aware chunking and proper mapping."""
        # Collect all parts that need processing
        partsToProcess = []
        partIndex = 0
        for ec in extractionResult:
            for part in ec.parts:
                if part.typeGroup in ("text", "table", "structure", "image", "container", "binary"):
                    # Skip empty container parts
                    if part.typeGroup == "container" and (not part.data or len(part.data.strip()) == 0):
                        logger.debug(f"Skipping empty container part: mimeType={part.mimeType}")
                        continue
                    partsToProcess.append({
                        'part': part,
                        'partIndex': partIndex,
                        'documentId': ec.id
                    })
                    partIndex += 1
        logger.info(f"Processing {len(partsToProcess)} parts with model-aware chunking")
        totalParts = len(partsToProcess)
        # Process parts in parallel
        processedCount = [0]  # Use a list to allow modification in the nested function

        async def processSinglePart(partInfo: Dict) -> PartResult:
            part = partInfo['part']
            partIdx = partInfo['partIndex']
            documentId = partInfo['documentId']
            startTime = time.time()
            try:
                # Create an AI call request carrying the content part
                request = AiCallRequest(
                    prompt=prompt,
                    context="",  # Context travels in the content part
                    options=options,
                    contentParts=[part]  # Pass as a list for unified processing
                )
                # Update progress before the AI call
                if operationId and totalParts > 0:
                    processedCount[0] += 1
                    progress = 0.3 + (processedCount[0] / totalParts * 0.6)  # Progress from 0.3 to 0.9
                    self.services.workflow.progressLogUpdate(operationId, progress, f"Processing part {processedCount[0]}/{totalParts}")
                # Call the AI with model-aware chunking
                response = await aiObjects.call(request)
                processingTime = time.time() - startTime
                return PartResult(
                    originalPart=part,
                    aiResult=response.content,
                    partIndex=partIdx,
                    documentId=documentId,
                    processingTime=processingTime,
                    metadata={
                        "success": True,
                        "partSize": len(part.data) if part.data else 0,
                        "resultSize": len(response.content),
                        "typeGroup": part.typeGroup,
                        "modelName": response.modelName,
                        "priceUsd": response.priceUsd
                    }
                )
            except Exception as e:
                processingTime = time.time() - startTime
                logger.warning(f"Error processing part {partIdx}: {str(e)}")
                return PartResult(
                    originalPart=part,
                    aiResult=f"[Error processing part: {str(e)}]",
                    partIndex=partIdx,
                    documentId=documentId,
                    processingTime=processingTime,
                    metadata={
                        "success": False,
                        "error": str(e),
                        "partSize": len(part.data) if part.data else 0,
                        "typeGroup": part.typeGroup
                    }
                )

        # Process parts with concurrency control
        maxConcurrent = getattr(options, 'maxConcurrentParts', 5) if options else 5
        semaphore = asyncio.Semaphore(maxConcurrent)

        async def processWithSemaphore(partInfo):
            async with semaphore:
                return await processSinglePart(partInfo)

        tasks = [processWithSemaphore(partInfo) for partInfo in partsToProcess]
        partResults = await asyncio.gather(*tasks, return_exceptions=True)
        # Handle exceptions raised by individual tasks
        processedResults = []
        for i, result in enumerate(partResults):
            if isinstance(result, Exception):
                partInfo = partsToProcess[i]
                processedResults.append(PartResult(
                    originalPart=partInfo['part'],
                    aiResult=f"[Error in parallel processing: {str(result)}]",
                    partIndex=partInfo['partIndex'],
                    documentId=partInfo['documentId'],
                    processingTime=0.0,
                    metadata={"success": False, "error": str(result)}
                ))
            elif result is not None:
                processedResults.append(result)
        logger.info(f"Completed processing {len(processedResults)} parts")
        return processedResults
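
    # The fan-out pattern above in isolation - a self-contained sketch of bounded
    # concurrency with asyncio (illustrative only; not tied to this module's types):
    #
    #   import asyncio
    #
    #   async def boundedGather(items, worker, limit=5):
    #       semaphore = asyncio.Semaphore(limit)
    #       async def guarded(item):
    #           async with semaphore:
    #               return await worker(item)
    #       return await asyncio.gather(*(guarded(i) for i in items), return_exceptions=True)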

    def _mergePartResults(
        self,
        partResults: List[PartResult],
        options: Optional[AiCallOptions] = None
    ) -> str:
        """Merge part results using the existing merging system."""
        if not partResults:
            return ""
        # Convert PartResults back to ContentParts for the existing merger system
        contentParts = []
        for partResult in partResults:
            # Create a ContentPart from the PartResult, keeping the original typeGroup
            contentPart = ContentPart(
                id=partResult.originalPart.id,
                parentId=partResult.originalPart.parentId,
                label=partResult.originalPart.label,
                typeGroup=partResult.originalPart.typeGroup,
                mimeType=partResult.originalPart.mimeType,
                data=partResult.aiResult,  # Use the AI result as the data
                metadata={
                    **partResult.originalPart.metadata,
                    "aiResult": True,
                    "partIndex": partResult.partIndex,
                    "documentId": partResult.documentId,
                    "processingTime": partResult.processingTime,
                    "success": partResult.metadata.get("success", False)
                }
            )
            contentParts.append(contentPart)
        # Build the merging strategy: group by document, order by part index
        mergeStrategy = MergeStrategy(
            useIntelligentMerging=True,
            groupBy="documentId",
            orderBy="partIndex",
            mergeType="concatenate"
        )
        # Apply the existing merging logic
        mergedParts = _applyMerging(contentParts, mergeStrategy)
        # Join the merged parts into the final string
        finalContent = "\n\n".join([part.data for part in mergedParts])
        logger.info(f"Merged {len(partResults)} parts using the existing merging system")
        return finalContent.strip()
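
# End-to-end flow of this module, as a hedged sketch (only the method names are
# authoritative; everything else is illustrative):
#
#   1. extractContent() resolves file bytes and runs runExtraction() per document,
#      emitting priced workflow stats.
#   2. processDocumentsPerChunk() fans the extracted parts out to the AI with
#      bounded concurrency via _processPartsWithMapping().
#   3. _mergePartResults() maps each PartResult back onto a ContentPart and
#      concatenates the merged parts, grouped by documentId and ordered by partIndex.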