from typing import Any, Dict, List, Optional, Union import uuid import logging import time import asyncio from .subRegistry import ExtractorRegistry, ChunkerRegistry from .subPipeline import runExtraction, _applyMerging from modules.datamodels.datamodelExtraction import ContentExtracted, ContentPart, MergeStrategy, ExtractionOptions, PartResult from modules.datamodels.datamodelChat import ChatDocument from modules.datamodels.datamodelAi import AiCallResponse, AiCallRequest, AiCallOptions, OperationTypeEnum from modules.aicore.aicoreModelRegistry import modelRegistry logger = logging.getLogger(__name__) class ExtractionService: def __init__(self, services: Optional[Any] = None): self.services = services self._extractorRegistry = ExtractorRegistry() self._chunkerRegistry = ChunkerRegistry() # Ensure AI connectors are discovered so pricing models are available try: # If internal model is missing, trigger discovery and registration if modelRegistry.getModel("internal-extractor") is None: discovered = modelRegistry.discoverConnectors() for connector in discovered: modelRegistry.registerConnector(connector) except Exception: # Propagate actual errors during use; init should be fast and side-effect free otherwise pass def extractContent(self, documents: List[ChatDocument], options: ExtractionOptions) -> List[ContentExtracted]: """ Extract content from a list of ChatDocument objects. Args: documents: List of ChatDocument objects to extract content from options: Extraction options including maxSize, chunkAllowed, mergeStrategy, etc. Returns: List of ContentExtracted objects, one per input document """ results: List[ContentExtracted] = [] # Lazy import to avoid circular deps and heavy init at module import from modules.interfaces.interfaceDbComponentObjects import getInterface dbInterface = getInterface() for i, doc in enumerate(documents): logger.info(f"=== DOCUMENT {i}: {doc.fileName} ===") logger.info(f"Initial MIME type: {doc.mimeType}") # Start timing for this document startTime = time.time() # Resolve raw bytes for this document using interface documentBytes = dbInterface.getFileData(doc.fileId) if not documentBytes: raise ValueError(f"No file data found for fileId={doc.fileId}") # Convert ChatDocument to the format expected by runExtraction documentData = { "id": doc.id, "bytes": documentBytes, "fileName": doc.fileName, "mimeType": doc.mimeType } ec = runExtraction( extractorRegistry=self._extractorRegistry, chunkerRegistry=self._chunkerRegistry, documentBytes=documentData["bytes"], fileName=documentData["fileName"], mimeType=documentData["mimeType"], options=options ) # Log content parts metadata logger.debug(f"Content parts: {len(ec.parts)}") for j, part in enumerate(ec.parts): logger.debug(f" Part {j}: {part.typeGroup} ({part.mimeType}) - {len(part.data) if part.data else 0} chars") if part.metadata: logger.debug(f" Metadata: {part.metadata}") # Attach document id and MIME type to parts if missing for p in ec.parts: if "documentId" not in p.metadata: p.metadata["documentId"] = documentData["id"] or str(uuid.uuid4()) if "documentMimeType" not in p.metadata: p.metadata["documentMimeType"] = documentData["mimeType"] # Log chunking information chunkedParts = [p for p in ec.parts if p.metadata.get("chunk", False)] if chunkedParts: logger.debug(f"=== CHUNKING RESULTS ===") logger.debug(f"Total parts: {len(ec.parts)}") logger.debug(f"Chunked parts: {len(chunkedParts)}") for chunk in chunkedParts: logger.debug(f" Chunk: {chunk.label} - {len(chunk.data)} chars (parent: {chunk.parentId})") else: logger.debug(f"No chunking needed - {len(ec.parts)} parts fit within size limits") # Calculate timing and emit stats endTime = time.time() processingTime = endTime - startTime bytesSent = len(documentBytes) bytesReceived = sum(len(part.data) if part.data else 0 for part in ec.parts) # Emit stats for extraction operation # Use internal extraction model for pricing modelName = "internal-extractor" model = modelRegistry.getModel(modelName) # Hard fail if model is missing; caller must ensure connectors are registered if model is None or model.calculatePriceUsd is None: raise RuntimeError(f"Pricing model not available: {modelName}") priceUsd = model.calculatePriceUsd(processingTime, bytesSent, bytesReceived) # Create AiCallResponse with real calculation aiResponse = AiCallResponse( content="", # No content for extraction stats needed modelName=modelName, priceUsd=priceUsd, processingTime=processingTime, bytesSent=bytesSent, bytesReceived=bytesReceived, errorCount=0 ) self.services.workflow.storeWorkflowStat( self.services.currentWorkflow, aiResponse, f"extraction.process.{doc.mimeType}" ) results.append(ec) return results def mergeAiResults( self, extractedContent: List[ContentExtracted], aiResults: List[str], strategy: MergeStrategy ) -> ContentExtracted: """ Merge AI results from chunked content back into a single ContentExtracted. Args: extractedContent: List of ContentExtracted objects that were processed aiResults: List of AI response strings, one per chunk strategy: Merge strategy configuration (dict or MergeStrategy object) Returns: Single ContentExtracted with merged AI results """ logger.debug(f"=== MERGING AI RESULTS ===") logger.debug(f"Extracted content: {len(extractedContent)} documents") logger.debug(f"AI results: {len(aiResults)} responses") logger.debug(f"Merge strategy: {strategy.mergeType}") mergeStrategy = strategy # Collect all parts from all extracted content allParts: List[ContentPart] = [] for ec in extractedContent: allParts.extend(ec.parts) logger.debug(f"Total original parts: {len(allParts)}") # Create AI result parts aiResultParts: List[ContentPart] = [] for i, aiResult in enumerate(aiResults): aiPart = ContentPart( id=f"ai_result_{i}", parentId=None, # Will be set based on strategy label="ai_result", typeGroup="text", mimeType="text/plain", data=aiResult, metadata={ "aiResult": True, "order": i, "size": len(aiResult.encode('utf-8')) } ) aiResultParts.append(aiPart) logger.debug(f"Created {len(aiResultParts)} AI result parts") # Apply merging strategy if mergeStrategy.mergeType == "concatenate": mergedParts = self._mergeConcatenate(allParts, aiResultParts, mergeStrategy) elif mergeStrategy.mergeType == "hierarchical": mergedParts = self._mergeHierarchical(allParts, aiResultParts, mergeStrategy) elif mergeStrategy.mergeType == "intelligent": mergedParts = self._mergeIntelligent(allParts, aiResultParts, mergeStrategy) else: # Default to concatenate mergedParts = self._mergeConcatenate(allParts, aiResultParts, mergeStrategy) # Create final ContentExtracted mergedContent = ContentExtracted( id=f"merged_{uuid.uuid4()}", parts=mergedParts ) logger.debug(f"=== MERGE COMPLETED ===") logger.debug(f"Final merged parts: {len(mergedParts)}") logger.debug(f"Merged content ID: {mergedContent.id}") return mergedContent def _mergeConcatenate( self, originalParts: List[ContentPart], aiResultParts: List[ContentPart], strategy: MergeStrategy ) -> List[ContentPart]: """Merge parts by simple concatenation.""" mergedParts = [] # Add original parts (filtered if needed) for part in originalParts: if strategy.preserveChunks or not part.metadata.get("chunk", False): mergedParts.append(part) # Add AI results if aiResultParts: # Group AI results by parentId if available aiResultsByParent = {} for aiPart in aiResultParts: parentId = aiPart.parentId or "root" if parentId not in aiResultsByParent: aiResultsByParent[parentId] = [] aiResultsByParent[parentId].append(aiPart) # Merge AI results for each parent for parentId, aiParts in aiResultsByParent.items(): if len(aiParts) == 1: mergedParts.append(aiParts[0]) else: # Concatenate multiple AI results for same parent combinedData = strategy.chunkSeparator.join([p.data for p in aiParts]) combinedPart = ContentPart( id=f"merged_ai_{parentId}", parentId=parentId if parentId != "root" else None, label="merged_ai_result", typeGroup="text", mimeType="text/plain", data=combinedData, metadata={ "aiResult": True, "merged": True, "sourceCount": len(aiParts), "size": len(combinedData.encode('utf-8')) } ) mergedParts.append(combinedPart) return mergedParts def _mergeHierarchical( self, originalParts: List[ContentPart], aiResultParts: List[ContentPart], strategy: MergeStrategy ) -> List[ContentPart]: """Merge parts hierarchically based on parentId relationships.""" # Group parts by parentId partsByParent = {} for part in originalParts: parentId = part.parentId or "root" if parentId not in partsByParent: partsByParent[parentId] = [] partsByParent[parentId].append(part) # Group AI results by parentId aiResultsByParent = {} for aiPart in aiResultParts: parentId = aiPart.parentId or "root" if parentId not in aiResultsByParent: aiResultsByParent[parentId] = [] aiResultsByParent[parentId].append(aiPart) mergedParts = [] # Process each parent group for parentId in set(list(partsByParent.keys()) + list(aiResultsByParent.keys())): originalGroup = partsByParent.get(parentId, []) aiGroup = aiResultsByParent.get(parentId, []) # Add original parts mergedParts.extend(originalGroup) # Add AI results for this parent if aiGroup: if len(aiGroup) == 1: mergedParts.append(aiGroup[0]) else: # Merge multiple AI results combinedData = strategy.chunkSeparator.join([p.data for p in aiGroup]) combinedPart = ContentPart( id=f"hierarchical_ai_{parentId}", parentId=parentId if parentId != "root" else None, label="hierarchical_ai_result", typeGroup="text", mimeType="text/plain", data=combinedData, metadata={ "aiResult": True, "hierarchical": True, "sourceCount": len(aiGroup), "size": len(combinedData.encode('utf-8')) } ) mergedParts.append(combinedPart) return mergedParts def _mergeIntelligent( self, originalParts: List[ContentPart], aiResultParts: List[ContentPart], strategy: MergeStrategy ) -> List[ContentPart]: """Merge parts using intelligent strategies based on content type.""" mergedParts = [] # Group by typeGroup for intelligent merging partsByType = {} for part in originalParts: typeGroup = part.typeGroup if typeGroup not in partsByType: partsByType[typeGroup] = [] partsByType[typeGroup].append(part) # Process each type group for typeGroup, parts in partsByType.items(): if typeGroup == "text": mergedParts.extend(self._mergeTextIntelligent(parts, aiResultParts, strategy)) elif typeGroup == "table": mergedParts.extend(self._mergeTableIntelligent(parts, aiResultParts, strategy)) elif typeGroup == "structure": mergedParts.extend(self._mergeStructureIntelligent(parts, aiResultParts, strategy)) else: # Default handling for other types mergedParts.extend(parts) # Add any remaining AI results that weren't merged for aiPart in aiResultParts: if not any(p.id == aiPart.id for p in mergedParts): mergedParts.append(aiPart) return mergedParts def _mergeTextIntelligent( self, textParts: List[ContentPart], aiResultParts: List[ContentPart], strategy: MergeStrategy ) -> List[ContentPart]: """Intelligent merging for text content.""" # For now, use concatenate strategy # This could be enhanced with semantic analysis, summarization, etc. return self._mergeConcatenate(textParts, aiResultParts, strategy) def _mergeTableIntelligent( self, tableParts: List[ContentPart], aiResultParts: List[ContentPart], strategy: MergeStrategy ) -> List[ContentPart]: """Intelligent merging for table content.""" # For now, use concatenate strategy # This could be enhanced with table merging logic return self._mergeConcatenate(tableParts, aiResultParts, strategy) def _mergeStructureIntelligent( self, structureParts: List[ContentPart], aiResultParts: List[ContentPart], strategy: MergeStrategy ) -> List[ContentPart]: """Intelligent merging for structured content.""" # For now, use concatenate strategy # This could be enhanced with structure-aware merging return self._mergeConcatenate(structureParts, aiResultParts, strategy) async def processDocumentsPerChunk( self, documents: List[ChatDocument], prompt: str, aiObjects: Any, options: Optional[AiCallOptions] = None, operationId: Optional[str] = None ) -> str: """ Process documents with model-aware chunking and merge results. NEW: Uses model-aware chunking in AI call phase instead of extraction phase. Args: documents: List of ChatDocument objects to process prompt: AI prompt for processing aiObjects: AiObjects instance for making AI calls options: AI call options operationId: Optional operation ID for progress tracking Returns: Merged AI results as string with preserved document structure """ if not documents: return "" # Create operationId if not provided if not operationId: workflowId = self.services.currentWorkflow.id if self.services.currentWorkflow else f"no-workflow-{int(time.time())}" operationId = f"ai_text_extract_{workflowId}_{int(time.time())}" self.services.workflow.progressLogStart( operationId, "AI Text Extract", "Document Processing", f"Processing {len(documents)} documents" ) try: # Build extraction options using Pydantic model mergeStrategy = MergeStrategy( useIntelligentMerging=True, prompt=prompt, groupBy="typeGroup", orderBy="id", mergeType="concatenate" ) extractionOptions = ExtractionOptions( prompt=prompt, operationType=options.operationType if options else OperationTypeEnum.DATA_EXTRACT, processDocumentsIndividually=True, mergeStrategy=mergeStrategy ) logger.debug(f"Per-chunk extraction options: prompt length={len(extractionOptions.prompt)} chars, operationType={extractionOptions.operationType}") # Extract content WITHOUT chunking if operationId: self.services.workflow.progressLogUpdate(operationId, 0.1, f"Extracting content from {len(documents)} documents") extractionResult = self.extractContent(documents, extractionOptions) if not isinstance(extractionResult, list): if operationId: self.services.workflow.progressLogFinish(operationId, False) return "[Error: No extraction results]" # Process parts (not chunks) with model-aware AI calls if operationId: self.services.workflow.progressLogUpdate(operationId, 0.3, f"Processing {len(extractionResult)} extracted content parts") partResults = await self._processPartsWithMapping(extractionResult, prompt, aiObjects, options, operationId) # Merge results using existing merging system if operationId: self.services.workflow.progressLogUpdate(operationId, 0.9, f"Merging {len(partResults)} part results") mergedContent = self._mergePartResults(partResults, options) # Save merged extraction content to debug self.services.utils.writeDebugFile(mergedContent or '', "extraction_merged_text") if operationId: self.services.workflow.progressLogFinish(operationId, True) return mergedContent except Exception as e: logger.error(f"Error in processDocumentsPerChunk: {str(e)}") if operationId: self.services.workflow.progressLogFinish(operationId, False) raise async def _processPartsWithMapping( self, extractionResult: List[ContentExtracted], prompt: str, aiObjects: Any, options: Optional[AiCallOptions] = None, operationId: Optional[str] = None ) -> List[PartResult]: """Process content parts with model-aware chunking and proper mapping.""" # Collect all parts that need processing partsToProcess = [] partIndex = 0 for ec in extractionResult: for part in ec.parts: if part.typeGroup in ("text", "table", "structure", "image", "container", "binary"): # Skip empty container parts if part.typeGroup == "container" and (not part.data or len(part.data.strip()) == 0): logger.debug(f"Skipping empty container part: mimeType={part.mimeType}") continue partsToProcess.append({ 'part': part, 'part_index': partIndex, 'document_id': ec.id }) partIndex += 1 logger.info(f"Processing {len(partsToProcess)} parts with model-aware chunking") totalParts = len(partsToProcess) # Process parts in parallel processedCount = [0] # Use list to allow modification in nested function async def processSinglePart(partInfo: Dict) -> PartResult: part = partInfo['part'] part_index = partInfo['part_index'] documentId = partInfo['document_id'] start_time = time.time() try: # Create AI call request with content part request = AiCallRequest( prompt=prompt, context="", # Context is in the content part options=options, contentParts=[part] # Pass as list for unified processing ) # Update progress before AI call if operationId and totalParts > 0: processedCount[0] += 1 progress = 0.3 + (processedCount[0] / totalParts * 0.6) # Progress from 0.3 to 0.9 self.services.workflow.progressLogUpdate(operationId, progress, f"Processing part {processedCount[0]}/{totalParts}") # Call AI with model-aware chunking response = await aiObjects.call(request) processing_time = time.time() - start_time return PartResult( originalPart=part, aiResult=response.content, partIndex=part_index, documentId=documentId, processingTime=processing_time, metadata={ "success": True, "partSize": len(part.data) if part.data else 0, "resultSize": len(response.content), "typeGroup": part.typeGroup, "modelName": response.modelName, "priceUsd": response.priceUsd } ) except Exception as e: processing_time = time.time() - start_time logger.warning(f"Error processing part {part_index}: {str(e)}") return PartResult( originalPart=part, aiResult=f"[Error processing part: {str(e)}]", partIndex=part_index, documentId=documentId, processingTime=processing_time, metadata={ "success": False, "error": str(e), "partSize": len(part.data) if part.data else 0, "typeGroup": part.typeGroup } ) # Process parts with concurrency control maxConcurrent = 5 if options and hasattr(options, 'maxConcurrentParts'): maxConcurrent = options.maxConcurrentParts semaphore = asyncio.Semaphore(maxConcurrent) async def processWithSemaphore(partInfo): async with semaphore: return await processSinglePart(partInfo) tasks = [processWithSemaphore(part_info) for part_info in partsToProcess] partResults = await asyncio.gather(*tasks, return_exceptions=True) # Handle exceptions processedResults = [] for i, result in enumerate(partResults): if isinstance(result, Exception): part_info = partsToProcess[i] processedResults.append(PartResult( originalPart=part_info['part'], aiResult=f"[Error in parallel processing: {str(result)}]", partIndex=part_info['part_index'], documentId=part_info['document_id'], processingTime=0.0, metadata={"success": False, "error": str(result)} )) elif result is not None: processedResults.append(result) logger.info(f"Completed processing {len(processedResults)} parts") return processedResults def _mergePartResults( self, partResults: List[PartResult], options: Optional[AiCallOptions] = None ) -> str: """Merge part results using existing sophisticated merging system.""" if not partResults: return "" # Convert PartResults back to ContentParts for existing merger system content_parts = [] for part_result in partResults: # Create ContentPart from PartResult with proper typeGroup content_part = ContentPart( id=part_result.originalPart.id, parentId=part_result.originalPart.parentId, label=part_result.originalPart.label, typeGroup=part_result.originalPart.typeGroup, # Use original typeGroup mimeType=part_result.originalPart.mimeType, data=part_result.aiResult, # Use AI result as data metadata={ **part_result.originalPart.metadata, "aiResult": True, "partIndex": part_result.partIndex, "documentId": part_result.documentId, "processingTime": part_result.processingTime, "success": part_result.metadata.get("success", False) } ) content_parts.append(content_part) # Use existing merging strategy from options merge_strategy = MergeStrategy( useIntelligentMerging=True, groupBy="documentId", # Group by document orderBy="partIndex", # Order by part index mergeType="concatenate" ) # Apply existing merging logic using the sophisticated merging system merged_parts = _applyMerging(content_parts, merge_strategy) # Convert merged parts back to final string final_content = "\n\n".join([part.data for part in merged_parts]) logger.info(f"Merged {len(partResults)} parts using existing sophisticated merging system") return final_content.strip()