# gateway/modules/services/serviceExtraction/mainServiceExtraction.py

from typing import Any, Dict, List, Optional, Union
import uuid
import logging
import time
import asyncio
from .subRegistry import ExtractorRegistry, ChunkerRegistry
from .subPipeline import runExtraction, _applyMerging
from modules.datamodels.datamodelExtraction import ContentExtracted, ContentPart, MergeStrategy, ExtractionOptions, PartResult
from modules.datamodels.datamodelChat import ChatDocument
from modules.datamodels.datamodelAi import AiCallResponse, AiCallRequest, AiCallOptions, OperationTypeEnum
from modules.aicore.aicoreModelRegistry import modelRegistry
logger = logging.getLogger(__name__)

class ExtractionService:
    def __init__(self, services: Optional[Any] = None):
        self.services = services
        self._extractorRegistry = ExtractorRegistry()
        self._chunkerRegistry = ChunkerRegistry()
        # Ensure AI connectors are discovered so pricing models are available
        try:
            # If the internal model is missing, trigger discovery and registration
            if modelRegistry.getModel("internal-extractor") is None:
                discovered = modelRegistry.discoverConnectors()
                for connector in discovered:
                    modelRegistry.registerConnector(connector)
        except Exception:
            # Swallow discovery errors here; real failures surface during use.
            # Init should stay fast and otherwise side-effect free.
            pass

    def extractContent(self, documents: List[ChatDocument], options: ExtractionOptions) -> List[ContentExtracted]:
        """
        Extract content from a list of ChatDocument objects.

        Args:
            documents: List of ChatDocument objects to extract content from
            options: Extraction options including maxSize, chunkAllowed, mergeStrategy, etc.

        Returns:
            List of ContentExtracted objects, one per input document
        """
        results: List[ContentExtracted] = []
        # Lazy import to avoid circular deps and heavy init at module import
        from modules.interfaces.interfaceDbComponentObjects import getInterface
        dbInterface = getInterface()
        for i, doc in enumerate(documents):
            logger.info(f"=== DOCUMENT {i}: {doc.fileName} ===")
            logger.info(f"Initial MIME type: {doc.mimeType}")
            # Start timing for this document
            startTime = time.time()
            # Resolve raw bytes for this document using the DB interface
            documentBytes = dbInterface.getFileData(doc.fileId)
            if not documentBytes:
                raise ValueError(f"No file data found for fileId={doc.fileId}")
            # Convert ChatDocument to the format expected by runExtraction
            documentData = {
                "id": doc.id,
                "bytes": documentBytes,
                "fileName": doc.fileName,
                "mimeType": doc.mimeType
            }
            ec = runExtraction(
                extractorRegistry=self._extractorRegistry,
                chunkerRegistry=self._chunkerRegistry,
                documentBytes=documentData["bytes"],
                fileName=documentData["fileName"],
                mimeType=documentData["mimeType"],
                options=options
            )
            # Log content parts metadata
            logger.debug(f"Content parts: {len(ec.parts)}")
            for j, part in enumerate(ec.parts):
                logger.debug(f"  Part {j}: {part.typeGroup} ({part.mimeType}) - {len(part.data) if part.data else 0} chars")
                if part.metadata:
                    logger.debug(f"    Metadata: {part.metadata}")
            # Attach document id and MIME type to parts if missing
            for p in ec.parts:
                if "documentId" not in p.metadata:
                    p.metadata["documentId"] = documentData["id"] or str(uuid.uuid4())
                if "documentMimeType" not in p.metadata:
                    p.metadata["documentMimeType"] = documentData["mimeType"]
            # Log chunking information
            chunkedParts = [p for p in ec.parts if p.metadata.get("chunk", False)]
            if chunkedParts:
                logger.debug("=== CHUNKING RESULTS ===")
                logger.debug(f"Total parts: {len(ec.parts)}")
                logger.debug(f"Chunked parts: {len(chunkedParts)}")
                for chunk in chunkedParts:
                    logger.debug(f"  Chunk: {chunk.label} - {len(chunk.data)} chars (parent: {chunk.parentId})")
            else:
                logger.debug(f"No chunking needed - {len(ec.parts)} parts fit within size limits")
            # Calculate timing and emit stats
            endTime = time.time()
            processingTime = endTime - startTime
            bytesSent = len(documentBytes)
            bytesReceived = sum(len(part.data) if part.data else 0 for part in ec.parts)
            # Emit stats for the extraction operation, priced via the internal extraction model
            modelName = "internal-extractor"
            model = modelRegistry.getModel(modelName)
            # Hard fail if the model is missing; the caller must ensure connectors are registered
            if model is None or model.calculatePriceUsd is None:
                raise RuntimeError(f"Pricing model not available: {modelName}")
            priceUsd = model.calculatePriceUsd(processingTime, bytesSent, bytesReceived)
            # Create an AiCallResponse carrying the real cost calculation
            aiResponse = AiCallResponse(
                content="",  # No content needed for extraction stats
                modelName=modelName,
                priceUsd=priceUsd,
                processingTime=processingTime,
                bytesSent=bytesSent,
                bytesReceived=bytesReceived,
                errorCount=0
            )
            self.services.workflow.storeWorkflowStat(
                self.services.currentWorkflow,
                aiResponse,
                f"extraction.process.{doc.mimeType}"
            )
            results.append(ec)
        return results
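
    # A minimal usage sketch (assumption: ChatDocument, ExtractionOptions and
    # MergeStrategy accept the field names accessed above, and `appServices` is
    # illustrative; adjust to the real pydantic signatures):
    #
    #   service = ExtractionService(services=appServices)
    #   options = ExtractionOptions(
    #       prompt="Extract all invoice line items",
    #       operationType=OperationTypeEnum.DATA_EXTRACT,
    #       processDocumentsIndividually=True,
    #       mergeStrategy=MergeStrategy(mergeType="concatenate"),
    #   )
    #   extracted = service.extractContent(documents, options)  # one ContentExtracted per document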

    def mergeAiResults(
        self,
        extractedContent: List[ContentExtracted],
        aiResults: List[str],
        strategy: MergeStrategy
    ) -> ContentExtracted:
        """
        Merge AI results from chunked content back into a single ContentExtracted.

        Args:
            extractedContent: List of ContentExtracted objects that were processed
            aiResults: List of AI response strings, one per chunk
            strategy: Merge strategy configuration (MergeStrategy object)

        Returns:
            Single ContentExtracted with merged AI results
        """
        logger.debug("=== MERGING AI RESULTS ===")
        logger.debug(f"Extracted content: {len(extractedContent)} documents")
        logger.debug(f"AI results: {len(aiResults)} responses")
        logger.debug(f"Merge strategy: {strategy.mergeType}")
        # Collect all parts from all extracted content
        allParts: List[ContentPart] = []
        for ec in extractedContent:
            allParts.extend(ec.parts)
        logger.debug(f"Total original parts: {len(allParts)}")
        # Create AI result parts
        aiResultParts: List[ContentPart] = []
        for i, aiResult in enumerate(aiResults):
            aiPart = ContentPart(
                id=f"ai_result_{i}",
                parentId=None,  # Will be set based on strategy
                label="ai_result",
                typeGroup="text",
                mimeType="text/plain",
                data=aiResult,
                metadata={
                    "aiResult": True,
                    "order": i,
                    "size": len(aiResult.encode('utf-8'))
                }
            )
            aiResultParts.append(aiPart)
        logger.debug(f"Created {len(aiResultParts)} AI result parts")
        # Apply the merging strategy
        if strategy.mergeType == "concatenate":
            mergedParts = self._mergeConcatenate(allParts, aiResultParts, strategy)
        elif strategy.mergeType == "hierarchical":
            mergedParts = self._mergeHierarchical(allParts, aiResultParts, strategy)
        elif strategy.mergeType == "intelligent":
            mergedParts = self._mergeIntelligent(allParts, aiResultParts, strategy)
        else:
            # Default to concatenate
            mergedParts = self._mergeConcatenate(allParts, aiResultParts, strategy)
        # Create the final ContentExtracted
        mergedContent = ContentExtracted(
            id=f"merged_{uuid.uuid4()}",
            parts=mergedParts
        )
        logger.debug("=== MERGE COMPLETED ===")
        logger.debug(f"Final merged parts: {len(mergedParts)}")
        logger.debug(f"Merged content ID: {mergedContent.id}")
        return mergedContent
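
    # A hedged call sketch (mergeType selects one of the three branches above,
    # with "concatenate" as the fallback; field values are illustrative):
    #
    #   merged = service.mergeAiResults(
    #       extractedContent=extracted,
    #       aiResults=["summary of chunk 0", "summary of chunk 1"],
    #       strategy=MergeStrategy(mergeType="hierarchical", chunkSeparator="\n\n"),
    #   )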

    def _mergeConcatenate(
        self,
        originalParts: List[ContentPart],
        aiResultParts: List[ContentPart],
        strategy: MergeStrategy
    ) -> List[ContentPart]:
        """Merge parts by simple concatenation."""
        mergedParts = []
        # Add original parts (filtered if needed)
        for part in originalParts:
            if strategy.preserveChunks or not part.metadata.get("chunk", False):
                mergedParts.append(part)
        # Add AI results
        if aiResultParts:
            # Group AI results by parentId if available
            aiResultsByParent = {}
            for aiPart in aiResultParts:
                parentId = aiPart.parentId or "root"
                aiResultsByParent.setdefault(parentId, []).append(aiPart)
            # Merge AI results for each parent
            for parentId, aiParts in aiResultsByParent.items():
                if len(aiParts) == 1:
                    mergedParts.append(aiParts[0])
                else:
                    # Concatenate multiple AI results for the same parent
                    combinedData = strategy.chunkSeparator.join([p.data for p in aiParts])
                    combinedPart = ContentPart(
                        id=f"merged_ai_{parentId}",
                        parentId=parentId if parentId != "root" else None,
                        label="merged_ai_result",
                        typeGroup="text",
                        mimeType="text/plain",
                        data=combinedData,
                        metadata={
                            "aiResult": True,
                            "merged": True,
                            "sourceCount": len(aiParts),
                            "size": len(combinedData.encode('utf-8'))
                        }
                    )
                    mergedParts.append(combinedPart)
        return mergedParts
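
    # Worked example: two AI results sharing parentId "p1", with chunkSeparator
    # "\n---\n", collapse into one part whose data is "result A\n---\nresult B"
    # and whose metadata includes {"merged": True, "sourceCount": 2}; a lone
    # result for a parent is appended unchanged.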

    def _mergeHierarchical(
        self,
        originalParts: List[ContentPart],
        aiResultParts: List[ContentPart],
        strategy: MergeStrategy
    ) -> List[ContentPart]:
        """Merge parts hierarchically based on parentId relationships."""
        # Group original parts by parentId
        partsByParent = {}
        for part in originalParts:
            parentId = part.parentId or "root"
            partsByParent.setdefault(parentId, []).append(part)
        # Group AI results by parentId
        aiResultsByParent = {}
        for aiPart in aiResultParts:
            parentId = aiPart.parentId or "root"
            aiResultsByParent.setdefault(parentId, []).append(aiPart)
        mergedParts = []
        # Process each parent group
        for parentId in set(partsByParent) | set(aiResultsByParent):
            originalGroup = partsByParent.get(parentId, [])
            aiGroup = aiResultsByParent.get(parentId, [])
            # Add original parts
            mergedParts.extend(originalGroup)
            # Add AI results for this parent
            if aiGroup:
                if len(aiGroup) == 1:
                    mergedParts.append(aiGroup[0])
                else:
                    # Merge multiple AI results
                    combinedData = strategy.chunkSeparator.join([p.data for p in aiGroup])
                    combinedPart = ContentPart(
                        id=f"hierarchical_ai_{parentId}",
                        parentId=parentId if parentId != "root" else None,
                        label="hierarchical_ai_result",
                        typeGroup="text",
                        mimeType="text/plain",
                        data=combinedData,
                        metadata={
                            "aiResult": True,
                            "hierarchical": True,
                            "sourceCount": len(aiGroup),
                            "size": len(combinedData.encode('utf-8'))
                        }
                    )
                    mergedParts.append(combinedPart)
        return mergedParts

    def _mergeIntelligent(
        self,
        originalParts: List[ContentPart],
        aiResultParts: List[ContentPart],
        strategy: MergeStrategy
    ) -> List[ContentPart]:
        """Merge parts using intelligent strategies based on content type."""
        mergedParts = []
        # Group by typeGroup for intelligent merging
        partsByType = {}
        for part in originalParts:
            partsByType.setdefault(part.typeGroup, []).append(part)
        # Process each type group
        for typeGroup, parts in partsByType.items():
            if typeGroup == "text":
                mergedParts.extend(self._mergeTextIntelligent(parts, aiResultParts, strategy))
            elif typeGroup == "table":
                mergedParts.extend(self._mergeTableIntelligent(parts, aiResultParts, strategy))
            elif typeGroup == "structure":
                mergedParts.extend(self._mergeStructureIntelligent(parts, aiResultParts, strategy))
            else:
                # Default handling for other types
                mergedParts.extend(parts)
        # Add any remaining AI results that weren't merged
        for aiPart in aiResultParts:
            if not any(p.id == aiPart.id for p in mergedParts):
                mergedParts.append(aiPart)
        return mergedParts

    def _mergeTextIntelligent(
        self,
        textParts: List[ContentPart],
        aiResultParts: List[ContentPart],
        strategy: MergeStrategy
    ) -> List[ContentPart]:
        """Intelligent merging for text content."""
        # For now, use the concatenate strategy.
        # This could be enhanced with semantic analysis, summarization, etc.
        return self._mergeConcatenate(textParts, aiResultParts, strategy)

    def _mergeTableIntelligent(
        self,
        tableParts: List[ContentPart],
        aiResultParts: List[ContentPart],
        strategy: MergeStrategy
    ) -> List[ContentPart]:
        """Intelligent merging for table content."""
        # For now, use the concatenate strategy.
        # This could be enhanced with table merging logic.
        return self._mergeConcatenate(tableParts, aiResultParts, strategy)

    def _mergeStructureIntelligent(
        self,
        structureParts: List[ContentPart],
        aiResultParts: List[ContentPart],
        strategy: MergeStrategy
    ) -> List[ContentPart]:
        """Intelligent merging for structured content."""
        # For now, use the concatenate strategy.
        # This could be enhanced with structure-aware merging.
        return self._mergeConcatenate(structureParts, aiResultParts, strategy)

    async def processDocumentsPerChunk(
        self,
        documents: List[ChatDocument],
        prompt: str,
        aiObjects: Any,
        options: Optional[AiCallOptions] = None,
        operationId: Optional[str] = None
    ) -> str:
        """
        Process documents with model-aware chunking and merge the results.

        NEW: chunking is model-aware and happens in the AI call phase instead
        of the extraction phase.

        Args:
            documents: List of ChatDocument objects to process
            prompt: AI prompt for processing
            aiObjects: AiObjects instance for making AI calls
            options: AI call options
            operationId: Optional operation ID for progress tracking

        Returns:
            Merged AI results as a string with preserved document structure
        """
        if not documents:
            return ""
        # Create an operationId if not provided
        if not operationId:
            workflowId = self.services.currentWorkflow.id if self.services.currentWorkflow else f"no-workflow-{int(time.time())}"
            operationId = f"ai_text_extract_{workflowId}_{int(time.time())}"
        self.services.workflow.progressLogStart(
            operationId,
            "AI Text Extract",
            "Document Processing",
            f"Processing {len(documents)} documents"
        )
        try:
            # Build extraction options using the Pydantic models
            mergeStrategy = MergeStrategy(
                useIntelligentMerging=True,
                prompt=prompt,
                groupBy="typeGroup",
                orderBy="id",
                mergeType="concatenate"
            )
            extractionOptions = ExtractionOptions(
                prompt=prompt,
                operationType=options.operationType if options else OperationTypeEnum.DATA_EXTRACT,
                processDocumentsIndividually=True,
                mergeStrategy=mergeStrategy
            )
            logger.debug(f"Per-chunk extraction options: prompt length={len(extractionOptions.prompt)} chars, operationType={extractionOptions.operationType}")
            # Extract content WITHOUT chunking
            if operationId:
                self.services.workflow.progressLogUpdate(operationId, 0.1, f"Extracting content from {len(documents)} documents")
            extractionResult = self.extractContent(documents, extractionOptions)
            if not isinstance(extractionResult, list):
                if operationId:
                    self.services.workflow.progressLogFinish(operationId, False)
                return "[Error: No extraction results]"
            # Process parts (not chunks) with model-aware AI calls
            if operationId:
                self.services.workflow.progressLogUpdate(operationId, 0.3, f"Processing {len(extractionResult)} extracted content parts")
            partResults = await self._processPartsWithMapping(extractionResult, prompt, aiObjects, options, operationId)
            # Merge the results using the existing merging system
            if operationId:
                self.services.workflow.progressLogUpdate(operationId, 0.9, f"Merging {len(partResults)} part results")
            mergedContent = self._mergePartResults(partResults, options)
            # Save the merged extraction content for debugging
            self.services.utils.writeDebugFile(mergedContent or '', "extraction_merged_text")
            if operationId:
                self.services.workflow.progressLogFinish(operationId, True)
            return mergedContent
        except Exception as e:
            logger.error(f"Error in processDocumentsPerChunk: {str(e)}")
            if operationId:
                self.services.workflow.progressLogFinish(operationId, False)
            raise
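
    # A minimal async usage sketch (assumption: `aiObjects` exposes the awaitable
    # `call(AiCallRequest)` used below; `appServices` and `appServices.ai` are
    # illustrative names):
    #
    #   import asyncio
    #
    #   async def runExtract():
    #       service = ExtractionService(services=appServices)
    #       return await service.processDocumentsPerChunk(
    #           documents=documents,
    #           prompt="Extract key facts from each section",
    #           aiObjects=appServices.ai,
    #       )
    #
    #   mergedText = asyncio.run(runExtract())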

    async def _processPartsWithMapping(
        self,
        extractionResult: List[ContentExtracted],
        prompt: str,
        aiObjects: Any,
        options: Optional[AiCallOptions] = None,
        operationId: Optional[str] = None
    ) -> List[PartResult]:
        """Process content parts with model-aware chunking and proper mapping."""
        # Collect all parts that need processing
        partsToProcess = []
        partIndex = 0
        for ec in extractionResult:
            for part in ec.parts:
                if part.typeGroup in ("text", "table", "structure", "image", "container", "binary"):
                    # Skip empty container parts
                    if part.typeGroup == "container" and (not part.data or len(part.data.strip()) == 0):
                        logger.debug(f"Skipping empty container part: mimeType={part.mimeType}")
                        continue
                    partsToProcess.append({
                        'part': part,
                        'partIndex': partIndex,
                        'documentId': ec.id
                    })
                    partIndex += 1
        logger.info(f"Processing {len(partsToProcess)} parts with model-aware chunking")
        totalParts = len(partsToProcess)
        # Process parts in parallel
        processedCount = [0]  # Use a list to allow modification in the nested function

        async def processSinglePart(partInfo: Dict) -> PartResult:
            part = partInfo['part']
            partIdx = partInfo['partIndex']
            documentId = partInfo['documentId']
            startTime = time.time()
            try:
                # Create an AI call request carrying the content part
                request = AiCallRequest(
                    prompt=prompt,
                    context="",  # Context travels in the content part
                    options=options,
                    contentParts=[part]  # Pass as a list for unified processing
                )
                # Update progress before the AI call
                if operationId and totalParts > 0:
                    processedCount[0] += 1
                    progress = 0.3 + (processedCount[0] / totalParts * 0.6)  # Progress from 0.3 to 0.9
                    self.services.workflow.progressLogUpdate(operationId, progress, f"Processing part {processedCount[0]}/{totalParts}")
                # Call the AI with model-aware chunking
                response = await aiObjects.call(request)
                processingTime = time.time() - startTime
                return PartResult(
                    originalPart=part,
                    aiResult=response.content,
                    partIndex=partIdx,
                    documentId=documentId,
                    processingTime=processingTime,
                    metadata={
                        "success": True,
                        "partSize": len(part.data) if part.data else 0,
                        "resultSize": len(response.content),
                        "typeGroup": part.typeGroup,
                        "modelName": response.modelName,
                        "priceUsd": response.priceUsd
                    }
                )
            except Exception as e:
                processingTime = time.time() - startTime
                logger.warning(f"Error processing part {partIdx}: {str(e)}")
                return PartResult(
                    originalPart=part,
                    aiResult=f"[Error processing part: {str(e)}]",
                    partIndex=partIdx,
                    documentId=documentId,
                    processingTime=processingTime,
                    metadata={
                        "success": False,
                        "error": str(e),
                        "partSize": len(part.data) if part.data else 0,
                        "typeGroup": part.typeGroup
                    }
                )

        # Process parts with concurrency control
        maxConcurrent = getattr(options, 'maxConcurrentParts', 5) if options else 5
        semaphore = asyncio.Semaphore(maxConcurrent)

        async def processWithSemaphore(partInfo):
            async with semaphore:
                return await processSinglePart(partInfo)

        tasks = [processWithSemaphore(partInfo) for partInfo in partsToProcess]
        partResults = await asyncio.gather(*tasks, return_exceptions=True)
        # Handle exceptions raised by individual tasks
        processedResults = []
        for i, result in enumerate(partResults):
            if isinstance(result, Exception):
                partInfo = partsToProcess[i]
                processedResults.append(PartResult(
                    originalPart=partInfo['part'],
                    aiResult=f"[Error in parallel processing: {str(result)}]",
                    partIndex=partInfo['partIndex'],
                    documentId=partInfo['documentId'],
                    processingTime=0.0,
                    metadata={"success": False, "error": str(result)}
                ))
            elif result is not None:
                processedResults.append(result)
        logger.info(f"Completed processing {len(processedResults)} parts")
        return processedResults
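
    # The fan-out pattern above in isolation - a self-contained sketch of bounded
    # concurrency with asyncio (illustrative only; not tied to this module's types):
    #
    #   import asyncio
    #
    #   async def boundedGather(items, worker, limit=5):
    #       semaphore = asyncio.Semaphore(limit)
    #       async def guarded(item):
    #           async with semaphore:
    #               return await worker(item)
    #       return await asyncio.gather(*(guarded(i) for i in items), return_exceptions=True)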

    def _mergePartResults(
        self,
        partResults: List[PartResult],
        options: Optional[AiCallOptions] = None
    ) -> str:
        """Merge part results using the existing merging system."""
        if not partResults:
            return ""
        # Convert PartResults back to ContentParts for the existing merger system
        contentParts = []
        for partResult in partResults:
            # Create a ContentPart from the PartResult, keeping the original typeGroup
            contentPart = ContentPart(
                id=partResult.originalPart.id,
                parentId=partResult.originalPart.parentId,
                label=partResult.originalPart.label,
                typeGroup=partResult.originalPart.typeGroup,
                mimeType=partResult.originalPart.mimeType,
                data=partResult.aiResult,  # Use the AI result as the data
                metadata={
                    **partResult.originalPart.metadata,
                    "aiResult": True,
                    "partIndex": partResult.partIndex,
                    "documentId": partResult.documentId,
                    "processingTime": partResult.processingTime,
                    "success": partResult.metadata.get("success", False)
                }
            )
            contentParts.append(contentPart)
        # Build the merging strategy: group by document, order by part index
        mergeStrategy = MergeStrategy(
            useIntelligentMerging=True,
            groupBy="documentId",
            orderBy="partIndex",
            mergeType="concatenate"
        )
        # Apply the existing merging logic
        mergedParts = _applyMerging(contentParts, mergeStrategy)
        # Join the merged parts into the final string
        finalContent = "\n\n".join([part.data for part in mergedParts])
        logger.info(f"Merged {len(partResults)} parts using the existing merging system")
        return finalContent.strip()
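
# End-to-end flow of this module, as a hedged sketch (only the method names are
# authoritative; everything else is illustrative):
#
#   1. extractContent() resolves file bytes and runs runExtraction() per document,
#      emitting priced workflow stats.
#   2. processDocumentsPerChunk() fans the extracted parts out to the AI with
#      bounded concurrency via _processPartsWithMapping().
#   3. _mergePartResults() maps each PartResult back onto a ContentPart and
#      concatenates the merged parts, grouped by documentId and ordered by partIndex.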