# serviceExtraction — extraction service module (original file: 1188 lines, 54 KiB, Python)
from typing import Any, Dict, List, Optional, Union
|
|
import uuid
|
|
import logging
|
|
import time
|
|
import asyncio
|
|
import base64
|
|
|
|
from .subRegistry import ExtractorRegistry, ChunkerRegistry
|
|
from .subPipeline import runExtraction
|
|
from modules.datamodels.datamodelExtraction import ContentExtracted, ContentPart, MergeStrategy, ExtractionOptions, PartResult
|
|
from modules.datamodels.datamodelChat import ChatDocument
|
|
from modules.datamodels.datamodelAi import AiCallResponse, AiCallRequest, AiCallOptions, OperationTypeEnum, AiModelCall
|
|
from modules.aicore.aicoreModelRegistry import modelRegistry
|
|
from modules.aicore.aicoreModelSelector import modelSelector
|
|
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
class ExtractionService:
|
|
def __init__(self, services: Optional[Any] = None):
    """Initialize the extraction service.

    Sets up the extractor/chunker registries, registers every discoverable
    AI connector, and verifies that the internal pricing model exists.

    Args:
        services: Optional service container (workflow, chat, utils, ...).

    Raises:
        RuntimeError: If the required internal pricing model is unavailable.
    """
    self.services = services
    self._extractorRegistry = ExtractorRegistry()
    self._chunkerRegistry = ChunkerRegistry()

    # Make sure every discoverable connector is registered before use
    for connector in modelRegistry.discoverConnectors():
        modelRegistry.registerConnector(connector)

    # Verify required internal model is available (used for pricing in extractContent)
    modelDisplayName = "Internal Document Extractor"
    pricingModel = modelRegistry.getModel(modelDisplayName)
    if pricingModel is None or pricingModel.calculatePriceUsd is None:
        raise RuntimeError(f"FATAL: Required internal model '{modelDisplayName}' is not available. Check connector registration.")
|
def extractContent(
    self,
    documents: List[ChatDocument],
    options: ExtractionOptions,
    operationId: Optional[str] = None,
    parentOperationId: Optional[str] = None
) -> List[ContentExtracted]:
    """
    Extract content from a list of ChatDocument objects.

    Documents are processed sequentially; a failure on one document is logged
    and skipped so the remaining documents are still processed.

    Args:
        documents: List of ChatDocument objects to extract content from
        options: Extraction options including maxSize, chunkAllowed, mergeStrategy, etc.
        operationId: Optional operation ID for progress logging (parent operation)
        parentOperationId: Optional parent operation ID for hierarchical logging
            (currently unused here; per-document child operations are parented
            to operationId directly)

    Returns:
        List of ContentExtracted objects, one per successfully processed document.
        May be shorter than `documents` when some documents fail.

    Raises:
        RuntimeError: If the internal pricing model is unavailable mid-run.
    """

    results: List[ContentExtracted] = []

    # Lazy import to avoid circular deps and heavy init at module import
    from modules.interfaces.interfaceDbComponentObjects import getInterface
    dbInterface = getInterface()

    totalDocs = len(documents)

    for i, doc in enumerate(documents):
        logger.info(f"=== DOCUMENT {i + 1}/{totalDocs}: {doc.fileName} ===")
        logger.info(f"Initial MIME type: {doc.mimeType}")

        # Create child operation for this document if parent operationId is provided
        docOperationId = None
        if operationId:
            docOperationId = f"{operationId}_doc_{i}"
            self.services.chat.progressLogStart(
                docOperationId,
                "Extracting Document",
                f"Document {i + 1}/{totalDocs}",
                doc.fileName[:50] + "..." if len(doc.fileName) > 50 else doc.fileName,
                parentOperationId=operationId  # Use operationId as parent (not parentOperationId)
            )

        # Start timing for this document
        startTime = time.time()

        try:
            if docOperationId:
                self.services.chat.progressLogUpdate(docOperationId, 0.1, "Loading document data")

            # Resolve raw bytes for this document using interface
            documentBytes = dbInterface.getFileData(doc.fileId)
            if not documentBytes:
                if docOperationId:
                    self.services.chat.progressLogFinish(docOperationId, False)
                raise ValueError(f"No file data found for fileId={doc.fileId}")

            if docOperationId:
                self.services.chat.progressLogUpdate(docOperationId, 0.2, "Running extraction pipeline")

            # Convert ChatDocument to the format expected by runExtraction
            documentData = {
                "id": doc.id,
                "bytes": documentBytes,
                "fileName": doc.fileName,
                "mimeType": doc.mimeType
            }

            ec = runExtraction(
                extractorRegistry=self._extractorRegistry,
                chunkerRegistry=self._chunkerRegistry,
                documentBytes=documentData["bytes"],
                fileName=documentData["fileName"],
                mimeType=documentData["mimeType"],
                options=options
            )

            if docOperationId:
                self.services.chat.progressLogUpdate(docOperationId, 0.7, f"Extracted {len(ec.parts)} parts")

            # Log content parts metadata
            logger.debug(f"Content parts: {len(ec.parts)}")
            for j, part in enumerate(ec.parts):
                logger.debug(f" Part {j + 1}/{len(ec.parts)}: {part.typeGroup} ({part.mimeType}) - {len(part.data) if part.data else 0} chars")
                if part.metadata:
                    logger.debug(f" Metadata: {part.metadata}")

            # Attach document id and MIME type to parts if missing
            for p in ec.parts:
                if "documentId" not in p.metadata:
                    p.metadata["documentId"] = documentData["id"] or str(uuid.uuid4())
                if "documentMimeType" not in p.metadata:
                    p.metadata["documentMimeType"] = documentData["mimeType"]

            # Log chunking information
            chunkedParts = [p for p in ec.parts if p.metadata.get("chunk", False)]
            if chunkedParts:
                logger.debug(f"=== CHUNKING RESULTS ===")
                logger.debug(f"Total parts: {len(ec.parts)}")
                logger.debug(f"Chunked parts: {len(chunkedParts)}")
                for chunk in chunkedParts:
                    logger.debug(f" Chunk: {chunk.label} - {len(chunk.data)} chars (parent: {chunk.parentId})")
            else:
                logger.debug(f"No chunking needed - {len(ec.parts)} parts fit within size limits")

            if docOperationId:
                self.services.chat.progressLogUpdate(docOperationId, 0.9, f"Processing complete: {len(ec.parts)} parts extracted")

            # Calculate timing and emit stats
            endTime = time.time()
            processingTime = endTime - startTime
            bytesSent = len(documentBytes)
            bytesReceived = sum(len(part.data) if part.data else 0 for part in ec.parts)

            # Use internal extraction model for pricing
            modelDisplayName = "Internal Document Extractor"
            model = modelRegistry.getModel(modelDisplayName)
            # Hard fail if model is missing; caller must ensure connectors are registered
            if model is None or model.calculatePriceUsd is None:
                if docOperationId:
                    self.services.chat.progressLogFinish(docOperationId, False)
                raise RuntimeError(f"Pricing model not available: {modelDisplayName}")
            priceUsd = model.calculatePriceUsd(processingTime, bytesSent, bytesReceived)

            # Create AiCallResponse with real calculation
            # Use model.name for the response (API identifier), not displayName
            aiResponse = AiCallResponse(
                content="",  # No content for extraction stats needed
                modelName=model.name,
                priceUsd=priceUsd,
                processingTime=processingTime,
                bytesSent=bytesSent,
                bytesReceived=bytesReceived,
                errorCount=0
            )

            self.services.chat.storeWorkflowStat(
                self.services.workflow,
                aiResponse,
                f"extraction.process.{doc.mimeType}"
            )

            # Write extraction results to debug file (best-effort; never fails extraction)
            try:
                from modules.shared.debugLogger import writeDebugFile
                import json
                # Create summary of extraction results for debug
                extractionSummary = {
                    "documentName": doc.fileName,
                    "documentMimeType": doc.mimeType,
                    "partsCount": len(ec.parts),
                    "parts": []
                }
                for part in ec.parts:
                    partSummary = {
                        "typeGroup": part.typeGroup,
                        "mimeType": part.mimeType,
                        "label": part.label,
                        "dataLength": len(part.data) if part.data else 0,
                        "metadata": part.metadata
                    }
                    # Include data preview for small parts (first 500 chars)
                    if part.data and len(part.data) <= 500:
                        partSummary["dataPreview"] = part.data[:500]
                    elif part.data:
                        partSummary["dataPreview"] = f"[Large data: {len(part.data)} chars - truncated]"
                    extractionSummary["parts"].append(partSummary)

                writeDebugFile(json.dumps(extractionSummary, indent=2, ensure_ascii=False), f"extraction_result_{doc.fileName}")
            except Exception as e:
                logger.debug(f"Failed to write extraction debug file: {str(e)}")

            results.append(ec)

            # Finish document operation successfully
            if docOperationId:
                self.services.chat.progressLogFinish(docOperationId, True)

        except Exception as e:
            logger.error(f"Error extracting content from document {i + 1}/{totalDocs} ({doc.fileName}): {str(e)}")
            if docOperationId:
                try:
                    self.services.chat.progressLogFinish(docOperationId, False)
                # BUG FIX: was a bare `except:` which also swallowed
                # SystemExit/KeyboardInterrupt; narrow to Exception.
                except Exception:
                    pass  # Don't fail on progress logging errors
            # Continue with next document instead of failing completely
            # This allows parallel processing to continue even if one document fails
            continue

    return results
|
def mergeAiResults(
    self,
    extractedContent: List[ContentExtracted],
    aiResults: List[str],
    strategy: MergeStrategy
) -> ContentExtracted:
    """
    Merge AI results from chunked content back into a single ContentExtracted.

    Args:
        extractedContent: List of ContentExtracted objects that were processed
        aiResults: List of AI response strings, one per chunk
        strategy: Merge strategy configuration

    Returns:
        Single ContentExtracted with merged AI results
    """
    logger.debug(f"=== MERGING AI RESULTS ===")
    logger.debug(f"Extracted content: {len(extractedContent)} documents")
    logger.debug(f"AI results: {len(aiResults)} responses")
    logger.debug(f"Merge strategy: {strategy.mergeType}")

    # Flatten every part from every extracted document into one working list
    allParts: List[ContentPart] = [part for ec in extractedContent for part in ec.parts]
    logger.debug(f"Total original parts: {len(allParts)}")

    # Wrap each raw AI response string in a ContentPart
    aiResultParts: List[ContentPart] = []
    for index, responseText in enumerate(aiResults):
        aiResultParts.append(ContentPart(
            id=f"ai_result_{index}",
            parentId=None,  # Will be set based on strategy
            label="ai_result",
            typeGroup="text",
            mimeType="text/plain",
            data=responseText,
            metadata={
                "aiResult": True,
                "order": index,
                "size": len(responseText.encode('utf-8'))
            }
        ))

    logger.debug(f"Created {len(aiResultParts)} AI result parts")

    # Dispatch to the configured merge implementation; concatenate is the fallback
    mergeHandlers = {
        "concatenate": self._mergeConcatenate,
        "hierarchical": self._mergeHierarchical,
        "intelligent": self._mergeIntelligent,
    }
    handler = mergeHandlers.get(strategy.mergeType, self._mergeConcatenate)
    mergedParts = handler(allParts, aiResultParts, strategy)

    # Create final ContentExtracted
    mergedContent = ContentExtracted(
        id=f"merged_{uuid.uuid4()}",
        parts=mergedParts
    )

    logger.debug(f"=== MERGE COMPLETED ===")
    logger.debug(f"Final merged parts: {len(mergedParts)}")
    logger.debug(f"Merged content ID: {mergedContent.id}")

    return mergedContent
|
def _mergeConcatenate(
    self,
    originalParts: List[ContentPart],
    aiResultParts: List[ContentPart],
    strategy: MergeStrategy
) -> List[ContentPart]:
    """Concatenate-style merge.

    Keeps eligible original parts (chunks are dropped unless
    strategy.preserveChunks is set), then appends AI results — single results
    as-is, multiple results for the same parent joined into one combined part.
    """
    # Original parts, filtered by the chunk-preservation flag
    merged: List[ContentPart] = [
        part for part in originalParts
        if strategy.preserveChunks or not part.metadata.get("chunk", False)
    ]

    if not aiResultParts:
        return merged

    # Group AI results by parentId ("root" stands in for no parent)
    resultsByParent: Dict[str, List[ContentPart]] = {}
    for aiPart in aiResultParts:
        resultsByParent.setdefault(aiPart.parentId or "root", []).append(aiPart)

    # Emit one part per parent: pass-through for singletons, joined otherwise
    for parentId, group in resultsByParent.items():
        if len(group) == 1:
            merged.append(group[0])
            continue
        combinedData = strategy.chunkSeparator.join(p.data for p in group)
        merged.append(ContentPart(
            id=f"merged_ai_{parentId}",
            parentId=None if parentId == "root" else parentId,
            label="merged_ai_result",
            typeGroup="text",
            mimeType="text/plain",
            data=combinedData,
            metadata={
                "aiResult": True,
                "merged": True,
                "sourceCount": len(group),
                "size": len(combinedData.encode('utf-8'))
            }
        ))

    return merged
|
def _mergeHierarchical(
    self,
    originalParts: List[ContentPart],
    aiResultParts: List[ContentPart],
    strategy: MergeStrategy
) -> List[ContentPart]:
    """Merge parts hierarchically based on parentId relationships.

    For each parent (including the synthetic "root" for parentless parts) the
    original parts are emitted first, then the AI results for that parent —
    joined into a single combined part when there is more than one.
    """
    # Bucket originals and AI results by parent id ("root" = no parent)
    partsByParent: Dict[str, List[ContentPart]] = {}
    for part in originalParts:
        partsByParent.setdefault(part.parentId or "root", []).append(part)

    aiByParent: Dict[str, List[ContentPart]] = {}
    for aiPart in aiResultParts:
        aiByParent.setdefault(aiPart.parentId or "root", []).append(aiPart)

    mergedParts: List[ContentPart] = []

    # Walk the union of all parent ids seen on either side
    for parentId in set(partsByParent) | set(aiByParent):
        mergedParts.extend(partsByParent.get(parentId, []))

        aiGroup = aiByParent.get(parentId, [])
        if not aiGroup:
            continue
        if len(aiGroup) == 1:
            mergedParts.append(aiGroup[0])
            continue

        # Multiple AI results for one parent: join into a single part
        combinedData = strategy.chunkSeparator.join(p.data for p in aiGroup)
        mergedParts.append(ContentPart(
            id=f"hierarchical_ai_{parentId}",
            parentId=None if parentId == "root" else parentId,
            label="hierarchical_ai_result",
            typeGroup="text",
            mimeType="text/plain",
            data=combinedData,
            metadata={
                "aiResult": True,
                "hierarchical": True,
                "sourceCount": len(aiGroup),
                "size": len(combinedData.encode('utf-8'))
            }
        ))

    return mergedParts
|
def _mergeIntelligent(
    self,
    originalParts: List[ContentPart],
    aiResultParts: List[ContentPart],
    strategy: MergeStrategy
) -> List[ContentPart]:
    """Merge parts using intelligent strategies based on content type.

    Original parts are grouped by typeGroup and merged per-type; AI result
    parts not consumed by a per-type merge are appended at the end.

    BUG FIX: every per-type merge receives the full aiResultParts list, so with
    more than one type group the same AI parts were appended once per group
    (the old `any(p.id == ...)` guard only prevented one additional copy).
    The output is now de-duplicated by part id, first occurrence wins.
    """
    mergedParts: List[ContentPart] = []

    # Group by typeGroup for intelligent merging
    partsByType: Dict[str, List[ContentPart]] = {}
    for part in originalParts:
        partsByType.setdefault(part.typeGroup, []).append(part)

    # Process each type group with its specialized merge
    for typeGroup, parts in partsByType.items():
        if typeGroup == "text":
            mergedParts.extend(self._mergeTextIntelligent(parts, aiResultParts, strategy))
        elif typeGroup == "table":
            mergedParts.extend(self._mergeTableIntelligent(parts, aiResultParts, strategy))
        elif typeGroup == "structure":
            mergedParts.extend(self._mergeStructureIntelligent(parts, aiResultParts, strategy))
        else:
            # Default handling for other types: pass originals through unchanged
            mergedParts.extend(parts)

    # De-duplicate by id: per-type merges may have added the same AI part repeatedly
    seenIds = set()
    dedupedParts: List[ContentPart] = []
    for part in mergedParts:
        if part.id not in seenIds:
            seenIds.add(part.id)
            dedupedParts.append(part)

    # Add any remaining AI results that weren't merged by a type group
    for aiPart in aiResultParts:
        if aiPart.id not in seenIds:
            seenIds.add(aiPart.id)
            dedupedParts.append(aiPart)

    return dedupedParts
|
def _mergeTextIntelligent(
    self,
    textParts: List[ContentPart],
    aiResultParts: List[ContentPart],
    strategy: MergeStrategy
) -> List[ContentPart]:
    """Intelligent merging for text content.

    Currently delegates to the concatenate strategy; a future version could
    add semantic analysis or summarization.
    """
    merged = self._mergeConcatenate(textParts, aiResultParts, strategy)
    return merged
|
def _mergeTableIntelligent(
    self,
    tableParts: List[ContentPart],
    aiResultParts: List[ContentPart],
    strategy: MergeStrategy
) -> List[ContentPart]:
    """Intelligent merging for table content.

    Currently delegates to the concatenate strategy; a future version could
    add table-aware merging logic.
    """
    merged = self._mergeConcatenate(tableParts, aiResultParts, strategy)
    return merged
|
def _mergeStructureIntelligent(
    self,
    structureParts: List[ContentPart],
    aiResultParts: List[ContentPart],
    strategy: MergeStrategy
) -> List[ContentPart]:
    """Intelligent merging for structured content.

    Currently delegates to the concatenate strategy; a future version could
    add structure-aware merging.
    """
    merged = self._mergeConcatenate(structureParts, aiResultParts, strategy)
    return merged
|
async def processDocumentsPerChunk(
    self,
    documents: List[ChatDocument],
    prompt: str,
    aiObjects: Any,
    options: Optional[AiCallOptions] = None,
    operationId: Optional[str] = None
) -> str:
    """
    Process documents with model-aware chunking and merge results.
    NEW: Uses model-aware chunking in AI call phase instead of extraction phase.

    Args:
        documents: List of ChatDocument objects to process
        prompt: AI prompt for processing
        aiObjects: AiObjects instance for making AI calls
        options: AI call options
        operationId: Optional operation ID for progress tracking

    Returns:
        Merged AI results as string with preserved document structure

    Raises:
        Exception: Re-raises any processing error after finishing the progress log.
    """
    if not documents:
        return ""

    # Create operationId if not provided
    if not operationId:
        workflowId = self.services.workflow.id if self.services.workflow else f"no-workflow-{int(time.time())}"
        operationId = f"ai_text_extract_{workflowId}_{int(time.time())}"
        self.services.chat.progressLogStart(
            operationId,
            "AI Text Extract",
            "Document Processing",
            f"Processing {len(documents)} documents"
        )

    try:
        # Build extraction options using Pydantic model
        mergeStrategy = MergeStrategy(
            useIntelligentMerging=True,
            prompt=prompt,
            groupBy="typeGroup",
            orderBy="id",
            mergeType="concatenate"
        )

        extractionOptions = ExtractionOptions(
            prompt=prompt,
            processDocumentsIndividually=True,
            mergeStrategy=mergeStrategy
        )

        logger.debug(f"Per-chunk extraction options: prompt length={len(extractionOptions.prompt)} chars")

        # Extract content WITHOUT chunking
        if operationId:
            self.services.chat.progressLogUpdate(operationId, 0.1, f"Extracting content from {len(documents)} documents")
        # BUG FIX: the original passed the not-yet-defined name `parentOperationId`
        # here (NameError on every call); this operation is itself the parent,
        # so pass operationId for hierarchical logging.
        extractionResult = self.extractContent(documents, extractionOptions, operationId=operationId, parentOperationId=operationId)

        if not isinstance(extractionResult, list):
            if operationId:
                self.services.chat.progressLogFinish(operationId, False)
            return "[Error: No extraction results]"

        # Process parts (not chunks) with model-aware AI calls
        if operationId:
            self.services.chat.progressLogUpdate(operationId, 0.3, f"Processing {len(extractionResult)} extracted content parts")
        # Use parent operation ID directly (parentId should be operationId, not log entry ID)
        parentOperationId = operationId
        partResults = await self._processPartsWithMapping(extractionResult, prompt, aiObjects, options, operationId, parentOperationId)

        # Merge results using existing merging system
        if operationId:
            self.services.chat.progressLogUpdate(operationId, 0.9, f"Merging {len(partResults)} part results")
        mergedContent = self.mergePartResults(partResults, options)

        # Save merged extraction content to debug
        self.services.utils.writeDebugFile(mergedContent or '', "extraction_merged_text")

        if operationId:
            self.services.chat.progressLogFinish(operationId, True)

        return mergedContent
    except Exception as e:
        logger.error(f"Error in processDocumentsPerChunk: {str(e)}")
        if operationId:
            self.services.chat.progressLogFinish(operationId, False)
        raise
|
async def _processPartsWithMapping(
    self,
    extractionResult: List[ContentExtracted],
    prompt: str,
    aiObjects: Any,
    options: Optional[AiCallOptions] = None,
    operationId: Optional[str] = None,
    parentOperationId: Optional[str] = None
) -> List[PartResult]:
    """Process content parts with model-aware chunking and proper mapping.

    Eligible parts are processed in parallel (bounded by a semaphore); a
    failure on one part becomes an error PartResult instead of aborting the
    batch. Dead locals from the original (`totalParts`, `processedCount`,
    unused `workflowId`) have been removed — no behavior change.

    Args:
        extractionResult: Extracted documents whose parts should be processed
        prompt: AI prompt applied to every part
        aiObjects: AiObjects instance used to make the AI calls
        options: Optional AI call options (maxConcurrentParts is honored)
        operationId: Parent operation ID used to derive per-part operation IDs
        parentOperationId: Parent for the per-part progress-log entries

    Returns:
        One PartResult per processed part (successful or failed).
    """

    # Collect all parts that need processing
    partsToProcess = []
    partIndex = 0

    for ec in extractionResult:
        for part in ec.parts:
            if part.typeGroup in ("text", "table", "structure", "image", "container", "binary"):
                # Skip empty container parts
                if part.typeGroup == "container" and (not part.data or len(part.data.strip()) == 0):
                    logger.debug(f"Skipping empty container part: mimeType={part.mimeType}")
                    continue

                partsToProcess.append({
                    'part': part,
                    'part_index': partIndex,
                    'document_id': ec.id
                })
                partIndex += 1

    logger.info(f"Processing {len(partsToProcess)} parts with model-aware chunking")

    async def processSinglePart(partInfo: Dict) -> PartResult:
        # Worker: run the AI call for one part and wrap the outcome in a PartResult.
        part = partInfo['part']
        part_index = partInfo['part_index']
        documentId = partInfo['document_id']

        start_time = time.time()

        # Create separate operation for each part with parent reference
        partOperationId = None
        if operationId:
            partOperationId = f"{operationId}_part_{part_index}"
            self.services.chat.progressLogStart(
                partOperationId,
                "Content Processing",
                f"Part {part_index + 1}",
                f"Type: {part.typeGroup}",
                parentOperationId=parentOperationId
            )

        try:
            # Create AI call request with content part
            request = AiCallRequest(
                prompt=prompt,
                context="",  # Context is in the content part
                options=options,
                contentParts=[part]  # Pass as list for unified processing
            )

            # Update progress - initiating
            if partOperationId:
                self.services.chat.progressLogUpdate(partOperationId, 0.3, "Initiating")

            # Call AI with model-aware chunking (no progress callback - handled by parent operation)
            response = await aiObjects.call(request)

            # Update progress - completed
            if partOperationId:
                self.services.chat.progressLogUpdate(partOperationId, 0.9, "Completed")
                self.services.chat.progressLogFinish(partOperationId, True)

            processing_time = time.time() - start_time

            return PartResult(
                originalPart=part,
                aiResult=response.content,
                partIndex=part_index,
                documentId=documentId,
                processingTime=processing_time,
                metadata={
                    "success": True,
                    "partSize": len(part.data) if part.data else 0,
                    "resultSize": len(response.content),
                    "typeGroup": part.typeGroup,
                    "modelName": response.modelName,
                    "priceUsd": response.priceUsd
                }
            )

        except Exception as e:
            processing_time = time.time() - start_time
            logger.warning(f"Error processing part {part_index}: {str(e)}")

            return PartResult(
                originalPart=part,
                aiResult=f"[Error processing part: {str(e)}]",
                partIndex=part_index,
                documentId=documentId,
                processingTime=processing_time,
                metadata={
                    "success": False,
                    "error": str(e),
                    "partSize": len(part.data) if part.data else 0,
                    "typeGroup": part.typeGroup
                }
            )

    # Process parts with concurrency control
    maxConcurrent = 5
    if options and hasattr(options, 'maxConcurrentParts'):
        maxConcurrent = options.maxConcurrentParts

    semaphore = asyncio.Semaphore(maxConcurrent)

    async def processWithSemaphore(partInfo):
        async with semaphore:
            return await processSinglePart(partInfo)

    tasks = [processWithSemaphore(part_info) for part_info in partsToProcess]
    partResults = await asyncio.gather(*tasks, return_exceptions=True)

    # Handle exceptions surfaced by gather (rare: workers catch their own errors)
    processedResults = []
    for i, result in enumerate(partResults):
        if isinstance(result, Exception):
            part_info = partsToProcess[i]
            processedResults.append(PartResult(
                originalPart=part_info['part'],
                aiResult=f"[Error in parallel processing: {str(result)}]",
                partIndex=part_info['part_index'],
                documentId=part_info['document_id'],
                processingTime=0.0,
                metadata={"success": False, "error": str(result)}
            ))
        elif result is not None:
            processedResults.append(result)

    logger.info(f"Completed processing {len(processedResults)} parts")
    return processedResults
|
def _convertToContentParts(
    self, partResults: Union[List[PartResult], List[AiCallResponse]]
) -> List[ContentPart]:
    """Convert part results to ContentParts (internal helper for consolidation).

    Handles both PartResult (from extraction workflow) and AiCallResponse
    (from content parts processing); the type of the first element decides
    which conversion applies.
    """
    if not partResults:
        return []

    converted: List[ContentPart] = []
    first = partResults[0]

    if isinstance(first, PartResult):
        # PartResult path (processDocumentsPerChunk): keep the original part's
        # identity and typing, substitute the AI result as its data.
        for pr in partResults:
            source = pr.originalPart
            converted.append(ContentPart(
                id=source.id,
                parentId=source.parentId,
                label=source.label,
                typeGroup=source.typeGroup,  # Use original typeGroup
                mimeType=source.mimeType,
                data=pr.aiResult,  # Use AI result as data
                metadata={
                    **source.metadata,
                    "aiResult": True,
                    "partIndex": pr.partIndex,
                    "documentId": pr.documentId,
                    "processingTime": pr.processingTime,
                    "success": pr.metadata.get("success", False)
                }
            ))
    elif isinstance(first, AiCallResponse):
        # AiCallResponse path (content parts processing): synthesize fresh
        # text parts, skipping empty responses.
        for position, response in enumerate(partResults):
            if not response.content:
                continue
            converted.append(ContentPart(
                id=str(uuid.uuid4()),
                parentId=None,
                label=f"ai_result_{position}",
                typeGroup="text",  # Default to text for AI results
                mimeType="text/plain",
                data=response.content,
                metadata={
                    "aiResult": True,
                    "modelName": response.modelName,
                    "priceUsd": response.priceUsd,
                    "processingTime": response.processingTime,
                    "bytesSent": response.bytesSent,
                    "bytesReceived": response.bytesReceived
                }
            ))

    return converted
|
def mergePartResults(
    self,
    partResults: Union[List[PartResult], List[AiCallResponse]],
    options: Optional[AiCallOptions] = None
) -> str:
    """Unified merge for both PartResult and AiCallResponse.

    Consolidated from both interfaceAiObjects.py and existing serviceExtraction method.

    Converts the results to ContentParts, applies a merge strategy chosen by
    the input element type, and joins the merged parts' data with blank lines.

    Args:
        partResults: Homogeneous list of PartResult or AiCallResponse objects.
        options: Accepted for interface compatibility; not read in this method.

    Returns:
        The merged, stripped text ("" when partResults is empty).
    """
    if not partResults:
        return ""

    # Convert to ContentParts using unified helper
    content_parts = self._convertToContentParts(partResults)

    # Determine merge strategy based on input type
    if isinstance(partResults[0], PartResult):
        # Use strategy for extraction workflow (group by document, order by part index)
        merge_strategy = MergeStrategy(
            useIntelligentMerging=True,
            groupBy="documentId",  # Group by document
            orderBy="partIndex",  # Order by part index
            mergeType="concatenate"
        )
    else:
        # Default strategy for content parts workflow
        merge_strategy = MergeStrategy(
            useIntelligentMerging=True,
            groupBy="typeGroup",
            orderBy="id",
            mergeType="concatenate"
        )

    # NOTE(review): `applyMerging` is not imported in this file's visible
    # header — confirm it is defined or imported elsewhere in this module,
    # otherwise this line raises NameError at runtime.
    # Apply merging
    merged_parts = applyMerging(content_parts, merge_strategy)

    # Convert back to string
    final_content = "\n\n".join([part.data for part in merged_parts])

    logger.info(f"Merged {len(partResults)} parts using unified merging system")
    return final_content.strip()
|
async def chunkContentPartForAi(self, contentPart, model, options, prompt: str = "") -> List[Dict[str, Any]]:
    """Chunk a content part based on model capabilities, accounting for prompt, system message overhead, and maxTokens output.

    Moved from interfaceAiObjects.py - model-aware chunking for AI processing.
    Complementary to existing size-based chunking in extraction pipeline.
    """
    contextTokens = model.contextLength     # total context window, in tokens
    maxOutputTokens = model.maxTokens       # tokens the model reserves for its reply

    # Token budget consumed before any document content fits:
    # user prompt (~4 bytes per token), system wrapper, JSON/message
    # overhead, and the full output reservation.
    promptTokens = len(prompt.encode('utf-8')) / 4 if prompt else 0
    systemMessageTokens = 10   # ~40 bytes = 10 tokens
    messageOverheadTokens = 100
    reservedTokens = promptTokens + systemMessageTokens + messageOverheadTokens + maxOutputTokens

    # 80% of what remains is usable content space (safety margin)
    availableContentTokens = int((contextTokens - reservedTokens) * 0.8)

    # Ensure we have at least some space
    if availableContentTokens < 100:
        logger.warning(f"Very limited space for content: {availableContentTokens} tokens available. Model: {model.name}, contextLength: {contextTokens}, maxTokens: {maxOutputTokens}, prompt: {promptTokens:.0f} tokens")
        availableContentTokens = max(100, int(contextTokens * 0.1))  # Fallback to 10% of context

    # Convert tokens to bytes (1 token ≈ 4 bytes)
    availableContentBytes = availableContentTokens * 4

    logger.debug(f"Chunking calculation for {model.name}: contextLength={contextTokens} tokens, maxTokens={maxOutputTokens} tokens, prompt={promptTokens:.0f} tokens, reserved={reservedTokens:.0f} tokens, available={availableContentTokens} tokens ({availableContentBytes} bytes)")

    # Build chunking options (70% of available bytes for text, 80% for images)
    chunkingOptions = {
        "textChunkSize": int(availableContentBytes * 0.7),
        "imageChunkSize": int(availableContentBytes * 0.8),
        "maxSize": availableContentBytes,
        "chunkAllowed": True
    }

    # Get appropriate chunker from the existing registry
    chunker = self._chunkerRegistry.resolve(contentPart.typeGroup)
    if not chunker:
        logger.warning(f"No chunker found for typeGroup: {contentPart.typeGroup}")
        return []

    # Chunk the content part; a chunker failure yields an empty list
    try:
        chunks = chunker.chunk(contentPart, chunkingOptions)
        logger.debug(f"Created {len(chunks)} chunks for {contentPart.typeGroup} part")
        return chunks
    except Exception as e:
        logger.error(f"Chunking failed for {contentPart.typeGroup}: {str(e)}")
        return []
|
async def processContentPartWithFallback(self, contentPart, prompt: str, options, failoverModelList, aiObjects, progressCallback=None) -> AiCallResponse:
    """Process a single content part with model-aware chunking and fallback.

    Moved from interfaceAiObjects.py - orchestrates chunking and merging.
    Calls aiObjects._callWithModel() for actual AI calls.

    Args:
        contentPart: The extracted content part to process (has typeGroup,
            mimeType and data attributes; data may be str or bytes).
        prompt: User prompt sent alongside the content.
        options: Call options; options.operationType is read to pick the
            operation type (presumably AiCallOptions - confirm at call site).
        failoverModelList: Ordered list of candidate models; each is tried
            in turn until one succeeds. Replaced by a vision-capable list
            for image parts.
        aiObjects: Interface object providing _callWithModel() for the
            actual AI invocation.
        progressCallback: Optional callable(fraction, message) invoked
            before/after each chunk when chunking is needed.

    Returns:
        AiCallResponse from the first model that succeeds, or an error
        response (errorCount=1) if every model in the list fails.
    """
    lastError = None

    # Images need vision-capable models; detect via typeGroup or MIME prefix.
    isImage = (contentPart.typeGroup == "image") or (contentPart.mimeType and contentPart.mimeType.startswith("image/"))

    # Determine the correct operation type based on content type.
    actualOperationType = options.operationType
    if isImage:
        actualOperationType = OperationTypeEnum.IMAGE_ANALYSE
        # Re-select the failover chain restricted to vision-capable models;
        # fall back to the caller-supplied list if none are found.
        availableModels = modelRegistry.getAvailableModels()
        visionFailoverList = modelSelector.getFailoverModelList(prompt, "", AiCallOptions(operationType=actualOperationType), availableModels)
        if visionFailoverList:
            logger.debug(f"Using {len(visionFailoverList)} vision-capable models for image processing")
            failoverModelList = visionFailoverList

    for attempt, model in enumerate(failoverModelList):
        try:
            logger.info(f"Processing content part with model: {model.name} (attempt {attempt + 1}/{len(failoverModelList)})")

            # --- Image path: build a multimodal message and call the model directly. ---
            # Only taken when the model exposes functionCall; otherwise the image
            # falls through to the text path below (may not be intended for
            # non-functionCall models - TODO confirm).
            if isImage and hasattr(model, 'functionCall'):
                try:
                    if not contentPart.data:
                        raise ValueError("Image content part has no data")

                    mimeType = contentPart.mimeType or "image/jpeg"
                    if not mimeType.startswith("image/"):
                        raise ValueError(f"Invalid mimeType for image: {mimeType}")

                    # Prepare base64 data: str payloads are validated as base64,
                    # bytes payloads are encoded here.
                    if isinstance(contentPart.data, str):
                        try:
                            # validate=True rejects non-base64 characters early.
                            base64.b64decode(contentPart.data, validate=True)
                            base64Data = contentPart.data
                        except Exception as e:
                            raise ValueError(f"Invalid base64 data in contentPart: {str(e)}")
                    elif isinstance(contentPart.data, bytes):
                        base64Data = base64.b64encode(contentPart.data).decode('utf-8')
                    else:
                        raise ValueError(f"Unsupported data type for image: {type(contentPart.data)}")

                    # data: URL form expected by the image_url message content type.
                    imageDataUrl = f"data:{mimeType};base64,{base64Data}"

                    modelCall = AiModelCall(
                        messages=[
                            {
                                "role": "user",
                                "content": [
                                    {"type": "text", "text": prompt or ""},
                                    {
                                        "type": "image_url",
                                        "image_url": {"url": imageDataUrl}
                                    }
                                ]
                            }
                        ],
                        model=model,
                        options=AiCallOptions(operationType=actualOperationType)
                    )

                    modelResponse = await model.functionCall(modelCall)

                    if not modelResponse.success:
                        raise ValueError(f"Model call failed: {modelResponse.error}")

                    logger.info(f"✅ Image content part processed successfully with model: {model.name}")

                    # processingTime may be absent or falsy on the raw model response.
                    processingTime = getattr(modelResponse, 'processingTime', None) or 0.0

                    # NOTE(review): priceUsd/bytesSent/bytesReceived are hardcoded
                    # to 0 on the image path, so image calls are not accounted -
                    # confirm whether billing/metrics expect real values here.
                    return AiCallResponse(
                        content=modelResponse.content,
                        modelName=model.name,
                        priceUsd=0.0,
                        processingTime=processingTime,
                        bytesSent=0,
                        bytesReceived=0,
                        errorCount=0
                    )
                except Exception as e:
                    lastError = e
                    logger.warning(f"❌ Image processing failed with model {model.name}: {str(e)}")

                    # continue advances the model loop; on the last attempt we
                    # re-raise so the outer except logs and breaks the loop.
                    if attempt < len(failoverModelList) - 1:
                        logger.info(f"🔄 Trying next fallback model for image processing...")
                        continue
                    else:
                        logger.error(f"💥 All {len(failoverModelList)} models failed for image processing")
                        raise

            # --- Text path: check whether the part fits the model context. ---
            # assumes contentPart.data is a str on this path (encode would fail
            # on bytes) - TODO confirm upstream guarantees.
            partSize = len(contentPart.data.encode('utf-8')) if contentPart.data else 0

            modelContextTokens = model.contextLength
            modelMaxOutputTokens = model.maxTokens

            # Rough token budget: ~4 bytes per token for the prompt, fixed
            # overheads for system message and chat framing, plus the full
            # output allowance reserved up front.
            promptTokens = len(prompt.encode('utf-8')) / 4 if prompt else 0
            systemMessageTokens = 10
            outputTokens = modelMaxOutputTokens
            messageOverheadTokens = 100
            totalReservedTokens = promptTokens + systemMessageTokens + messageOverheadTokens + outputTokens

            # Keep a 20% safety margin; if almost nothing is left, fall back
            # to 10% of the raw context (floor of 100 tokens).
            availableContentTokens = int((modelContextTokens - totalReservedTokens) * 0.8)
            if availableContentTokens < 100:
                availableContentTokens = max(100, int(modelContextTokens * 0.1))

            # Convert tokens back to a byte budget (1 token ≈ 4 bytes).
            availableContentBytes = availableContentTokens * 4

            logger.debug(f"Size check for {model.name}: partSize={partSize} bytes, availableContentBytes={availableContentBytes} bytes")

            if partSize <= availableContentBytes:
                # Part fits - call AI directly via aiObjects interface.
                response = await aiObjects._callWithModel(model, prompt, contentPart.data, options)
                logger.info(f"✅ Content part processed successfully with model: {model.name}")
                return response
            else:
                # Part too large - chunk it for this specific model's budget.
                chunks = await self.chunkContentPartForAi(contentPart, model, options, prompt)
                if not chunks:
                    raise ValueError(f"Failed to chunk content part for model {model.name}")

                logger.info(f"Starting to process {len(chunks)} chunks with model {model.name}")

                if progressCallback:
                    progressCallback(0.0, f"Starting to process {len(chunks)} chunks")

                # Process chunks sequentially; any chunk failure re-raises so
                # the outer except can fail over to the next model.
                chunkResults = []
                for idx, chunk in enumerate(chunks):
                    chunkNum = idx + 1
                    chunkData = chunk.get('data', '')
                    logger.info(f"Processing chunk {chunkNum}/{len(chunks)} with model {model.name}")

                    if progressCallback:
                        progressCallback(chunkNum / len(chunks), f"Processing chunk {chunkNum}/{len(chunks)}")

                    try:
                        chunkResponse = await aiObjects._callWithModel(model, prompt, chunkData, options)
                        chunkResults.append(chunkResponse)
                        logger.info(f"✅ Chunk {chunkNum}/{len(chunks)} processed successfully")

                        if progressCallback:
                            progressCallback(chunkNum / len(chunks), f"Chunk {chunkNum}/{len(chunks)} processed")
                    except Exception as e:
                        logger.error(f"❌ Error processing chunk {chunkNum}/{len(chunks)}: {str(e)}")
                        raise

                # Merge chunk results into one content payload.
                mergedContent = self.mergeChunkResults(chunkResults)

                logger.info(f"✅ Content part chunked and processed with model: {model.name} ({len(chunks)} chunks)")
                # Aggregate accounting across all chunk calls.
                return AiCallResponse(
                    content=mergedContent,
                    modelName=model.name,
                    priceUsd=sum(r.priceUsd for r in chunkResults),
                    processingTime=sum(r.processingTime for r in chunkResults),
                    bytesSent=sum(r.bytesSent for r in chunkResults),
                    bytesReceived=sum(r.bytesReceived for r in chunkResults),
                    errorCount=sum(r.errorCount for r in chunkResults)
                )

        except Exception as e:
            # Outer failover handler: any failure of the current model (image
            # path re-raise, size path, chunk failure) lands here.
            lastError = e
            error_msg = str(e) if str(e) else f"{type(e).__name__}"
            logger.warning(f"❌ Model {model.name} failed for content part: {error_msg}", exc_info=True)

            if attempt < len(failoverModelList) - 1:
                logger.info(f"🔄 Trying next failover model...")
                continue
            else:
                logger.error(f"💥 All {len(failoverModelList)} models failed for content part")
                break

    # All models failed - return an error response instead of raising.
    return self._createErrorResponse(f"All models failed: {str(lastError)}", 0, 0)
|
def _createErrorResponse(self, errorMsg: str, inputBytes: int, outputBytes: int) -> AiCallResponse:
    """Build a sentinel AiCallResponse describing a failed call.

    Args:
        errorMsg: Human-readable failure description, placed in content.
        inputBytes: Bytes that were sent before the failure.
        outputBytes: Bytes that were received before the failure.

    Returns:
        AiCallResponse with modelName "error", zero price/time and
        errorCount set to 1.
    """
    failurePayload = {
        "content": errorMsg,
        "modelName": "error",
        "priceUsd": 0.0,
        "processingTime": 0.0,
        "bytesSent": inputBytes,
        "bytesReceived": outputBytes,
        "errorCount": 1,
    }
    return AiCallResponse(**failurePayload)
|
async def processContentPartsWithAi(
    self,
    request: AiCallRequest,
    aiObjects,  # Pass interface for AI calls
    progressCallback=None
) -> AiCallResponse:
    """Process content parts with model-aware chunking and AI calls.

    Moved from interfaceAiObjects.callWithContentParts() - entry point for
    content parts processing.

    Args:
        request: Carries prompt, options and the contentParts to process.
        aiObjects: Interface object used for the actual AI calls.
        progressCallback: Optional progress hook forwarded to per-part
            processing.

    Returns:
        AiCallResponse with all part results merged; accounting fields are
        summed across parts and modelName is "multiple". Returns an error
        response if no suitable model is available.
    """
    # Resolve the failover chain once; every part reuses the same chain.
    candidateModels = modelRegistry.getAvailableModels()
    failoverChain = modelSelector.getFailoverModelList(request.prompt, "", request.options, candidateModels)

    if not failoverChain:
        return self._createErrorResponse("No suitable models found", 0, 0)

    # Parts are processed sequentially; each gets its own chunking + fallback.
    partResponses = [
        await self.processContentPartWithFallback(
            part, request.prompt, request.options, failoverChain, aiObjects, progressCallback
        )
        for part in request.contentParts
    ]

    # Merge all results using unified mergePartResults
    combinedContent = self.mergePartResults(partResponses)

    def _total(attr: str):
        # Sum one accounting attribute across every part response.
        return sum(getattr(r, attr) for r in partResponses)

    return AiCallResponse(
        content=combinedContent,
        modelName="multiple",
        priceUsd=_total("priceUsd"),
        processingTime=_total("processingTime"),
        bytesSent=_total("bytesSent"),
        bytesReceived=_total("bytesReceived"),
        errorCount=_total("errorCount"),
    )
|
|
|
|
|
|
# Module-level function for use by subPipeline and ExtractionService
|
|
def applyMerging(parts: List[ContentPart], strategy: MergeStrategy) -> List[ContentPart]:
    """Apply merging strategy to parts with intelligent token-aware merging.

    Moved from interfaceAiObjects.py to resolve dependency violations.
    Can be used as module-level function or called from ExtractionService methods.

    Args:
        parts: Content parts to merge.
        strategy: Merge strategy; useIntelligentMerging selects the
            token-aware merger, otherwise parts are merged per typeGroup.

    Returns:
        The merged list of content parts.
    """
    logger.debug(f"applyMerging called with {len(parts)} parts")

    # Import merging dependencies (now local imports ✅)
    from .merging.mergerText import TextMerger
    from .merging.mergerTable import TableMerger
    from .merging.mergerDefault import DefaultMerger
    from .subMerger import IntelligentTokenAwareMerger

    # Token-aware path: one merger handles every part regardless of type.
    if strategy.useIntelligentMerging:
        capabilities = strategy.capabilities or {}
        tokenAwareMerger = IntelligentTokenAwareMerger(capabilities)

        optimized = tokenAwareMerger.mergeChunksIntelligently(parts, strategy.prompt or "")

        # Log how many AI calls the optimization saved.
        stats = tokenAwareMerger.calculateOptimizationStats(parts, optimized)
        logger.info(f"🧠 Intelligent merging stats: {stats}")
        logger.debug(f"Intelligent merging: {stats['original_ai_calls']} → {stats['optimized_ai_calls']} calls ({stats['reduction_percent']}% reduction)")

        return optimized

    # Traditional path: bucket parts by typeGroup in a single pass, then
    # delegate each bucket to its dedicated merger.
    textParts: List[ContentPart] = []
    tableParts: List[ContentPart] = []
    structureParts: List[ContentPart] = []
    otherParts: List[ContentPart] = []
    for part in parts:
        if part.typeGroup == "text":
            textParts.append(part)
        elif part.typeGroup == "table":
            tableParts.append(part)
        elif part.typeGroup == "structure":
            structureParts.append(part)
        else:
            otherParts.append(part)

    logger.debug(f"Grouped - text: {len(textParts)}, table: {len(tableParts)}, structure: {len(structureParts)}, other: {len(otherParts)}")

    textMerger = TextMerger()
    tableMerger = TableMerger()
    defaultMerger = DefaultMerger()

    combined: List[ContentPart] = []

    if textParts:
        textMerged = textMerger.merge(textParts, strategy)
        logger.debug(f"TextMerger merged {len(textParts)} parts into {len(textMerged)} parts")
        combined.extend(textMerged)
    if tableParts:
        tableMerged = tableMerger.merge(tableParts, strategy)
        logger.debug(f"TableMerger merged {len(tableParts)} parts into {len(tableMerged)} parts")
        combined.extend(tableMerged)
    if structureParts:
        # For now, treat structure like text
        structureMerged = textMerger.merge(structureParts, strategy)
        logger.debug(f"StructureMerger merged {len(structureParts)} parts into {len(structureMerged)} parts")
        combined.extend(structureMerged)
    if otherParts:
        otherMerged = defaultMerger.merge(otherParts, strategy)
        logger.debug(f"DefaultMerger merged {len(otherParts)} parts into {len(otherMerged)} parts")
        combined.extend(otherMerged)

    logger.debug(f"applyMerging returning {len(combined)} parts")
    return combined
|
|
|
|
|