gateway/modules/services/serviceExtraction/subPipeline.py

from typing import Any, Dict, List
import logging
import os

from modules.datamodels.datamodelExtraction import ContentExtracted, ContentPart
from .subUtils import makeId
from .subRegistry import ExtractorRegistry, ChunkerRegistry
from .merging.mergerText import TextMerger
from .merging.mergerTable import TableMerger
from .merging.mergerDefault import DefaultMerger
from .subMerger import IntelligentTokenAwareMerger

logger = logging.getLogger(__name__)


def _mergeParts(parts: List[ContentPart], mergeStrategy: Dict[str, Any]) -> List[ContentPart]:
    """Merge parts based on the provided strategy."""
    if not parts or not mergeStrategy:
        return parts

    groupBy = mergeStrategy.get("groupBy", "typeGroup")
    orderBy = mergeStrategy.get("orderBy", "id")

    # Group parts by the specified field
    groups = {}
    for part in parts:
        key = getattr(part, groupBy, "unknown")
        if key not in groups:
            groups[key] = []
        groups[key].append(part)

    # Merge each group
    merged_parts = []
    for group_key, group_parts in groups.items():
        if len(group_parts) == 1:
            merged_parts.extend(group_parts)
        else:
            # Sort by orderBy field if specified
            if orderBy:
                group_parts.sort(key=lambda p: getattr(p, orderBy, ""))

            # Use appropriate merger based on type
            type_group = group_parts[0].typeGroup if group_parts else "unknown"

            if type_group == "text":
                merger = TextMerger()
            elif type_group == "table":
                merger = TableMerger()
            else:
                merger = DefaultMerger()

            # Merge the group
            merged = merger.merge(group_parts, mergeStrategy)
            merged_parts.extend(merged)

    return merged_parts


def runExtraction(extractorRegistry: ExtractorRegistry, chunkerRegistry: ChunkerRegistry, documentBytes: bytes, fileName: str, mimeType: str, options: Dict[str, Any]) -> ContentExtracted:
    extractor = extractorRegistry.resolve(mimeType, fileName)
    if extractor is None:
        # fallback: single binary part
        part = ContentPart(
            id=makeId(),
            parentId=None,
            label="file",
            typeGroup="binary",
            mimeType=mimeType or "application/octet-stream",
            data="",
            metadata={"warning": "No extractor registered"}
        )
        return ContentExtracted(id=makeId(), parts=[part])

    parts = extractor.extract(documentBytes, {"fileName": fileName, "mimeType": mimeType, "options": options})

    # Apply chunking and size limiting
    parts = poolAndLimit(parts, chunkerRegistry, options)

    # Optional merge step - but preserve chunks
    mergeStrategy = options.get("mergeStrategy", {})
    if mergeStrategy:

        # Don't merge chunks - they should stay separate for processing
        non_chunk_parts = [p for p in parts if not p.metadata.get("chunk", False)]
        chunk_parts = [p for p in parts if p.metadata.get("chunk", False)]

        logger.debug(f"runExtraction: Preserving {len(chunk_parts)} chunks from merging")
        print(f"🔍 DEBUG: runExtraction - non_chunk_parts: {len(non_chunk_parts)}, chunk_parts: {len(chunk_parts)}")

    # Apply intelligent merging for small text parts
    if non_chunk_parts:
        # Count text parts
        text_parts = [p for p in non_chunk_parts if p.typeGroup == "text"]
        if len(text_parts) > 5:  # If we have many small text parts, merge them
            logger.info(f"🔧 Merging {len(text_parts)} small text parts for efficiency")
            non_chunk_parts = _mergeParts(non_chunk_parts, mergeStrategy)

        # Combine non-chunk parts with chunk parts (chunks stay separate)
        parts = non_chunk_parts + chunk_parts

        logger.debug(f"runExtraction: Final parts after merging: {len(parts)} (chunks: {len(chunk_parts)})")
        print(f"🔍 DEBUG: runExtraction - Final parts: {len(parts)} (chunks: {len(chunk_parts)})")
    # DEBUG: dump parts and chunks to files  TODO TO REMOVE
    try:
        base_dir = "./test-chat/ai"
        os.makedirs(base_dir, exist_ok=True)

        # Generate timestamp for consistent naming
        from datetime import datetime, UTC
        ts = datetime.now(UTC).strftime('%Y%m%d-%H%M%S-%f')[:-3]

        # Write a summary file
        summary_lines: List[str] = [f"fileName: {fileName}", f"mimeType: {mimeType}", f"totalParts: {len(parts)}"]
        text_index = 0
        for idx, part in enumerate(parts):
            is_texty = part.typeGroup in ("text", "table", "structure")
            size = int(part.metadata.get("size", 0) or 0)
            is_chunk = bool(part.metadata.get("chunk", False))
            summary_lines.append(
                f"part[{idx}]: typeGroup={part.typeGroup}, label={part.label}, size={size}, chunk={is_chunk}"
            )
            if is_texty and getattr(part, "data", None):
                text_index += 1
                fname = f"{ts}_extract_{fileName}_part_{idx:03d}_{'chunk' if is_chunk else 'full'}_{text_index:03d}.txt"
                fpath = os.path.join(base_dir, fname)
                with open(fpath, "w", encoding="utf-8") as f:
                    f.write(f"# typeGroup: {part.typeGroup}\n# label: {part.label}\n# chunk: {is_chunk}\n# size: {size}\n\n")
                    f.write(str(part.data))

        # Write summary file
        summary_fname = f"{ts}_extract_{fileName}_summary.txt"
        summary_fpath = os.path.join(base_dir, summary_fname)
        with open(summary_fpath, "w", encoding="utf-8") as f:
            f.write("\n".join(summary_lines))
    except Exception as _e:
        logger.debug(f"Debug dump skipped: {_e}")

    return ContentExtracted(id=makeId(), parts=parts)


def poolAndLimit(parts: List[ContentPart], chunkerRegistry: ChunkerRegistry, options: Dict[str, Any]) -> List[ContentPart]:
    maxSize = int(options.get("maxSize", 0) or 0)
    chunkAllowed = bool(options.get("chunkAllowed", False))
    mergeStrategy = options.get("mergeStrategy", {})

    if maxSize <= 0:
        # Still apply merging if strategy provided
        if mergeStrategy:
            return _applyMerging(parts, mergeStrategy)
        return parts

    # First, try to fit within size limit
    current = 0
    kept: List[ContentPart] = []
    remaining: List[ContentPart] = []

    print(f"🔍 DEBUG: Starting poolAndLimit with {len(parts)} parts, maxSize={maxSize}")

    for i, p in enumerate(parts):
        size = int(p.metadata.get("size", 0) or 0)
        # Show first 50 characters of text content for debugging
        content_preview = p.data[:50].replace('\n', '\\n') if p.data else ""
        print(f"🔍 DEBUG: Part {i}: {p.typeGroup} - {size} bytes - '{content_preview}...' (current: {current})")
        if current + size <= maxSize:
            kept.append(p)
            current += size
            print(f"🔍 DEBUG: Part {i} kept (total: {current})")
        else:
            remaining.append(p)
            print(f"🔍 DEBUG: Part {i} moved to remaining")

    print(f"🔍 DEBUG: Kept: {len(kept)}, Remaining: {len(remaining)}")

    # If we have remaining parts and chunking is allowed, try chunking
    if remaining and chunkAllowed:
        logger.debug(f"=== CHUNKING ACTIVATED ===")
        logger.debug(f"Remaining parts to chunk: {len(remaining)}")
        logger.debug(f"Max size limit: {maxSize} bytes")
        logger.debug(f"Current size used: {current} bytes")
        print(f"🔍 DEBUG: Chunking {len(remaining)} remaining parts")

        for p in remaining:
            if p.typeGroup in ("text", "table", "structure", "image", "container", "binary"):
                logger.debug(f"Chunking {p.typeGroup} part: {len(p.data)} chars")
                print(f"🔍 DEBUG: Chunking {p.typeGroup} part with {len(p.data)} chars")
                chunks = chunkerRegistry.resolve(p.typeGroup).chunk(p, options)
                logger.debug(f"Created {len(chunks)} chunks")
                print(f"🔍 DEBUG: Created {len(chunks)} chunks")

                chunks_added = 0
                for ch in chunks:
                    chSize = int(ch.get("size", 0) or 0)
                    # Add all chunks - don't limit by maxSize since they'll be processed separately
                    kept.append(ContentPart(
                        id=makeId(),
                        parentId=p.id,
                        label=f"chunk_{ch.get('order', 0)}",
                        typeGroup=p.typeGroup,
                        mimeType=p.mimeType,
                        data=ch.get("data", ""),
                        metadata={
                            "size": chSize,
                            "chunk": True,
                            **ch.get("metadata", {})
                        }
                    ))
                    chunks_added += 1
                    logger.debug(f"Added chunk {ch.get('order', 0)}: {chSize} bytes")

                logger.debug(f"Added {chunks_added} chunks from {p.typeGroup} part")

    # Apply merging strategy if provided, but preserve chunks
    if mergeStrategy:
        # Don't merge chunks - they should stay separate for processing
        non_chunk_parts = [p for p in kept if not p.metadata.get("chunk", False)]
        chunk_parts = [p for p in kept if p.metadata.get("chunk", False)]

        logger.debug(f"Preserving {len(chunk_parts)} chunks from merging")

    # Apply intelligent merging for small text parts
    if non_chunk_parts:
        # Count text parts
        text_parts = [p for p in non_chunk_parts if p.typeGroup == "text"]
        if len(text_parts) > 5:  # If we have many small text parts, merge them
            logger.info(f"🔧 Merging {len(text_parts)} small text parts for efficiency")
            non_chunk_parts = _applyMerging(non_chunk_parts, mergeStrategy)

        # Combine non-chunk parts with chunk parts (chunks stay separate)
        kept = non_chunk_parts + chunk_parts

        logger.debug(f"Final parts after merging: {len(kept)} (chunks: {len(chunk_parts)})")
        print(f"🔍 DEBUG: Final parts after merging: {len(kept)} (chunks: {len(chunk_parts)})")

        # Re-check size after merging
        totalSize = sum(int(p.metadata.get("size", 0) or 0) for p in kept)
        if totalSize > maxSize and mergeStrategy.get("maxSize"):
            # Apply size limit to merged parts
            kept = _applySizeLimit(kept, maxSize)

    print(f"🔍 DEBUG: poolAndLimit returning {len(kept)} parts")
    return kept


def _applyMerging(parts: List[ContentPart], strategy: Dict[str, Any]) -> List[ContentPart]:
    """Apply merging strategy to parts with intelligent token-aware merging."""
    print(f"🔍 DEBUG: _applyMerging called with {len(parts)} parts")

    # Check if intelligent merging is enabled
    if strategy.get("useIntelligentMerging", False):
        model_capabilities = strategy.get("modelCapabilities", {})
        subMerger = IntelligentTokenAwareMerger(model_capabilities)

        # Use intelligent merging for all parts
        merged = subMerger.merge_chunks_intelligently(parts, strategy.get("prompt", ""))

        # Calculate and log optimization stats
        stats = subMerger.calculate_optimization_stats(parts, merged)
        logger.info(f"🧠 Intelligent merging stats: {stats}")
        print(f"🔍 DEBUG: Intelligent merging: {stats['original_ai_calls']} → {stats['optimized_ai_calls']} calls ({stats['reduction_percent']}% reduction)")

        return merged

    # Fallback to traditional merging
    textMerger = TextMerger()
    tableMerger = TableMerger()
    defaultMerger = DefaultMerger()

    # Group by typeGroup
    textParts = [p for p in parts if p.typeGroup == "text"]
    tableParts = [p for p in parts if p.typeGroup == "table"]
    structureParts = [p for p in parts if p.typeGroup == "structure"]
    otherParts = [p for p in parts if p.typeGroup not in ("text", "table", "structure")]

    print(f"🔍 DEBUG: Grouped - text: {len(textParts)}, table: {len(tableParts)}, structure: {len(structureParts)}, other: {len(otherParts)}")

    merged: List[ContentPart] = []

    if textParts:
        textMerged = textMerger.merge(textParts, strategy)
        print(f"🔍 DEBUG: TextMerger merged {len(textParts)} parts into {len(textMerged)} parts")
        merged.extend(textMerged)
    if tableParts:
        tableMerged = tableMerger.merge(tableParts, strategy)
        print(f"🔍 DEBUG: TableMerger merged {len(tableParts)} parts into {len(tableMerged)} parts")
        merged.extend(tableMerged)
    if structureParts:
        # For now, treat structure like text
        structureMerged = textMerger.merge(structureParts, strategy)
        print(f"🔍 DEBUG: StructureMerger merged {len(structureParts)} parts into {len(structureMerged)} parts")
        merged.extend(structureMerged)
    if otherParts:
        otherMerged = defaultMerger.merge(otherParts, strategy)
        print(f"🔍 DEBUG: DefaultMerger merged {len(otherParts)} parts into {len(otherMerged)} parts")
        merged.extend(otherMerged)

    print(f"🔍 DEBUG: _applyMerging returning {len(merged)} parts")
    return merged


def _applySizeLimit(parts: List[ContentPart], maxSize: int) -> List[ContentPart]:
    """Apply size limit by prioritizing parts and truncating if necessary."""
    # Sort by priority: text first, then others
    priority_order = {"text": 0, "table": 1, "structure": 2, "image": 3, "binary": 4, "metadata": 5, "container": 6}
    sorted_parts = sorted(parts, key=lambda p: priority_order.get(p.typeGroup, 99))

    kept: List[ContentPart] = []
    current_size = 0

    for part in sorted_parts:
        part_size = int(part.metadata.get("size", 0) or 0)
        if current_size + part_size <= maxSize:
            kept.append(part)
            current_size += part_size
        else:
            # Try to truncate text parts
            if part.typeGroup == "text" and part_size > 0:
                remaining_size = maxSize - current_size
                if remaining_size > 1000:  # Only truncate if we have meaningful space
                    truncated_data = part.data[:remaining_size * 4]  # Rough character estimate
                    truncated_part = ContentPart(
                        id=makeId(),
                        parentId=part.parentId,
                        label=f"{part.label}_truncated",
                        typeGroup=part.typeGroup,
                        mimeType=part.mimeType,
                        data=truncated_data,
                        metadata={**part.metadata, "size": len(truncated_data.encode('utf-8')), "truncated": True}
                    )
                    kept.append(truncated_part)
            break

    return kept


def applyAiIfRequested(extracted: ContentExtracted, options: Dict[str, Any]) -> ContentExtracted:
    """
    Apply AI processing if requested in options.
    This is a placeholder for actual AI integration.
    """
    prompt = options.get("prompt")
    operationType = options.get("operationType", "general")

    if not prompt:
        return extracted

    # Placeholder AI processing based on operationType
    if operationType == "analyse_content":
        # Add analysis metadata to parts
        for part in extracted.parts:
            if part.typeGroup in ("text", "table", "structure"):
                part.metadata["ai_processed"] = True
                part.metadata["operation_type"] = operationType
    elif operationType == "generate_plan":
        # Add plan generation metadata
        for part in extracted.parts:
            if part.typeGroup == "text":
                part.metadata["ai_processed"] = True
                part.metadata["operation_type"] = operationType
    elif operationType == "generate_content":
        # Add content generation metadata
        for part in extracted.parts:
            part.metadata["ai_processed"] = True
            part.metadata["operation_type"] = operationType

    return extracted