"""Extraction pipeline: resolve an extractor, extract parts, optionally merge.

Chunking was removed from this phase and is now handled in the AI call phase.
"""

import logging
from collections import defaultdict
from typing import Dict, List

from modules.datamodels.datamodelExtraction import (
    ContentExtracted,
    ContentPart,
    ExtractionOptions,
    MergeStrategy,
)

from .merging.mergerDefault import DefaultMerger
from .merging.mergerTable import TableMerger
from .merging.mergerText import TextMerger
from .subMerger import IntelligentTokenAwareMerger
from .subRegistry import ChunkerRegistry, ExtractorRegistry
from .subUtils import makeId

logger = logging.getLogger(__name__)


def _mergeParts(parts: List[ContentPart], mergeStrategy: MergeStrategy) -> List[ContentPart]:
    """Merge parts based on the provided strategy.

    Parts are grouped by the attribute named in ``mergeStrategy.groupBy``
    (parts lacking that attribute fall into an ``"unknown"`` group).
    Singleton groups pass through untouched; larger groups are optionally
    sorted by ``mergeStrategy.orderBy`` and merged with a merger chosen by
    the group's ``typeGroup`` ("text" -> TextMerger, "table" -> TableMerger,
    anything else -> DefaultMerger).

    Returns the (possibly shorter) list of merged parts; returns ``parts``
    unchanged when it is empty or no strategy was supplied.
    """
    if not parts or not mergeStrategy:
        return parts

    groupBy = mergeStrategy.groupBy
    orderBy = mergeStrategy.orderBy

    # Group parts by the configured attribute. dict preserves insertion
    # order (py3.7+), so group order matches first appearance, as before.
    groups: Dict[str, List[ContentPart]] = defaultdict(list)
    for part in parts:
        groups[getattr(part, groupBy, "unknown")].append(part)

    merged_parts: List[ContentPart] = []
    for group_parts in groups.values():
        if len(group_parts) == 1:
            # Nothing to merge — keep the part as-is.
            merged_parts.extend(group_parts)
            continue

        # Sort within the group when an ordering field is configured;
        # missing attributes sort as "" (same fallback as before).
        if orderBy:
            group_parts.sort(key=lambda p: getattr(p, orderBy, ""))

        # len(group_parts) > 1 here, so group_parts[0] always exists
        # (the original's `if group_parts else "unknown"` was dead code).
        type_group = group_parts[0].typeGroup
        if type_group == "text":
            merger = TextMerger()
        elif type_group == "table":
            merger = TableMerger()
        else:
            merger = DefaultMerger()

        merged_parts.extend(merger.merge(group_parts, mergeStrategy))

    return merged_parts


def runExtraction(
    extractorRegistry: ExtractorRegistry,
    chunkerRegistry: ChunkerRegistry,
    documentBytes: bytes,
    fileName: str,
    mimeType: str,
    options: ExtractionOptions,
) -> ContentExtracted:
    """Extract content parts from a document and apply any merge strategy.

    Resolves an extractor from ``extractorRegistry`` by MIME type / file
    name. If none is registered, returns a single placeholder binary part
    carrying a warning in its metadata. Otherwise runs the extractor and,
    when ``options.mergeStrategy`` is set, merges the parts.

    NOTE: ``chunkerRegistry`` is accepted for interface compatibility but
    is no longer used here — chunking moved to the AI call phase.
    """
    extractor = extractorRegistry.resolve(mimeType, fileName)
    if extractor is None:
        # Fallback: wrap the document in a single binary part so callers
        # always receive a well-formed ContentExtracted.
        part = ContentPart(
            id=makeId(),
            parentId=None,
            label="file",
            typeGroup="binary",
            mimeType=mimeType or "application/octet-stream",
            data="",
            metadata={"warning": "No extractor registered"},
        )
        return ContentExtracted(id=makeId(), parts=[part])

    parts = extractor.extract(documentBytes, {"fileName": fileName, "mimeType": mimeType})

    # Apply merging strategy if provided (chunking/size-limiting removed).
    if options.mergeStrategy:
        parts = _applyMerging(parts, options.mergeStrategy)

    return ContentExtracted(id=makeId(), parts=parts)


def _applyMerging(parts: List[ContentPart], strategy: MergeStrategy) -> List[ContentPart]:
    """Apply merging strategy to parts with intelligent token-aware merging.

    When ``strategy.useIntelligentMerging`` is set, delegates everything to
    ``IntelligentTokenAwareMerger`` and logs its optimization stats.
    Otherwise falls back to per-typeGroup merging: "text" and "structure"
    parts go through ``TextMerger``, "table" through ``TableMerger``, and
    all remaining parts through ``DefaultMerger``.
    """
    # Lazy %-style args: interpolation only happens if the level is enabled.
    logger.debug("_applyMerging called with %s parts", len(parts))

    if strategy.useIntelligentMerging:
        model_capabilities = strategy.capabilities or {}
        subMerger = IntelligentTokenAwareMerger(model_capabilities)

        # Intelligent merging handles all parts in one pass.
        merged = subMerger.merge_chunks_intelligently(parts, strategy.prompt or "")

        stats = subMerger.calculate_optimization_stats(parts, merged)
        logger.info("🧠 Intelligent merging stats: %s", stats)
        logger.debug(
            "Intelligent merging: %s → %s calls (%s%% reduction)",
            stats["original_ai_calls"],
            stats["optimized_ai_calls"],
            stats["reduction_percent"],
        )
        return merged

    # Fallback to traditional per-type merging.
    textMerger = TextMerger()
    tableMerger = TableMerger()
    defaultMerger = DefaultMerger()

    # Partition by typeGroup (one bucket per merger).
    textParts = [p for p in parts if p.typeGroup == "text"]
    tableParts = [p for p in parts if p.typeGroup == "table"]
    structureParts = [p for p in parts if p.typeGroup == "structure"]
    otherParts = [p for p in parts if p.typeGroup not in ("text", "table", "structure")]

    logger.debug(
        "Grouped - text: %s, table: %s, structure: %s, other: %s",
        len(textParts),
        len(tableParts),
        len(structureParts),
        len(otherParts),
    )

    merged: List[ContentPart] = []
    if textParts:
        textMerged = textMerger.merge(textParts, strategy)
        logger.debug("TextMerger merged %s parts into %s parts", len(textParts), len(textMerged))
        merged.extend(textMerged)
    if tableParts:
        tableMerged = tableMerger.merge(tableParts, strategy)
        logger.debug("TableMerger merged %s parts into %s parts", len(tableParts), len(tableMerged))
        merged.extend(tableMerged)
    if structureParts:
        # For now, treat structure like text.
        structureMerged = textMerger.merge(structureParts, strategy)
        logger.debug(
            "StructureMerger merged %s parts into %s parts",
            len(structureParts),
            len(structureMerged),
        )
        merged.extend(structureMerged)
    if otherParts:
        otherMerged = defaultMerger.merge(otherParts, strategy)
        logger.debug(
            "DefaultMerger merged %s parts into %s parts", len(otherParts), len(otherMerged)
        )
        merged.extend(otherMerged)

    logger.debug("_applyMerging returning %s parts", len(merged))
    return merged