gateway/modules/services/serviceExtraction/subPipeline.py

147 lines
5.7 KiB
Python

from typing import Any, Dict, List
import logging
import os
from modules.datamodels.datamodelExtraction import ContentExtracted, ContentPart
from .subUtils import makeId
from .subRegistry import ExtractorRegistry, ChunkerRegistry
from .merging.mergerText import TextMerger
from .merging.mergerTable import TableMerger
from .merging.mergerDefault import DefaultMerger
from .subMerger import IntelligentTokenAwareMerger
logger = logging.getLogger(__name__)
def _mergeParts(parts: List[ContentPart], mergeStrategy: Dict[str, Any]) -> List[ContentPart]:
"""Merge parts based on the provided strategy."""
if not parts or not mergeStrategy:
return parts
groupBy = mergeStrategy.get("groupBy", "typeGroup")
orderBy = mergeStrategy.get("orderBy", "id")
# Group parts by the specified field
groups = {}
for part in parts:
key = getattr(part, groupBy, "unknown")
if key not in groups:
groups[key] = []
groups[key].append(part)
# Merge each group
merged_parts = []
for group_key, group_parts in groups.items():
if len(group_parts) == 1:
merged_parts.extend(group_parts)
else:
# Sort by orderBy field if specified
if orderBy:
group_parts.sort(key=lambda p: getattr(p, orderBy, ""))
# Use appropriate merger based on type
type_group = group_parts[0].typeGroup if group_parts else "unknown"
if type_group == "text":
merger = TextMerger()
elif type_group == "table":
merger = TableMerger()
else:
merger = DefaultMerger()
# Merge the group
merged = merger.merge(group_parts, mergeStrategy)
merged_parts.extend(merged)
return merged_parts
def runExtraction(extractorRegistry: ExtractorRegistry, chunkerRegistry: ChunkerRegistry, documentBytes: bytes, fileName: str, mimeType: str, options: Dict[str, Any]) -> ContentExtracted:
    """Extract structured content parts from a raw document.

    Resolves an extractor for the document's type, runs it, and optionally
    applies the merge strategy from ``options["mergeStrategy"]``. Chunking is
    deliberately absent here — it is handled later, in the AI call phase.

    Args:
        extractorRegistry: Registry used to resolve an extractor by MIME/name.
        chunkerRegistry: Unused here (kept for interface compatibility since
            chunking moved to the AI call phase).
        documentBytes: Raw document contents.
        fileName: Original file name (aids extractor resolution).
        mimeType: Declared MIME type; may be empty.
        options: Per-request options; only ``mergeStrategy`` is read here.

    Returns:
        A ContentExtracted holding the (possibly merged) parts.
    """
    extractor = extractorRegistry.resolve(mimeType, fileName)
    if extractor is None:
        # No extractor for this type: wrap the input as one opaque binary part.
        fallback = ContentPart(
            id=makeId(),
            parentId=None,
            label="file",
            typeGroup="binary",
            mimeType=mimeType or "application/octet-stream",
            data="",
            metadata={"warning": "No extractor registered"}
        )
        return ContentExtracted(id=makeId(), parts=[fallback])

    context = {"fileName": fileName, "mimeType": mimeType, "options": options}
    parts = extractor.extract(documentBytes, context)

    strategy = options.get("mergeStrategy", {})
    if strategy:
        parts = _applyMerging(parts, strategy)
    return ContentExtracted(id=makeId(), parts=parts)
def _applyMerging(parts: List[ContentPart], strategy: Dict[str, Any]) -> List[ContentPart]:
    """Apply a merging strategy to parts, preferring token-aware merging.

    When ``strategy["useIntelligentMerging"]`` is truthy, all parts go through
    IntelligentTokenAwareMerger (built from ``strategy["capabilities"]``) with
    ``strategy["prompt"]`` as context. Otherwise parts are split by typeGroup
    and merged per-type (structure parts are treated like text for now).

    Args:
        parts: Extracted content parts to merge.
        strategy: Merge configuration dict; keys read here are
            ``useIntelligentMerging``, ``capabilities`` and ``prompt`` — the
            individual mergers may read further keys.

    Returns:
        The merged list of parts.
    """
    logger.debug(f"_applyMerging called with {len(parts)} parts")
    if strategy.get("useIntelligentMerging", False):
        model_capabilities = strategy.get("capabilities", {})
        subMerger = IntelligentTokenAwareMerger(model_capabilities)
        # Token-aware packing of all parts to minimize downstream AI calls.
        merged = subMerger.merge_chunks_intelligently(parts, strategy.get("prompt", ""))
        stats = subMerger.calculate_optimization_stats(parts, merged)
        logger.info(f"🧠 Intelligent merging stats: {stats}")
        # FIX: the original message ran the two counts together
        # ("{original}{optimized} calls"); restore the "X -> Y" separator.
        logger.debug(
            f"Intelligent merging: {stats['original_ai_calls']} -> "
            f"{stats['optimized_ai_calls']} calls "
            f"({stats['reduction_percent']}% reduction)"
        )
        return merged
    # Fallback: traditional per-typeGroup merging.
    textMerger = TextMerger()
    tableMerger = TableMerger()
    defaultMerger = DefaultMerger()
    # Partition by typeGroup; each partition gets its own merger.
    textParts = [p for p in parts if p.typeGroup == "text"]
    tableParts = [p for p in parts if p.typeGroup == "table"]
    structureParts = [p for p in parts if p.typeGroup == "structure"]
    otherParts = [p for p in parts if p.typeGroup not in ("text", "table", "structure")]
    logger.debug(f"Grouped - text: {len(textParts)}, table: {len(tableParts)}, structure: {len(structureParts)}, other: {len(otherParts)}")
    merged: List[ContentPart] = []
    if textParts:
        textMerged = textMerger.merge(textParts, strategy)
        logger.debug(f"TextMerger merged {len(textParts)} parts into {len(textMerged)} parts")
        merged.extend(textMerged)
    if tableParts:
        tableMerged = tableMerger.merge(tableParts, strategy)
        logger.debug(f"TableMerger merged {len(tableParts)} parts into {len(tableMerged)} parts")
        merged.extend(tableMerged)
    if structureParts:
        # Structure parts have no dedicated merger yet; reuse the text merger.
        structureMerged = textMerger.merge(structureParts, strategy)
        logger.debug(f"StructureMerger merged {len(structureParts)} parts into {len(structureMerged)} parts")
        merged.extend(structureMerged)
    if otherParts:
        otherMerged = defaultMerger.merge(otherParts, strategy)
        logger.debug(f"DefaultMerger merged {len(otherParts)} parts into {len(otherMerged)} parts")
        merged.extend(otherMerged)
    logger.debug(f"_applyMerging returning {len(merged)} parts")
    return merged