"""Content extraction pipeline: resolve an extractor, extract content parts, and merge them."""
# Standard library
import logging
import os
from typing import Any, Dict, List

# Project data models
from modules.datamodels.datamodelExtraction import ContentExtracted, ContentPart

# Local package helpers
from .merging.mergerDefault import DefaultMerger
from .merging.mergerTable import TableMerger
from .merging.mergerText import TextMerger
from .subMerger import IntelligentTokenAwareMerger
from .subRegistry import ChunkerRegistry, ExtractorRegistry
from .subUtils import makeId

logger = logging.getLogger(__name__)
def _mergeParts(parts: List[ContentPart], mergeStrategy: Dict[str, Any]) -> List[ContentPart]:
|
|
"""Merge parts based on the provided strategy."""
|
|
if not parts or not mergeStrategy:
|
|
return parts
|
|
|
|
groupBy = mergeStrategy.get("groupBy", "typeGroup")
|
|
orderBy = mergeStrategy.get("orderBy", "id")
|
|
|
|
# Group parts by the specified field
|
|
groups = {}
|
|
for part in parts:
|
|
key = getattr(part, groupBy, "unknown")
|
|
if key not in groups:
|
|
groups[key] = []
|
|
groups[key].append(part)
|
|
|
|
# Merge each group
|
|
merged_parts = []
|
|
for group_key, group_parts in groups.items():
|
|
if len(group_parts) == 1:
|
|
merged_parts.extend(group_parts)
|
|
else:
|
|
# Sort by orderBy field if specified
|
|
if orderBy:
|
|
group_parts.sort(key=lambda p: getattr(p, orderBy, ""))
|
|
|
|
# Use appropriate merger based on type
|
|
type_group = group_parts[0].typeGroup if group_parts else "unknown"
|
|
|
|
if type_group == "text":
|
|
merger = TextMerger()
|
|
elif type_group == "table":
|
|
merger = TableMerger()
|
|
else:
|
|
merger = DefaultMerger()
|
|
|
|
# Merge the group
|
|
merged = merger.merge(group_parts, mergeStrategy)
|
|
merged_parts.extend(merged)
|
|
|
|
return merged_parts
|
|
|
|
|
|
def runExtraction(extractorRegistry: ExtractorRegistry, chunkerRegistry: ChunkerRegistry, documentBytes: bytes, fileName: str, mimeType: str, options: Dict[str, Any]) -> ContentExtracted:
    """Extract content parts from a document and optionally merge them.

    Resolves an extractor from the registry by mime type and file name. When
    none is registered, a single placeholder binary part with a warning is
    returned instead of raising, so downstream consumers can still proceed.
    Chunking was deliberately removed from this phase (poolAndLimit) and is
    now handled in the AI-call phase; ``chunkerRegistry`` is kept for
    interface compatibility.
    """
    extractor = extractorRegistry.resolve(mimeType, fileName)

    if extractor is None:
        # Fallback: wrap the document in one empty binary part.
        fallback = ContentPart(
            id=makeId(),
            parentId=None,
            label="file",
            typeGroup="binary",
            mimeType=mimeType or "application/octet-stream",
            data="",
            metadata={"warning": "No extractor registered"}
        )
        return ContentExtracted(id=makeId(), parts=[fallback])

    context = {"fileName": fileName, "mimeType": mimeType, "options": options}
    parts = extractor.extract(documentBytes, context)

    # Apply merging strategy if provided (preserve existing logic).
    strategy = options.get("mergeStrategy", {})
    if strategy:
        parts = _applyMerging(parts, strategy)

    return ContentExtracted(id=makeId(), parts=parts)
|
# REMOVED: poolAndLimit function - chunking now handled in AI call phase
|
|
|
|
|
|
def _applyMerging(parts: List[ContentPart], strategy: Dict[str, Any]) -> List[ContentPart]:
    """Apply merging strategy to parts with intelligent token-aware merging."""
    logger.debug(f"_applyMerging called with {len(parts)} parts")

    # Intelligent path: a single token-aware merger handles every part at once.
    if strategy.get("useIntelligentMerging", False):
        capabilities = strategy.get("capabilities", {})
        token_merger = IntelligentTokenAwareMerger(capabilities)

        merged = token_merger.merge_chunks_intelligently(parts, strategy.get("prompt", ""))

        # Report how many AI calls the merge is expected to save.
        stats = token_merger.calculate_optimization_stats(parts, merged)
        logger.info(f"🧠 Intelligent merging stats: {stats}")
        logger.debug(f"Intelligent merging: {stats['original_ai_calls']} → {stats['optimized_ai_calls']} calls ({stats['reduction_percent']}% reduction)")

        return merged

    # Traditional path: split parts by typeGroup and merge each bucket separately.
    textMerger = TextMerger()
    tableMerger = TableMerger()
    defaultMerger = DefaultMerger()

    known = ("text", "table", "structure")
    buckets: Dict[str, List[ContentPart]] = {"text": [], "table": [], "structure": [], "other": []}
    for part in parts:
        key = part.typeGroup if part.typeGroup in known else "other"
        buckets[key].append(part)
    textParts = buckets["text"]
    tableParts = buckets["table"]
    structureParts = buckets["structure"]
    otherParts = buckets["other"]

    logger.debug(f"Grouped - text: {len(textParts)}, table: {len(tableParts)}, structure: {len(structureParts)}, other: {len(otherParts)}")

    merged: List[ContentPart] = []

    # Merge passes in fixed order; structure is treated like text for now.
    passes = (
        ("TextMerger", textMerger, textParts),
        ("TableMerger", tableMerger, tableParts),
        ("StructureMerger", textMerger, structureParts),
        ("DefaultMerger", defaultMerger, otherParts),
    )
    for mergerName, merger, groupParts in passes:
        if not groupParts:
            continue
        groupMerged = merger.merge(groupParts, strategy)
        logger.debug(f"{mergerName} merged {len(groupParts)} parts into {len(groupMerged)} parts")
        merged.extend(groupMerged)

    logger.debug(f"_applyMerging returning {len(merged)} parts")
    return merged
|
|
|
|
# REMOVED: _applySizeLimit function - no longer needed after removing poolAndLimit
|
|
|