Refactored plug-and-play document engine

This commit is contained in:
ValueOn AG 2025-10-12 01:14:07 +02:00
parent 5537d3e704
commit 0e79024b07
23 changed files with 23 additions and 23 deletions

View file

@@ -5,10 +5,10 @@ import os
from modules.datamodels.datamodelExtraction import ContentExtracted, ContentPart
from .subUtils import makeId
from .subRegistry import ExtractorRegistry, ChunkerRegistry
from .merging.text_merger import TextMerger
from .merging.table_merger import TableMerger
from .merging.default_merger import DefaultMerger
from .intelligent_merger import IntelligentTokenAwareMerger
from .merging.mergerText import TextMerger
from .merging.mergerTable import TableMerger
from .merging.mergerDefault import DefaultMerger
from .subMerger import IntelligentTokenAwareMerger
logger = logging.getLogger(__name__)
@@ -248,13 +248,13 @@ def _applyMerging(parts: List[ContentPart], strategy: Dict[str, Any]) -> List[Co
# Check if intelligent merging is enabled
if strategy.get("useIntelligentMerging", False):
model_capabilities = strategy.get("modelCapabilities", {})
intelligent_merger = IntelligentTokenAwareMerger(model_capabilities)
subMerger = IntelligentTokenAwareMerger(model_capabilities)
# Use intelligent merging for all parts
merged = intelligent_merger.merge_chunks_intelligently(parts, strategy.get("prompt", ""))
merged = subMerger.merge_chunks_intelligently(parts, strategy.get("prompt", ""))
# Calculate and log optimization stats
stats = intelligent_merger.calculate_optimization_stats(parts, merged)
stats = subMerger.calculate_optimization_stats(parts, merged)
logger.info(f"🧠 Intelligent merging stats: {stats}")
print(f"🔍 DEBUG: Intelligent merging: {stats['original_ai_calls']}{stats['optimized_ai_calls']} calls ({stats['reduction_percent']}% reduction)")

View file

@@ -22,17 +22,17 @@ class ExtractorRegistry:
self._fallback: Optional[Extractor] = None
# Register built-ins
try:
from .formats.text_extractor import TextExtractor
from .formats.csv_extractor import CsvExtractor
from .formats.json_extractor import JsonExtractor
from .formats.xml_extractor import XmlExtractor
from .formats.html_extractor import HtmlExtractor
from .formats.pdf_extractor import PdfExtractor
from .formats.docx_extractor import DocxExtractor
from .formats.xlsx_extractor import XlsxExtractor
from .formats.pptx_extractor import PptxExtractor
from .formats.image_extractor import ImageExtractor
from .formats.binary_extractor import BinaryExtractor
from .extractors.extractorText import TextExtractor
from .extractors.extractorCsv import CsvExtractor
from .extractors.extractorJson import JsonExtractor
from .extractors.extractorXml import XmlExtractor
from .extractors.extractorHtml import HtmlExtractor
from .extractors.extractorPdf import PdfExtractor
from .extractors.extractorDocx import DocxExtractor
from .extractors.extractorXlsx import XlsxExtractor
from .extractors.extractorPptx import PptxExtractor
from .extractors.extractorImage import ImageExtractor
from .extractors.extractorBinary import BinaryExtractor
self.register("text/plain", TextExtractor())
self.register("text/markdown", TextExtractor())
self.register("text/csv", CsvExtractor())
@@ -93,10 +93,10 @@ class ChunkerRegistry:
self._noop = Chunker()
# Register default chunkers
try:
from .chunking.text_chunker import TextChunker
from .chunking.table_chunker import TableChunker
from .chunking.structure_chunker import StructureChunker
from .chunking.image_chunker import ImageChunker
from .chunking.chunkerText import TextChunker
from .chunking.chunkerTable import TableChunker
from .chunking.chunkerStructure import StructureChunker
from .chunking.chunkerImage import ImageChunker
self.register("text", TextChunker())
self.register("table", TableChunker())
self.register("structure", StructureChunker())

View file

@@ -17,7 +17,7 @@ try:
except ImportError:
OPENPYXL_AVAILABLE = False
class RendererExcel(BaseRenderer):
class RendererXlsx(BaseRenderer):
"""Renders content to Excel format using openpyxl."""
@classmethod