Refactored plug-and-play document engine

This commit is contained in:
ValueOn AG 2025-10-12 01:14:07 +02:00
parent 5537d3e704
commit 0e79024b07
23 changed files with 23 additions and 23 deletions

View file

@ -5,10 +5,10 @@ import os
from modules.datamodels.datamodelExtraction import ContentExtracted, ContentPart from modules.datamodels.datamodelExtraction import ContentExtracted, ContentPart
from .subUtils import makeId from .subUtils import makeId
from .subRegistry import ExtractorRegistry, ChunkerRegistry from .subRegistry import ExtractorRegistry, ChunkerRegistry
from .merging.text_merger import TextMerger from .merging.mergerText import TextMerger
from .merging.table_merger import TableMerger from .merging.mergerTable import TableMerger
from .merging.default_merger import DefaultMerger from .merging.mergerDefault import DefaultMerger
from .intelligent_merger import IntelligentTokenAwareMerger from .subMerger import IntelligentTokenAwareMerger
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
@ -248,13 +248,13 @@ def _applyMerging(parts: List[ContentPart], strategy: Dict[str, Any]) -> List[Co
# Check if intelligent merging is enabled # Check if intelligent merging is enabled
if strategy.get("useIntelligentMerging", False): if strategy.get("useIntelligentMerging", False):
model_capabilities = strategy.get("modelCapabilities", {}) model_capabilities = strategy.get("modelCapabilities", {})
intelligent_merger = IntelligentTokenAwareMerger(model_capabilities) subMerger = IntelligentTokenAwareMerger(model_capabilities)
# Use intelligent merging for all parts # Use intelligent merging for all parts
merged = intelligent_merger.merge_chunks_intelligently(parts, strategy.get("prompt", "")) merged = subMerger.merge_chunks_intelligently(parts, strategy.get("prompt", ""))
# Calculate and log optimization stats # Calculate and log optimization stats
stats = intelligent_merger.calculate_optimization_stats(parts, merged) stats = subMerger.calculate_optimization_stats(parts, merged)
logger.info(f"🧠 Intelligent merging stats: {stats}") logger.info(f"🧠 Intelligent merging stats: {stats}")
print(f"🔍 DEBUG: Intelligent merging: {stats['original_ai_calls']}{stats['optimized_ai_calls']} calls ({stats['reduction_percent']}% reduction)") print(f"🔍 DEBUG: Intelligent merging: {stats['original_ai_calls']}{stats['optimized_ai_calls']} calls ({stats['reduction_percent']}% reduction)")

View file

@ -22,17 +22,17 @@ class ExtractorRegistry:
self._fallback: Optional[Extractor] = None self._fallback: Optional[Extractor] = None
# Register built-ins # Register built-ins
try: try:
from .formats.text_extractor import TextExtractor from .extractors.extractorText import TextExtractor
from .formats.csv_extractor import CsvExtractor from .extractors.extractorCsv import CsvExtractor
from .formats.json_extractor import JsonExtractor from .extractors.extractorJson import JsonExtractor
from .formats.xml_extractor import XmlExtractor from .extractors.extractorXml import XmlExtractor
from .formats.html_extractor import HtmlExtractor from .extractors.extractorHtml import HtmlExtractor
from .formats.pdf_extractor import PdfExtractor from .extractors.extractorPdf import PdfExtractor
from .formats.docx_extractor import DocxExtractor from .extractors.extractorDocx import DocxExtractor
from .formats.xlsx_extractor import XlsxExtractor from .extractors.extractorXlsx import XlsxExtractor
from .formats.pptx_extractor import PptxExtractor from .extractors.extractorPptx import PptxExtractor
from .formats.image_extractor import ImageExtractor from .extractors.extractorImage import ImageExtractor
from .formats.binary_extractor import BinaryExtractor from .extractors.extractorBinary import BinaryExtractor
self.register("text/plain", TextExtractor()) self.register("text/plain", TextExtractor())
self.register("text/markdown", TextExtractor()) self.register("text/markdown", TextExtractor())
self.register("text/csv", CsvExtractor()) self.register("text/csv", CsvExtractor())
@ -93,10 +93,10 @@ class ChunkerRegistry:
self._noop = Chunker() self._noop = Chunker()
# Register default chunkers # Register default chunkers
try: try:
from .chunking.text_chunker import TextChunker from .chunking.chunkerText import TextChunker
from .chunking.table_chunker import TableChunker from .chunking.chunkerTable import TableChunker
from .chunking.structure_chunker import StructureChunker from .chunking.chunkerStructure import StructureChunker
from .chunking.image_chunker import ImageChunker from .chunking.chunkerImage import ImageChunker
self.register("text", TextChunker()) self.register("text", TextChunker())
self.register("table", TableChunker()) self.register("table", TableChunker())
self.register("structure", StructureChunker()) self.register("structure", StructureChunker())

View file

@ -17,7 +17,7 @@ try:
except ImportError: except ImportError:
OPENPYXL_AVAILABLE = False OPENPYXL_AVAILABLE = False
class RendererExcel(BaseRenderer): class RendererXlsx(BaseRenderer):
"""Renders content to Excel format using openpyxl.""" """Renders content to Excel format using openpyxl."""
@classmethod @classmethod