From 0e79024b07295e728fa9f458b6459126cb8fee8a Mon Sep 17 00:00:00 2001
From: ValueOn AG
Date: Sun, 12 Oct 2025 01:14:07 +0200
Subject: [PATCH] refactored plug-and-play document engine
---
.../{image_chunker.py => chunkerImage.py} | 0
...ructure_chunker.py => chunkerStructure.py} | 0
.../{table_chunker.py => chunkerTable.py} | 0
.../{text_chunker.py => chunkerText.py} | 0
.../{formats => extractors}/__init__.py | 0
.../extractorBinary.py} | 0
.../extractorCsv.py} | 0
.../extractorDocx.py} | 0
.../extractorHtml.py} | 0
.../extractorImage.py} | 0
.../extractorJson.py} | 0
.../extractorPdf.py} | 0
.../extractorPptx.py} | 0
.../extractorText.py} | 0
.../extractorXlsx.py} | 0
.../extractorXml.py} | 0
.../{default_merger.py => mergerDefault.py} | 0
.../{table_merger.py => mergerTable.py} | 0
.../merging/{text_merger.py => mergerText.py} | 0
.../{intelligent_merger.py => subMerger.py} | 0
.../services/serviceExtraction/subPipeline.py | 14 ++++-----
.../services/serviceExtraction/subRegistry.py | 30 +++++++++----------
.../{rendererExcel.py => rendererXlsx.py} | 2 +-
23 files changed, 23 insertions(+), 23 deletions(-)
rename modules/services/serviceExtraction/chunking/{image_chunker.py => chunkerImage.py} (100%)
rename modules/services/serviceExtraction/chunking/{structure_chunker.py => chunkerStructure.py} (100%)
rename modules/services/serviceExtraction/chunking/{table_chunker.py => chunkerTable.py} (100%)
rename modules/services/serviceExtraction/chunking/{text_chunker.py => chunkerText.py} (100%)
rename modules/services/serviceExtraction/{formats => extractors}/__init__.py (100%)
rename modules/services/serviceExtraction/{formats/binary_extractor.py => extractors/extractorBinary.py} (100%)
rename modules/services/serviceExtraction/{formats/csv_extractor.py => extractors/extractorCsv.py} (100%)
rename modules/services/serviceExtraction/{formats/docx_extractor.py => extractors/extractorDocx.py} (100%)
rename modules/services/serviceExtraction/{formats/html_extractor.py => extractors/extractorHtml.py} (100%)
rename modules/services/serviceExtraction/{formats/image_extractor.py => extractors/extractorImage.py} (100%)
rename modules/services/serviceExtraction/{formats/json_extractor.py => extractors/extractorJson.py} (100%)
rename modules/services/serviceExtraction/{formats/pdf_extractor.py => extractors/extractorPdf.py} (100%)
rename modules/services/serviceExtraction/{formats/pptx_extractor.py => extractors/extractorPptx.py} (100%)
rename modules/services/serviceExtraction/{formats/text_extractor.py => extractors/extractorText.py} (100%)
rename modules/services/serviceExtraction/{formats/xlsx_extractor.py => extractors/extractorXlsx.py} (100%)
rename modules/services/serviceExtraction/{formats/xml_extractor.py => extractors/extractorXml.py} (100%)
rename modules/services/serviceExtraction/merging/{default_merger.py => mergerDefault.py} (100%)
rename modules/services/serviceExtraction/merging/{table_merger.py => mergerTable.py} (100%)
rename modules/services/serviceExtraction/merging/{text_merger.py => mergerText.py} (100%)
rename modules/services/serviceExtraction/{intelligent_merger.py => subMerger.py} (100%)
rename modules/services/serviceGeneration/renderers/{rendererExcel.py => rendererXlsx.py} (99%)
diff --git a/modules/services/serviceExtraction/chunking/image_chunker.py b/modules/services/serviceExtraction/chunking/chunkerImage.py
similarity index 100%
rename from modules/services/serviceExtraction/chunking/image_chunker.py
rename to modules/services/serviceExtraction/chunking/chunkerImage.py
diff --git a/modules/services/serviceExtraction/chunking/structure_chunker.py b/modules/services/serviceExtraction/chunking/chunkerStructure.py
similarity index 100%
rename from modules/services/serviceExtraction/chunking/structure_chunker.py
rename to modules/services/serviceExtraction/chunking/chunkerStructure.py
diff --git a/modules/services/serviceExtraction/chunking/table_chunker.py b/modules/services/serviceExtraction/chunking/chunkerTable.py
similarity index 100%
rename from modules/services/serviceExtraction/chunking/table_chunker.py
rename to modules/services/serviceExtraction/chunking/chunkerTable.py
diff --git a/modules/services/serviceExtraction/chunking/text_chunker.py b/modules/services/serviceExtraction/chunking/chunkerText.py
similarity index 100%
rename from modules/services/serviceExtraction/chunking/text_chunker.py
rename to modules/services/serviceExtraction/chunking/chunkerText.py
diff --git a/modules/services/serviceExtraction/formats/__init__.py b/modules/services/serviceExtraction/extractors/__init__.py
similarity index 100%
rename from modules/services/serviceExtraction/formats/__init__.py
rename to modules/services/serviceExtraction/extractors/__init__.py
diff --git a/modules/services/serviceExtraction/formats/binary_extractor.py b/modules/services/serviceExtraction/extractors/extractorBinary.py
similarity index 100%
rename from modules/services/serviceExtraction/formats/binary_extractor.py
rename to modules/services/serviceExtraction/extractors/extractorBinary.py
diff --git a/modules/services/serviceExtraction/formats/csv_extractor.py b/modules/services/serviceExtraction/extractors/extractorCsv.py
similarity index 100%
rename from modules/services/serviceExtraction/formats/csv_extractor.py
rename to modules/services/serviceExtraction/extractors/extractorCsv.py
diff --git a/modules/services/serviceExtraction/formats/docx_extractor.py b/modules/services/serviceExtraction/extractors/extractorDocx.py
similarity index 100%
rename from modules/services/serviceExtraction/formats/docx_extractor.py
rename to modules/services/serviceExtraction/extractors/extractorDocx.py
diff --git a/modules/services/serviceExtraction/formats/html_extractor.py b/modules/services/serviceExtraction/extractors/extractorHtml.py
similarity index 100%
rename from modules/services/serviceExtraction/formats/html_extractor.py
rename to modules/services/serviceExtraction/extractors/extractorHtml.py
diff --git a/modules/services/serviceExtraction/formats/image_extractor.py b/modules/services/serviceExtraction/extractors/extractorImage.py
similarity index 100%
rename from modules/services/serviceExtraction/formats/image_extractor.py
rename to modules/services/serviceExtraction/extractors/extractorImage.py
diff --git a/modules/services/serviceExtraction/formats/json_extractor.py b/modules/services/serviceExtraction/extractors/extractorJson.py
similarity index 100%
rename from modules/services/serviceExtraction/formats/json_extractor.py
rename to modules/services/serviceExtraction/extractors/extractorJson.py
diff --git a/modules/services/serviceExtraction/formats/pdf_extractor.py b/modules/services/serviceExtraction/extractors/extractorPdf.py
similarity index 100%
rename from modules/services/serviceExtraction/formats/pdf_extractor.py
rename to modules/services/serviceExtraction/extractors/extractorPdf.py
diff --git a/modules/services/serviceExtraction/formats/pptx_extractor.py b/modules/services/serviceExtraction/extractors/extractorPptx.py
similarity index 100%
rename from modules/services/serviceExtraction/formats/pptx_extractor.py
rename to modules/services/serviceExtraction/extractors/extractorPptx.py
diff --git a/modules/services/serviceExtraction/formats/text_extractor.py b/modules/services/serviceExtraction/extractors/extractorText.py
similarity index 100%
rename from modules/services/serviceExtraction/formats/text_extractor.py
rename to modules/services/serviceExtraction/extractors/extractorText.py
diff --git a/modules/services/serviceExtraction/formats/xlsx_extractor.py b/modules/services/serviceExtraction/extractors/extractorXlsx.py
similarity index 100%
rename from modules/services/serviceExtraction/formats/xlsx_extractor.py
rename to modules/services/serviceExtraction/extractors/extractorXlsx.py
diff --git a/modules/services/serviceExtraction/formats/xml_extractor.py b/modules/services/serviceExtraction/extractors/extractorXml.py
similarity index 100%
rename from modules/services/serviceExtraction/formats/xml_extractor.py
rename to modules/services/serviceExtraction/extractors/extractorXml.py
diff --git a/modules/services/serviceExtraction/merging/default_merger.py b/modules/services/serviceExtraction/merging/mergerDefault.py
similarity index 100%
rename from modules/services/serviceExtraction/merging/default_merger.py
rename to modules/services/serviceExtraction/merging/mergerDefault.py
diff --git a/modules/services/serviceExtraction/merging/table_merger.py b/modules/services/serviceExtraction/merging/mergerTable.py
similarity index 100%
rename from modules/services/serviceExtraction/merging/table_merger.py
rename to modules/services/serviceExtraction/merging/mergerTable.py
diff --git a/modules/services/serviceExtraction/merging/text_merger.py b/modules/services/serviceExtraction/merging/mergerText.py
similarity index 100%
rename from modules/services/serviceExtraction/merging/text_merger.py
rename to modules/services/serviceExtraction/merging/mergerText.py
diff --git a/modules/services/serviceExtraction/intelligent_merger.py b/modules/services/serviceExtraction/subMerger.py
similarity index 100%
rename from modules/services/serviceExtraction/intelligent_merger.py
rename to modules/services/serviceExtraction/subMerger.py
diff --git a/modules/services/serviceExtraction/subPipeline.py b/modules/services/serviceExtraction/subPipeline.py
index c3833fa7..515fd293 100644
--- a/modules/services/serviceExtraction/subPipeline.py
+++ b/modules/services/serviceExtraction/subPipeline.py
@@ -5,10 +5,10 @@ import os
from modules.datamodels.datamodelExtraction import ContentExtracted, ContentPart
from .subUtils import makeId
from .subRegistry import ExtractorRegistry, ChunkerRegistry
-from .merging.text_merger import TextMerger
-from .merging.table_merger import TableMerger
-from .merging.default_merger import DefaultMerger
-from .intelligent_merger import IntelligentTokenAwareMerger
+from .merging.mergerText import TextMerger
+from .merging.mergerTable import TableMerger
+from .merging.mergerDefault import DefaultMerger
+from .subMerger import IntelligentTokenAwareMerger
logger = logging.getLogger(__name__)
@@ -248,13 +248,13 @@ def _applyMerging(parts: List[ContentPart], strategy: Dict[str, Any]) -> List[Co
# Check if intelligent merging is enabled
if strategy.get("useIntelligentMerging", False):
model_capabilities = strategy.get("modelCapabilities", {})
- intelligent_merger = IntelligentTokenAwareMerger(model_capabilities)
+ subMerger = IntelligentTokenAwareMerger(model_capabilities)
# Use intelligent merging for all parts
- merged = intelligent_merger.merge_chunks_intelligently(parts, strategy.get("prompt", ""))
+ merged = subMerger.merge_chunks_intelligently(parts, strategy.get("prompt", ""))
# Calculate and log optimization stats
- stats = intelligent_merger.calculate_optimization_stats(parts, merged)
+ stats = subMerger.calculate_optimization_stats(parts, merged)
logger.info(f"🧠 Intelligent merging stats: {stats}")
print(f"🔍 DEBUG: Intelligent merging: {stats['original_ai_calls']} → {stats['optimized_ai_calls']} calls ({stats['reduction_percent']}% reduction)")
diff --git a/modules/services/serviceExtraction/subRegistry.py b/modules/services/serviceExtraction/subRegistry.py
index a6bd3445..7f4b9c11 100644
--- a/modules/services/serviceExtraction/subRegistry.py
+++ b/modules/services/serviceExtraction/subRegistry.py
@@ -22,17 +22,17 @@ class ExtractorRegistry:
self._fallback: Optional[Extractor] = None
# Register built-ins
try:
- from .formats.text_extractor import TextExtractor
- from .formats.csv_extractor import CsvExtractor
- from .formats.json_extractor import JsonExtractor
- from .formats.xml_extractor import XmlExtractor
- from .formats.html_extractor import HtmlExtractor
- from .formats.pdf_extractor import PdfExtractor
- from .formats.docx_extractor import DocxExtractor
- from .formats.xlsx_extractor import XlsxExtractor
- from .formats.pptx_extractor import PptxExtractor
- from .formats.image_extractor import ImageExtractor
- from .formats.binary_extractor import BinaryExtractor
+ from .extractors.extractorText import TextExtractor
+ from .extractors.extractorCsv import CsvExtractor
+ from .extractors.extractorJson import JsonExtractor
+ from .extractors.extractorXml import XmlExtractor
+ from .extractors.extractorHtml import HtmlExtractor
+ from .extractors.extractorPdf import PdfExtractor
+ from .extractors.extractorDocx import DocxExtractor
+ from .extractors.extractorXlsx import XlsxExtractor
+ from .extractors.extractorPptx import PptxExtractor
+ from .extractors.extractorImage import ImageExtractor
+ from .extractors.extractorBinary import BinaryExtractor
self.register("text/plain", TextExtractor())
self.register("text/markdown", TextExtractor())
self.register("text/csv", CsvExtractor())
@@ -93,10 +93,10 @@ class ChunkerRegistry:
self._noop = Chunker()
# Register default chunkers
try:
- from .chunking.text_chunker import TextChunker
- from .chunking.table_chunker import TableChunker
- from .chunking.structure_chunker import StructureChunker
- from .chunking.image_chunker import ImageChunker
+ from .chunking.chunkerText import TextChunker
+ from .chunking.chunkerTable import TableChunker
+ from .chunking.chunkerStructure import StructureChunker
+ from .chunking.chunkerImage import ImageChunker
self.register("text", TextChunker())
self.register("table", TableChunker())
self.register("structure", StructureChunker())
diff --git a/modules/services/serviceGeneration/renderers/rendererExcel.py b/modules/services/serviceGeneration/renderers/rendererXlsx.py
similarity index 99%
rename from modules/services/serviceGeneration/renderers/rendererExcel.py
rename to modules/services/serviceGeneration/renderers/rendererXlsx.py
index 6ea4ff32..9885988d 100644
--- a/modules/services/serviceGeneration/renderers/rendererExcel.py
+++ b/modules/services/serviceGeneration/renderers/rendererXlsx.py
@@ -17,7 +17,7 @@ try:
except ImportError:
OPENPYXL_AVAILABLE = False
-class RendererExcel(BaseRenderer):
+class RendererXlsx(BaseRenderer):
"""Renders content to Excel format using openpyxl."""
@classmethod