refactored plug and play document engine
This commit is contained in:
parent
5537d3e704
commit
0e79024b07
23 changed files with 23 additions and 23 deletions
|
|
@ -5,10 +5,10 @@ import os
|
||||||
from modules.datamodels.datamodelExtraction import ContentExtracted, ContentPart
|
from modules.datamodels.datamodelExtraction import ContentExtracted, ContentPart
|
||||||
from .subUtils import makeId
|
from .subUtils import makeId
|
||||||
from .subRegistry import ExtractorRegistry, ChunkerRegistry
|
from .subRegistry import ExtractorRegistry, ChunkerRegistry
|
||||||
from .merging.text_merger import TextMerger
|
from .merging.mergerText import TextMerger
|
||||||
from .merging.table_merger import TableMerger
|
from .merging.mergerTable import TableMerger
|
||||||
from .merging.default_merger import DefaultMerger
|
from .merging.mergerDefault import DefaultMerger
|
||||||
from .intelligent_merger import IntelligentTokenAwareMerger
|
from .subMerger import IntelligentTokenAwareMerger
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
@ -248,13 +248,13 @@ def _applyMerging(parts: List[ContentPart], strategy: Dict[str, Any]) -> List[Co
|
||||||
# Check if intelligent merging is enabled
|
# Check if intelligent merging is enabled
|
||||||
if strategy.get("useIntelligentMerging", False):
|
if strategy.get("useIntelligentMerging", False):
|
||||||
model_capabilities = strategy.get("modelCapabilities", {})
|
model_capabilities = strategy.get("modelCapabilities", {})
|
||||||
intelligent_merger = IntelligentTokenAwareMerger(model_capabilities)
|
subMerger = IntelligentTokenAwareMerger(model_capabilities)
|
||||||
|
|
||||||
# Use intelligent merging for all parts
|
# Use intelligent merging for all parts
|
||||||
merged = intelligent_merger.merge_chunks_intelligently(parts, strategy.get("prompt", ""))
|
merged = subMerger.merge_chunks_intelligently(parts, strategy.get("prompt", ""))
|
||||||
|
|
||||||
# Calculate and log optimization stats
|
# Calculate and log optimization stats
|
||||||
stats = intelligent_merger.calculate_optimization_stats(parts, merged)
|
stats = subMerger.calculate_optimization_stats(parts, merged)
|
||||||
logger.info(f"🧠 Intelligent merging stats: {stats}")
|
logger.info(f"🧠 Intelligent merging stats: {stats}")
|
||||||
print(f"🔍 DEBUG: Intelligent merging: {stats['original_ai_calls']} → {stats['optimized_ai_calls']} calls ({stats['reduction_percent']}% reduction)")
|
print(f"🔍 DEBUG: Intelligent merging: {stats['original_ai_calls']} → {stats['optimized_ai_calls']} calls ({stats['reduction_percent']}% reduction)")
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -22,17 +22,17 @@ class ExtractorRegistry:
|
||||||
self._fallback: Optional[Extractor] = None
|
self._fallback: Optional[Extractor] = None
|
||||||
# Register built-ins
|
# Register built-ins
|
||||||
try:
|
try:
|
||||||
from .formats.text_extractor import TextExtractor
|
from .extractors.extractorText import TextExtractor
|
||||||
from .formats.csv_extractor import CsvExtractor
|
from .extractors.extractorCsv import CsvExtractor
|
||||||
from .formats.json_extractor import JsonExtractor
|
from .extractors.extractorJson import JsonExtractor
|
||||||
from .formats.xml_extractor import XmlExtractor
|
from .extractors.extractorXml import XmlExtractor
|
||||||
from .formats.html_extractor import HtmlExtractor
|
from .extractors.extractorHtml import HtmlExtractor
|
||||||
from .formats.pdf_extractor import PdfExtractor
|
from .extractors.extractorPdf import PdfExtractor
|
||||||
from .formats.docx_extractor import DocxExtractor
|
from .extractors.extractorDocx import DocxExtractor
|
||||||
from .formats.xlsx_extractor import XlsxExtractor
|
from .extractors.extractorXlsx import XlsxExtractor
|
||||||
from .formats.pptx_extractor import PptxExtractor
|
from .extractors.extractorPptx import PptxExtractor
|
||||||
from .formats.image_extractor import ImageExtractor
|
from .extractors.extractorImage import ImageExtractor
|
||||||
from .formats.binary_extractor import BinaryExtractor
|
from .extractors.extractorBinary import BinaryExtractor
|
||||||
self.register("text/plain", TextExtractor())
|
self.register("text/plain", TextExtractor())
|
||||||
self.register("text/markdown", TextExtractor())
|
self.register("text/markdown", TextExtractor())
|
||||||
self.register("text/csv", CsvExtractor())
|
self.register("text/csv", CsvExtractor())
|
||||||
|
|
@ -93,10 +93,10 @@ class ChunkerRegistry:
|
||||||
self._noop = Chunker()
|
self._noop = Chunker()
|
||||||
# Register default chunkers
|
# Register default chunkers
|
||||||
try:
|
try:
|
||||||
from .chunking.text_chunker import TextChunker
|
from .chunking.chunkerText import TextChunker
|
||||||
from .chunking.table_chunker import TableChunker
|
from .chunking.chunkerTable import TableChunker
|
||||||
from .chunking.structure_chunker import StructureChunker
|
from .chunking.chunkerStructure import StructureChunker
|
||||||
from .chunking.image_chunker import ImageChunker
|
from .chunking.chunkerImage import ImageChunker
|
||||||
self.register("text", TextChunker())
|
self.register("text", TextChunker())
|
||||||
self.register("table", TableChunker())
|
self.register("table", TableChunker())
|
||||||
self.register("structure", StructureChunker())
|
self.register("structure", StructureChunker())
|
||||||
|
|
|
||||||
|
|
@ -17,7 +17,7 @@ try:
|
||||||
except ImportError:
|
except ImportError:
|
||||||
OPENPYXL_AVAILABLE = False
|
OPENPYXL_AVAILABLE = False
|
||||||
|
|
||||||
class RendererExcel(BaseRenderer):
|
class RendererXlsx(BaseRenderer):
|
||||||
"""Renders content to Excel format using openpyxl."""
|
"""Renders content to Excel format using openpyxl."""
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
Loading…
Reference in a new issue