gateway/modules/services/serviceExtraction/subRegistry.py

121 lines
5.1 KiB
Python

from typing import Any, Dict, Optional
import logging
from modules.datamodels.datamodelExtraction import ContentPart
logger = logging.getLogger(__name__)
class Extractor:
    """Base class for content extractors.

    Subclasses override :meth:`detect` and :meth:`extract`; the base
    implementation never claims a file and cannot extract anything.
    """

    def detect(self, fileName: str, mimeType: str, headBytes: bytes) -> bool:
        """Report whether this extractor handles the given file.

        The base implementation ignores all arguments and always answers
        ``False``.
        """
        return False

    def extract(self, fileBytes: bytes, context: Dict[str, Any]) -> list[ContentPart]:
        """Turn raw file bytes into content parts.

        Raises:
            NotImplementedError: always, in this base class.
        """
        raise NotImplementedError
class Chunker:
    """Base chunker whose default behaviour is to produce no chunks."""

    def chunk(self, part: ContentPart, options: Dict[str, Any]) -> list[Dict[str, Any]]:
        """Split a content part into chunk dictionaries.

        The base implementation ignores both arguments and returns a new
        empty list on every call.
        """
        return []
class ExtractorRegistry:
    """Lookup table from MIME types and file extensions to extractors.

    MIME types (e.g. ``"application/pdf"``) and bare lower-case file
    extensions (e.g. ``"pdf"``) share a single string-keyed dictionary.
    A separate fallback extractor is returned when no key matches.
    """

    def __init__(self):
        """Create the registry and try to register the built-in extractors.

        Registration is best-effort: any exception raised while importing or
        instantiating a built-in is logged together with its traceback and
        then swallowed, so the registry keeps whatever was registered before
        the failure (possibly nothing, and possibly no fallback).
        """
        self._map: Dict[str, "Extractor"] = {}
        self._fallback: Optional["Extractor"] = None
        # Register built-ins
        try:
            from .extractors.extractorText import TextExtractor
            from .extractors.extractorCsv import CsvExtractor
            from .extractors.extractorJson import JsonExtractor
            from .extractors.extractorXml import XmlExtractor
            from .extractors.extractorHtml import HtmlExtractor
            from .extractors.extractorPdf import PdfExtractor
            from .extractors.extractorDocx import DocxExtractor
            from .extractors.extractorXlsx import XlsxExtractor
            from .extractors.extractorPptx import PptxExtractor
            from .extractors.extractorImage import ImageExtractor
            from .extractors.extractorBinary import BinaryExtractor

            # (key, extractor class) in registration order. Every key gets
            # its own freshly constructed extractor instance.
            builtins = (
                ("text/plain", TextExtractor),
                ("text/markdown", TextExtractor),
                ("text/csv", CsvExtractor),
                ("application/json", JsonExtractor),
                ("application/xml", XmlExtractor),
                ("text/html", HtmlExtractor),
                ("application/pdf", PdfExtractor),
                ("application/vnd.openxmlformats-officedocument.wordprocessingml.document", DocxExtractor),
                ("application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", XlsxExtractor),
                ("application/vnd.openxmlformats-officedocument.presentationml.presentation", PptxExtractor),
                ("application/vnd.ms-powerpoint", PptxExtractor),
                # images
                ("image/jpeg", ImageExtractor),
                ("image/png", ImageExtractor),
                ("image/gif", ImageExtractor),
                # extension fallbacks
                ("txt", TextExtractor),
                ("md", TextExtractor),
                ("csv", CsvExtractor),
                ("json", JsonExtractor),
                ("xml", XmlExtractor),
                ("html", HtmlExtractor),
                ("htm", HtmlExtractor),
                ("pdf", PdfExtractor),
                ("docx", DocxExtractor),
                ("xlsx", XlsxExtractor),
                ("xlsm", XlsxExtractor),
                ("pptx", PptxExtractor),
                ("ppt", PptxExtractor),
            )
            for key, extractorClass in builtins:
                self.register(key, extractorClass())
            # fallback
            self.setFallback(BinaryExtractor())
            logger.info("ExtractorRegistry: Successfully registered %d extractors", len(self._map))
        except Exception as e:
            # logger.exception records the traceback through the logging
            # configuration instead of printing it straight to stderr.
            logger.exception("ExtractorRegistry: Failed to register extractors: %s", e)

    def register(self, key: str, extractor: "Extractor"):
        """Associate ``key`` (a MIME type or bare extension) with ``extractor``.

        An existing entry under the same key is replaced.
        """
        self._map[key] = extractor

    def setFallback(self, extractor: "Extractor"):
        """Set the extractor returned by :meth:`resolve` when nothing matches."""
        self._fallback = extractor

    def resolve(self, mimeType: str, fileName: str) -> Optional["Extractor"]:
        """Pick the extractor for a file.

        Lookup order:
          1. ``mimeType`` exactly as given;
          2. ``mimeType`` with any ``;parameter`` suffix removed, surrounding
             whitespace stripped and lower-cased, so that
             ``"Text/Plain; charset=utf-8"`` finds ``"text/plain"``;
          3. the lower-cased text after the last ``"."`` in ``fileName``;
          4. the fallback extractor.

        Args:
            mimeType: Declared content type; ``None`` or empty is tolerated.
            fileName: File name used for the extension lookup; ``None`` or
                empty is tolerated.

        Returns:
            The matching extractor, the fallback, or ``None`` when no
            fallback has been set.
        """
        # Exact match first, so keys registered in any spelling keep working.
        if mimeType in self._map:
            return self._map[mimeType]
        if isinstance(mimeType, str):
            essence = mimeType.split(";", 1)[0].strip().lower()
            if essence in self._map:
                return self._map[essence]
        # simple extension fallback
        if fileName and "." in fileName:
            ext = fileName.lower().rsplit(".", 1)[-1]
            if ext in self._map:
                return self._map[ext]
        return self._fallback
class ChunkerRegistry:
    """Lookup table from a content type group name to its chunker.

    Unknown type groups resolve to a shared base ``Chunker`` instance whose
    ``chunk`` method returns an empty list.
    """

    def __init__(self):
        """Create the registry and try to register the default chunkers.

        Registration is best-effort: any exception raised while importing or
        instantiating a chunker is logged together with its traceback and
        then swallowed, so the registry keeps whatever was registered before
        the failure.
        """
        self._map: Dict[str, "Chunker"] = {}
        self._noop = Chunker()
        # Register default chunkers
        try:
            from .chunking.chunkerText import TextChunker
            from .chunking.chunkerTable import TableChunker
            from .chunking.chunkerStructure import StructureChunker
            from .chunking.chunkerImage import ImageChunker

            # (type group, chunker class) in registration order. Every type
            # group gets its own freshly constructed chunker instance.
            defaults = (
                ("text", TextChunker),
                ("table", TableChunker),
                ("structure", StructureChunker),
                ("image", ImageChunker),
                # Use text chunker for container and binary content
                ("container", TextChunker),
                ("binary", TextChunker),
            )
            for typeGroup, chunkerClass in defaults:
                self.register(typeGroup, chunkerClass())
        except Exception as e:
            # logger.exception records the traceback through the logging
            # configuration instead of printing it straight to stderr.
            logger.exception("ChunkerRegistry: Failed to register chunkers: %s", e)

    def register(self, typeGroup: str, chunker: "Chunker"):
        """Associate ``typeGroup`` with ``chunker``, replacing any existing entry."""
        self._map[typeGroup] = chunker

    def resolve(self, typeGroup: str) -> "Chunker":
        """Return the chunker registered for ``typeGroup``.

        Falls back to the registry's no-op chunker when the type group has
        no registration.
        """
        return self._map.get(typeGroup, self._noop)