from typing import Any, Dict, Optional
import logging

from modules.datamodels.datamodelExtraction import ContentPart

logger = logging.getLogger(__name__)


class Extractor:
    """Base class for content extractors.

    Subclasses override ``detect`` and ``extract``; the base implementation
    detects nothing and cannot extract.
    """

    def detect(self, fileName: str, mimeType: str, headBytes: bytes) -> bool:
        """Return whether this extractor can handle the described file.

        The base implementation always returns ``False``.
        """
        return False

    def extract(self, fileBytes: bytes, context: Dict[str, Any]) -> list[ContentPart]:
        """Extract content parts from ``fileBytes``.

        Raises:
            NotImplementedError: always, in the base class.
        """
        raise NotImplementedError


class Chunker:
    """Base (no-op) chunker: produces no chunks for any part."""

    def chunk(self, part: ContentPart, options: Dict[str, Any]) -> list[Dict[str, Any]]:
        """Split ``part`` into chunks; the base implementation returns ``[]``."""
        return []


class ExtractorRegistry:
    """Maps MIME types and file extensions to ``Extractor`` instances.

    MIME types and bare extensions (without the dot) share a single key
    space. A fallback extractor is returned when no key matches.
    """

    def __init__(self):
        """Create the registry and register the built-in extractors.

        Registration is best-effort: if any import or constructor fails, the
        error is logged (with traceback) and the registry keeps whatever was
        registered up to that point. In that case the fallback may be unset.
        """
        self._map: Dict[str, Extractor] = {}
        self._fallback: Optional[Extractor] = None

        # Register built-ins
        try:
            from .extractors.extractorText import TextExtractor
            from .extractors.extractorCsv import CsvExtractor
            from .extractors.extractorJson import JsonExtractor
            from .extractors.extractorXml import XmlExtractor
            from .extractors.extractorHtml import HtmlExtractor
            from .extractors.extractorPdf import PdfExtractor
            from .extractors.extractorDocx import DocxExtractor
            from .extractors.extractorXlsx import XlsxExtractor
            from .extractors.extractorPptx import PptxExtractor
            from .extractors.extractorImage import ImageExtractor
            from .extractors.extractorBinary import BinaryExtractor

            # (key, extractor class) in registration order. Each key gets its
            # own extractor instance.
            registrations = [
                ("text/plain", TextExtractor),
                ("text/markdown", TextExtractor),
                ("text/csv", CsvExtractor),
                ("application/json", JsonExtractor),
                ("application/xml", XmlExtractor),
                ("text/html", HtmlExtractor),
                ("application/pdf", PdfExtractor),
                ("application/vnd.openxmlformats-officedocument.wordprocessingml.document", DocxExtractor),
                ("application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", XlsxExtractor),
                ("application/vnd.openxmlformats-officedocument.presentationml.presentation", PptxExtractor),
                # NOTE(review): legacy PowerPoint is routed to PptxExtractor;
                # confirm that extractor actually handles the old format.
                ("application/vnd.ms-powerpoint", PptxExtractor),
                # images
                ("image/jpeg", ImageExtractor),
                ("image/png", ImageExtractor),
                ("image/gif", ImageExtractor),
                # extension fallbacks
                ("txt", TextExtractor),
                ("md", TextExtractor),
                ("csv", CsvExtractor),
                ("json", JsonExtractor),
                ("xml", XmlExtractor),
                ("html", HtmlExtractor),
                ("htm", HtmlExtractor),
                ("pdf", PdfExtractor),
                ("docx", DocxExtractor),
                ("xlsx", XlsxExtractor),
                ("xlsm", XlsxExtractor),
                ("pptx", PptxExtractor),
                ("ppt", PptxExtractor),
            ]
            for key, extractorClass in registrations:
                self.register(key, extractorClass())

            # fallback
            self.setFallback(BinaryExtractor())
            logger.info(f"ExtractorRegistry: Successfully registered {len(self._map)} extractors")
        except Exception as e:
            # exc_info routes the traceback through logging instead of
            # printing it straight to stderr.
            logger.error(
                f"ExtractorRegistry: Failed to register extractors: {str(e)}",
                exc_info=True,
            )

    def register(self, key: str, extractor: Extractor):
        """Register ``extractor`` under ``key``, replacing any existing entry."""
        self._map[key] = extractor

    def setFallback(self, extractor: Extractor):
        """Set the extractor returned by ``resolve`` when nothing matches."""
        self._fallback = extractor

    def resolve(self, mimeType: str, fileName: str) -> Optional[Extractor]:
        """Find the extractor for a file.

        Lookup order:
          1. ``mimeType`` exactly as given.
          2. ``mimeType`` normalized: parameters after ``;`` dropped,
             surrounding whitespace stripped, lower-cased.
          3. The lower-cased extension of ``fileName`` (text after the last
             dot).
          4. The fallback extractor.

        Args:
            mimeType: MIME type of the file; may be empty or ``None``.
            fileName: Name of the file; may be empty or ``None``.

        Returns:
            The matching extractor, or the fallback (``None`` if no fallback
            has been set).
        """
        # Exact match first so keys registered verbatim keep working.
        if mimeType in self._map:
            return self._map[mimeType]

        # MIME types are case-insensitive and may carry parameters such as
        # "; charset=utf-8"; retry with the bare, lower-cased type.
        if isinstance(mimeType, str) and mimeType:
            normalized = mimeType.split(";", 1)[0].strip().lower()
            if normalized in self._map:
                return self._map[normalized]

        # simple extension fallback (skipped when no usable file name)
        if isinstance(fileName, str) and "." in fileName:
            ext = fileName.lower().rsplit(".", 1)[-1]
            if ext in self._map:
                return self._map[ext]

        return self._fallback


class ChunkerRegistry:
    """Maps content type groups to ``Chunker`` instances.

    Unknown type groups resolve to a no-op base ``Chunker``.
    """

    def __init__(self):
        """Create the registry and register the default chunkers.

        Registration is best-effort: on any import or constructor failure the
        error is logged (with traceback) and the registry keeps whatever was
        registered up to that point.
        """
        self._map: Dict[str, Chunker] = {}
        self._noop = Chunker()

        # Register default chunkers
        try:
            from .chunking.chunkerText import TextChunker
            from .chunking.chunkerTable import TableChunker
            from .chunking.chunkerStructure import StructureChunker
            from .chunking.chunkerImage import ImageChunker

            # (type group, chunker class) in registration order. Each group
            # gets its own chunker instance.
            registrations = [
                ("text", TextChunker),
                ("table", TableChunker),
                ("structure", StructureChunker),
                ("image", ImageChunker),
                # Use text chunker for container and binary content
                ("container", TextChunker),
                ("binary", TextChunker),
            ]
            for typeGroup, chunkerClass in registrations:
                self.register(typeGroup, chunkerClass())
        except Exception as e:
            # exc_info routes the traceback through logging instead of
            # printing it straight to stderr.
            logger.error(
                f"ChunkerRegistry: Failed to register chunkers: {str(e)}",
                exc_info=True,
            )

    def register(self, typeGroup: str, chunker: Chunker):
        """Register ``chunker`` for ``typeGroup``, replacing any existing entry."""
        self._map[typeGroup] = chunker

    def resolve(self, typeGroup: str) -> Chunker:
        """Return the chunker for ``typeGroup``, or the no-op chunker if unknown."""
        return self._map.get(typeGroup, self._noop)