import logging
from typing import Any, Dict, Optional

from modules.datamodels.datamodelExtraction import ContentPart

logger = logging.getLogger(__name__)


class Extractor:
    """Base class for format-specific extractors.

    Subclasses are expected to override :meth:`detect` and :meth:`extract`;
    the defaults here claim no file and refuse to extract.
    """

    def detect(self, fileName: str, mimeType: str, headBytes: bytes) -> bool:
        """Return whether this extractor handles the described file.

        The base implementation always returns ``False``.
        """
        return False

    def extract(self, fileBytes: bytes, context: Dict[str, Any]) -> list[ContentPart]:
        """Convert raw file bytes into a list of content parts.

        Raises:
            NotImplementedError: Always, in the base class.
        """
        raise NotImplementedError


class Chunker:
    """Base class for chunkers; the default implementation yields no chunks."""

    def chunk(self, part: ContentPart, options: Dict[str, Any]) -> list[Dict[str, Any]]:
        """Split ``part`` into chunks. The base implementation returns ``[]``."""
        return []


class ExtractorRegistry:
    """Maps MIME types and file extensions to :class:`Extractor` instances.

    Keys are plain strings: either a MIME type (``"text/csv"``) or a bare,
    lower-case file extension without the dot (``"csv"``). A fallback
    extractor, if set, is returned when no key matches.
    """

    def __init__(self):
        self._map: Dict[str, Extractor] = {}
        self._fallback: Optional[Extractor] = None
        self._registerBuiltins()

    def _registerBuiltins(self) -> None:
        """Register the bundled extractors on a best-effort basis.

        Any failure (missing module, failing constructor) is logged and
        otherwise ignored so that the registry can still be constructed;
        entries registered before the failure are kept.
        """
        try:
            from .formats.text_extractor import TextExtractor
            from .formats.csv_extractor import CsvExtractor
            from .formats.json_extractor import JsonExtractor
            from .formats.xml_extractor import XmlExtractor
            from .formats.html_extractor import HtmlExtractor
            from .formats.pdf_extractor import PdfExtractor
            from .formats.docx_extractor import DocxExtractor
            from .formats.xlsx_extractor import XlsxExtractor
            from .formats.image_extractor import ImageExtractor
            from .formats.binary_extractor import BinaryExtractor

            builtins = (
                # MIME types
                ("text/plain", TextExtractor),
                ("text/markdown", TextExtractor),
                ("text/csv", CsvExtractor),
                ("application/json", JsonExtractor),
                ("application/xml", XmlExtractor),
                ("text/html", HtmlExtractor),
                ("application/pdf", PdfExtractor),
                (
                    "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
                    DocxExtractor,
                ),
                (
                    "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
                    XlsxExtractor,
                ),
                # images
                ("image/jpeg", ImageExtractor),
                ("image/png", ImageExtractor),
                ("image/gif", ImageExtractor),
                # file-extension keys, consulted when the MIME type is unknown
                ("txt", TextExtractor),
                ("md", TextExtractor),
                ("csv", CsvExtractor),
                ("json", JsonExtractor),
                ("xml", XmlExtractor),
                ("html", HtmlExtractor),
                ("htm", HtmlExtractor),
                ("pdf", PdfExtractor),
                ("docx", DocxExtractor),
                ("xlsx", XlsxExtractor),
                ("xlsm", XlsxExtractor),
            )
            # One fresh instance per key, in declaration order.
            for key, extractorClass in builtins:
                self.register(key, extractorClass())

            self.setFallback(BinaryExtractor())
        except Exception:
            # Best effort: keep the registry usable, but do not hide the cause.
            logger.warning(
                "Failed to register built-in extractors; registry may be incomplete",
                exc_info=True,
            )

    def register(self, key: str, extractor: Extractor):
        """Associate ``key`` (MIME type or extension) with ``extractor``.

        An existing entry for the same key is replaced.
        """
        self._map[key] = extractor

    def setFallback(self, extractor: Extractor):
        """Set the extractor returned when no key matches."""
        self._fallback = extractor

    def resolve(self, mimeType: str, fileName: str) -> Optional[Extractor]:
        """Find the extractor for a file.

        Lookup order:
            1. ``mimeType`` exactly as given.
            2. ``mimeType`` lower-cased with any ``;parameter`` suffix removed
               (so ``"Text/Plain; charset=utf-8"`` matches ``"text/plain"``).
            3. The lower-cased text after the last ``.`` in ``fileName``.
            4. The fallback extractor (``None`` if none was set).

        Empty or ``None`` arguments are tolerated and simply skip the
        corresponding step.
        """
        if mimeType in self._map:
            return self._map[mimeType]

        if isinstance(mimeType, str):
            normalized = mimeType.split(";", 1)[0].strip().lower()
            if normalized in self._map:
                return self._map[normalized]

        if fileName and "." in fileName:
            ext = fileName.lower().rsplit(".", 1)[-1]
            if ext in self._map:
                return self._map[ext]

        return self._fallback


class ChunkerRegistry:
    """Maps a type-group name to the :class:`Chunker` responsible for it."""

    def __init__(self):
        self._map: Dict[str, Chunker] = {}
        # Returned by resolve() for unknown groups; its chunk() yields [].
        self._noop = Chunker()
        try:
            from .chunking.text_chunker import TextChunker
            from .chunking.table_chunker import TableChunker
            from .chunking.structure_chunker import StructureChunker

            self.register("text", TextChunker())
            self.register("table", TableChunker())
            self.register("structure", StructureChunker())
        except Exception:
            # Best effort: keep the registry usable, but do not hide the cause.
            logger.warning(
                "Failed to register default chunkers; registry may be incomplete",
                exc_info=True,
            )

    def register(self, typeGroup: str, chunker: Chunker):
        """Associate ``typeGroup`` with ``chunker``, replacing any existing entry."""
        self._map[typeGroup] = chunker

    def resolve(self, typeGroup: str) -> Chunker:
        """Return the chunker for ``typeGroup``, or the no-op chunker if unknown."""
        return self._map.get(typeGroup, self._noop)