gateway/modules/services/serviceExtraction/subRegistry.py

103 lines
4 KiB
Python

import importlib
import logging
from typing import Any, Dict, Optional

from .types import ContentPart
class Extractor:
    """Base interface for content extractors.

    The defaults here never claim a file and cannot extract anything;
    presumably concrete extractors override both methods.
    """

    def detect(self, fileName: str, mimeType: str, headBytes: bytes) -> bool:
        """Report whether this extractor handles the described file.

        The base implementation ignores all arguments and always answers
        ``False``.
        """
        return False

    def extract(self, fileBytes: bytes, context: Dict[str, Any]) -> list[ContentPart]:
        """Turn raw file bytes into a list of ``ContentPart`` objects.

        Raises:
            NotImplementedError: always, in this base implementation.
        """
        raise NotImplementedError()
class Chunker:
    """Base chunker whose default behaviour is to produce no chunks."""

    def chunk(self, part: ContentPart, options: Dict[str, Any]) -> list[Dict[str, Any]]:
        """Split ``part`` into chunk dictionaries.

        The base implementation ignores both arguments and returns a new
        empty list on every call.
        """
        return list()
class ExtractorRegistry:
    """Registry mapping MIME types and file extensions to extractors.

    MIME types and bare extensions (without the leading dot) share a single
    key space. On construction the registry tries to register the built-in
    extractors. Each built-in is loaded independently: one whose module cannot
    be imported, or whose constructor fails, is skipped with a logged warning
    instead of leaving the whole registry empty.
    """

    # (module path relative to this package, class name, keys to register under)
    _BUILTINS = (
        (".formats.text_extractor", "TextExtractor",
         ("text/plain", "text/markdown", "txt", "md")),
        (".formats.csv_extractor", "CsvExtractor",
         ("text/csv", "csv")),
        (".formats.json_extractor", "JsonExtractor",
         ("application/json", "json")),
        (".formats.xml_extractor", "XmlExtractor",
         ("application/xml", "xml")),
        (".formats.html_extractor", "HtmlExtractor",
         ("text/html", "html", "htm")),
        (".formats.pdf_extractor", "PdfExtractor",
         ("application/pdf", "pdf")),
        (".formats.docx_extractor", "DocxExtractor",
         ("application/vnd.openxmlformats-officedocument.wordprocessingml.document", "docx")),
        (".formats.xlsx_extractor", "XlsxExtractor",
         ("application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", "xlsx", "xlsm")),
        (".formats.image_extractor", "ImageExtractor",
         ("image/jpeg", "image/png", "image/gif")),
    )

    # Extractor returned when neither the MIME type nor the extension matches.
    _BUILTIN_FALLBACK = (".formats.binary_extractor", "BinaryExtractor")

    def __init__(self):
        """Create an empty registry and register whichever built-ins load."""
        self._map: Dict[str, "Extractor"] = {}
        self._fallback: Optional["Extractor"] = None
        self._registerBuiltins()

    @staticmethod
    def _loadClass(modulePath: str, className: str):
        """Import ``modulePath`` relative to this package and return ``className``."""
        # This module's dotted name minus its last component is its package.
        package = __name__.rpartition(".")[0]
        return getattr(importlib.import_module(modulePath, package), className)

    def _registerBuiltins(self) -> None:
        """Register every built-in extractor that can be imported and built."""
        log = logging.getLogger(__name__)
        for modulePath, className, keys in self._BUILTINS:
            try:
                extractorCls = self._loadClass(modulePath, className)
                # Each key gets its own instance.
                instances = [extractorCls() for _ in keys]
            except Exception as err:
                # Deliberately broad: registration is best-effort, and one
                # unavailable extractor must not disable the others.
                log.warning("Built-in extractor %s unavailable: %s", className, err)
                continue
            for key, instance in zip(keys, instances):
                self.register(key, instance)

        modulePath, className = self._BUILTIN_FALLBACK
        try:
            self.setFallback(self._loadClass(modulePath, className)())
        except Exception as err:
            log.warning("Fallback extractor %s unavailable: %s", className, err)

    def register(self, key: str, extractor: "Extractor"):
        """Register ``extractor`` under ``key``, replacing any previous entry.

        ``key`` is either a MIME type or a bare file extension.
        """
        self._map[key] = extractor

    def setFallback(self, extractor: "Extractor"):
        """Set the extractor returned by ``resolve`` when nothing matches."""
        self._fallback = extractor

    def resolve(self, mimeType: str, fileName: str) -> Optional["Extractor"]:
        """Pick an extractor for a file.

        Lookup order:
          1. ``mimeType`` exactly as given;
          2. ``mimeType`` with parameters (``; charset=...``) stripped and
             lower-cased, since media types are case-insensitive;
          3. the lower-cased extension of ``fileName``;
          4. the fallback extractor.

        Returns:
            The matching extractor, or the fallback (``None`` when no
            fallback has been set).
        """
        if mimeType in self._map:
            return self._map[mimeType]
        if mimeType and isinstance(mimeType, str):
            normalized = mimeType.split(";", 1)[0].strip().lower()
            if normalized in self._map:
                return self._map[normalized]
        # Guard against None/empty names before looking for an extension.
        if fileName and "." in fileName:
            ext = fileName.lower().rsplit(".", 1)[-1]
            if ext in self._map:
                return self._map[ext]
        return self._fallback
class ChunkerRegistry:
    """Registry mapping a content type group to the chunker that handles it.

    On construction the registry tries to register the default chunkers.
    Each default is loaded independently: one that cannot be imported or
    constructed is skipped with a logged warning while the others are still
    registered. Unknown type groups resolve to a no-op base ``Chunker``.
    """

    # (type group, module path relative to this package, class name)
    _DEFAULTS = (
        ("text", ".chunking.text_chunker", "TextChunker"),
        ("table", ".chunking.table_chunker", "TableChunker"),
        ("structure", ".chunking.structure_chunker", "StructureChunker"),
    )

    def __init__(self):
        """Create the registry and register whichever default chunkers load."""
        self._map: Dict[str, "Chunker"] = {}
        # Returned by resolve() for type groups with no registered chunker.
        self._noop = Chunker()
        self._registerDefaults()

    def _registerDefaults(self) -> None:
        """Register every default chunker that can be imported and built."""
        log = logging.getLogger(__name__)
        # This module's dotted name minus its last component is its package.
        package = __name__.rpartition(".")[0]
        for typeGroup, modulePath, className in self._DEFAULTS:
            try:
                module = importlib.import_module(modulePath, package)
                chunker = getattr(module, className)()
            except Exception as err:
                # Deliberately broad: registration is best-effort, and one
                # unavailable chunker must not disable the others.
                log.warning("Default chunker %s unavailable: %s", className, err)
                continue
            self.register(typeGroup, chunker)

    def register(self, typeGroup: str, chunker: "Chunker"):
        """Register ``chunker`` for ``typeGroup``, replacing any previous entry."""
        self._map[typeGroup] = chunker

    def resolve(self, typeGroup: str) -> "Chunker":
        """Return the chunker for ``typeGroup``, or the no-op chunker if none."""
        return self._map.get(typeGroup, self._noop)