121 lines
5.1 KiB
Python
121 lines
5.1 KiB
Python
from typing import Any, Dict, Optional
|
|
import logging
|
|
|
|
from modules.datamodels.datamodelExtraction import ContentPart
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
class Extractor:
    """Base interface for content extractors.

    The base implementation recognises no file and cannot extract anything;
    presumably concrete extractors override both methods (their definitions
    are not part of this module).
    """

    def detect(self, fileName: str, mimeType: str, headBytes: bytes) -> bool:
        """Report whether this extractor handles the described file.

        Args:
            fileName: Name of the file being inspected.
            mimeType: MIME type reported for the file.
            headBytes: Leading bytes of the file content.

        Returns:
            Always ``False`` in this base class.
        """
        return False

    def extract(self, fileBytes: bytes, context: Dict[str, Any]) -> list[ContentPart]:
        """Turn raw file bytes into a list of content parts.

        Args:
            fileBytes: Complete file content.
            context: Free-form extraction context.

        Raises:
            NotImplementedError: Always, in this base class.
        """
        raise NotImplementedError()
|
|
|
|
|
|
class Chunker:
    """Base chunker whose default behaviour is to produce no chunks."""

    def chunk(self, part: ContentPart, options: Dict[str, Any]) -> list[Dict[str, Any]]:
        """Split a content part into chunk dictionaries.

        Args:
            part: The content part to split.
            options: Free-form chunking options.

        Returns:
            A new, empty list in this base class.
        """
        noChunks: list[Dict[str, Any]] = list()
        return noChunks
|
|
|
|
|
|
class ExtractorRegistry:
    """Maps MIME types and file extensions to extractor instances.

    MIME types (``"application/pdf"``) and bare extensions (``"pdf"``) share
    one string-keyed lookup table.  A separate fallback extractor is returned
    by :meth:`resolve` when no key matches.
    """

    # Built-in extractors as (module path relative to this package, class
    # name, registry keys).  Every entry is loaded on its own so that one
    # extractor failing to import or construct does not stop the remaining
    # extractors (or the fallback) from being registered.
    _BUILTIN_EXTRACTORS = (
        (".extractors.extractorText", "TextExtractor",
         ("text/plain", "text/markdown", "txt", "md")),
        (".extractors.extractorCsv", "CsvExtractor",
         ("text/csv", "csv")),
        (".extractors.extractorJson", "JsonExtractor",
         ("application/json", "json")),
        (".extractors.extractorXml", "XmlExtractor",
         ("application/xml", "xml")),
        (".extractors.extractorHtml", "HtmlExtractor",
         ("text/html", "html", "htm")),
        (".extractors.extractorPdf", "PdfExtractor",
         ("application/pdf", "pdf")),
        (".extractors.extractorDocx", "DocxExtractor",
         ("application/vnd.openxmlformats-officedocument.wordprocessingml.document",
          "docx")),
        (".extractors.extractorXlsx", "XlsxExtractor",
         ("application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
          "xlsx", "xlsm")),
        (".extractors.extractorPptx", "PptxExtractor",
         ("application/vnd.openxmlformats-officedocument.presentationml.presentation",
          "application/vnd.ms-powerpoint", "pptx", "ppt")),
        (".extractors.extractorImage", "ImageExtractor",
         ("image/jpeg", "image/png", "image/gif")),
    )

    # Extractor returned by resolve() when neither MIME type nor extension match.
    _FALLBACK_EXTRACTOR = (".extractors.extractorBinary", "BinaryExtractor")

    def __init__(self):
        """Create the registry and register every available built-in extractor."""
        self._map: Dict[str, "Extractor"] = {}
        self._fallback: Optional["Extractor"] = None
        self._registerBuiltins()

    @staticmethod
    def _loadClass(modulePath: str, className: str):
        """Import ``modulePath`` relative to this package and return ``className`` from it."""
        import importlib

        module = importlib.import_module(modulePath, package=__package__)
        return getattr(module, className)

    def _registerBuiltins(self):
        """Register the built-in extractors and the fallback, isolating failures.

        A failure to import or instantiate one extractor is logged with its
        traceback and skipped; it never propagates to the caller.
        """
        unavailable = 0

        for modulePath, className, keys in self._BUILTIN_EXTRACTORS:
            try:
                extractorClass = self._loadClass(modulePath, className)
                # Each key gets its own instance.  All instances are built
                # before any is registered so a constructor failure cannot
                # leave the extractor registered under only some of its keys.
                instances = [extractorClass() for _ in keys]
            except Exception as exc:
                unavailable += 1
                logger.exception(
                    "ExtractorRegistry: Failed to register %s: %s", className, exc
                )
                continue
            for key, instance in zip(keys, instances):
                self.register(key, instance)

        try:
            fallbackClass = self._loadClass(*self._FALLBACK_EXTRACTOR)
            self.setFallback(fallbackClass())
        except Exception as exc:
            unavailable += 1
            logger.exception(
                "ExtractorRegistry: Failed to register fallback %s: %s",
                self._FALLBACK_EXTRACTOR[1],
                exc,
            )

        if unavailable:
            logger.warning(
                "ExtractorRegistry: Registered %d extractors; %d built-in extractor(s) unavailable",
                len(self._map),
                unavailable,
            )
        else:
            logger.info(f"ExtractorRegistry: Successfully registered {len(self._map)} extractors")

    def register(self, key: str, extractor: "Extractor"):
        """Associate ``key`` (a MIME type or bare extension) with ``extractor``.

        An existing registration under the same key is replaced.
        """
        self._map[key] = extractor

    def setFallback(self, extractor: "Extractor"):
        """Set the extractor returned by :meth:`resolve` when nothing matches."""
        self._fallback = extractor

    def resolve(self, mimeType: str, fileName: str) -> Optional["Extractor"]:
        """Find the extractor for a file.

        Lookup order:
          1. ``mimeType`` exactly as given;
          2. ``mimeType`` with parameters removed and lower-cased, so that
             ``"Text/Plain; charset=utf-8"`` matches ``"text/plain"``;
          3. the lower-cased extension of ``fileName``;
          4. the fallback extractor.

        Args:
            mimeType: MIME type of the file; ``None`` or empty is tolerated.
            fileName: File name; ``None`` or empty is tolerated.

        Returns:
            The matching extractor, else the fallback, which is ``None`` when
            no fallback has been set.
        """
        if mimeType in self._map:
            return self._map[mimeType]

        if isinstance(mimeType, str):
            # Drop parameters such as "; charset=utf-8" and ignore case.
            essence = mimeType.split(";", 1)[0].strip().lower()
            if essence in self._map:
                return self._map[essence]

        # simple extension fallback
        if isinstance(fileName, str) and "." in fileName:
            ext = fileName.lower().rsplit(".", 1)[-1]
            if ext in self._map:
                return self._map[ext]

        return self._fallback
|
|
|
|
|
|
class ChunkerRegistry:
    """Maps content type groups (``"text"``, ``"table"``, ...) to chunkers.

    Unknown type groups resolve to a base ``Chunker`` instance created in the
    constructor.
    """

    # Default chunkers as (module path relative to this package, class name,
    # type groups).  Every entry is loaded on its own so that one chunker
    # failing to import or construct does not stop the others from being
    # registered.
    _BUILTIN_CHUNKERS = (
        # Use text chunker for container and binary content
        (".chunking.chunkerText", "TextChunker", ("text", "container", "binary")),
        (".chunking.chunkerTable", "TableChunker", ("table",)),
        (".chunking.chunkerStructure", "StructureChunker", ("structure",)),
        (".chunking.chunkerImage", "ImageChunker", ("image",)),
    )

    def __init__(self):
        """Create the registry and register every available default chunker."""
        self._map: Dict[str, "Chunker"] = {}
        self._noop = Chunker()
        self._registerBuiltins()

    @staticmethod
    def _loadClass(modulePath: str, className: str):
        """Import ``modulePath`` relative to this package and return ``className`` from it."""
        import importlib

        module = importlib.import_module(modulePath, package=__package__)
        return getattr(module, className)

    def _registerBuiltins(self):
        """Register the default chunkers, isolating failures per chunker.

        A failure to import or instantiate one chunker is logged with its
        traceback and skipped; it never propagates to the caller.
        """
        for modulePath, className, typeGroups in self._BUILTIN_CHUNKERS:
            try:
                chunkerClass = self._loadClass(modulePath, className)
                # Each type group gets its own instance.  All instances are
                # built before any is registered so a constructor failure
                # cannot leave the chunker registered for only some groups.
                instances = [chunkerClass() for _ in typeGroups]
            except Exception as exc:
                logger.exception(
                    "ChunkerRegistry: Failed to register %s: %s", className, exc
                )
                continue
            for typeGroup, instance in zip(typeGroups, instances):
                self.register(typeGroup, instance)

    def register(self, typeGroup: str, chunker: "Chunker"):
        """Associate ``typeGroup`` with ``chunker``, replacing any existing entry."""
        self._map[typeGroup] = chunker

    def resolve(self, typeGroup: str) -> "Chunker":
        """Return the chunker registered for ``typeGroup``.

        Falls back to the base ``Chunker`` instance held by this registry
        when the type group is not registered.
        """
        return self._map.get(typeGroup, self._noop)
|
|
|
|
|