AI system tested for all file types

This commit is contained in:
ValueOn AG 2025-10-13 22:03:28 +02:00
parent 2e471ca3f7
commit 0c357dc8a9
15 changed files with 588 additions and 52 deletions

View file

@ -7,8 +7,28 @@ from ..subRegistry import Extractor
class BinaryExtractor(Extractor): class BinaryExtractor(Extractor):
"""
Fallback extractor for unsupported file types.
This extractor handles any file type that doesn't match other extractors.
It encodes the file as base64 and marks it as binary data.
Supported formats:
- All file types (fallback)
- MIME types: application/octet-stream (default)
- File extensions: All (fallback)
"""
def detect(self, fileName: str, mimeType: str, headBytes: bytes) -> bool: def detect(self, fileName: str, mimeType: str, headBytes: bytes) -> bool:
return True return True
def getSupportedExtensions(self) -> list[str]:
    """Return the explicitly declared file extensions.

    The list is empty on purpose: this extractor is the fallback and does
    not claim individual extensions.
    """
    declared: list[str] = []  # Accepts all extensions as fallback
    return declared
def getSupportedMimeTypes(self) -> list[str]:
    """Return the explicitly declared MIME types.

    The list is empty on purpose: this extractor is the fallback and does
    not claim individual MIME types.
    """
    declared: list[str] = []  # Accepts all MIME types as fallback
    return declared
def extract(self, fileBytes: bytes, context: Dict[str, Any]) -> List[ContentPart]: def extract(self, fileBytes: bytes, context: Dict[str, Any]) -> List[ContentPart]:
mimeType = context.get("mimeType") or "application/octet-stream" mimeType = context.get("mimeType") or "application/octet-stream"

View file

@ -6,8 +6,25 @@ from ..subRegistry import Extractor
class CsvExtractor(Extractor): class CsvExtractor(Extractor):
"""
Extractor for CSV files.
Supported formats:
- MIME types: text/csv
- File extensions: .csv
- Special handling: Treats as table data
"""
def detect(self, fileName: str, mimeType: str, headBytes: bytes) -> bool: def detect(self, fileName: str, mimeType: str, headBytes: bytes) -> bool:
return mimeType == "text/csv" or (fileName or "").lower().endswith(".csv") return mimeType == "text/csv" or (fileName or "").lower().endswith(".csv")
def getSupportedExtensions(self) -> list[str]:
    """Declare the dotted file extensions handled by the CSV extractor."""
    csv_extensions = [".csv"]
    return csv_extensions
def getSupportedMimeTypes(self) -> list[str]:
    """Declare the MIME types handled by the CSV extractor."""
    csv_mime_types = ["text/csv"]
    return csv_mime_types
def extract(self, fileBytes: bytes, context: Dict[str, Any]) -> List[ContentPart]: def extract(self, fileBytes: bytes, context: Dict[str, Any]) -> List[ContentPart]:
fileName = context.get("fileName") fileName = context.get("fileName")

View file

@ -7,6 +7,16 @@ from ..subRegistry import Extractor
class DocxExtractor(Extractor): class DocxExtractor(Extractor):
"""
Extractor for Microsoft Word documents.
Supported formats:
- MIME types: application/vnd.openxmlformats-officedocument.wordprocessingml.document
- File extensions: .docx
- Special handling: Extracts paragraphs and tables (converts tables to CSV)
- Dependencies: python-docx
"""
def __init__(self): def __init__(self):
self._loaded = False self._loaded = False
self._haveLibs = False self._haveLibs = False
@ -24,6 +34,14 @@ class DocxExtractor(Extractor):
def detect(self, fileName: str, mimeType: str, headBytes: bytes) -> bool: def detect(self, fileName: str, mimeType: str, headBytes: bytes) -> bool:
return mimeType == "application/vnd.openxmlformats-officedocument.wordprocessingml.document" or (fileName or "").lower().endswith(".docx") return mimeType == "application/vnd.openxmlformats-officedocument.wordprocessingml.document" or (fileName or "").lower().endswith(".docx")
def getSupportedExtensions(self) -> list[str]:
    """Declare the dotted file extensions handled by the Word extractor."""
    docx_extensions = [".docx"]
    return docx_extensions
def getSupportedMimeTypes(self) -> list[str]:
    """Declare the MIME types handled by the Word extractor."""
    docx_mime_type = (
        "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
    )
    return [docx_mime_type]
def extract(self, fileBytes: bytes, context: Dict[str, Any]) -> List[ContentPart]: def extract(self, fileBytes: bytes, context: Dict[str, Any]) -> List[ContentPart]:
self._load() self._load()

View file

@ -7,8 +7,26 @@ from ..subRegistry import Extractor
class HtmlExtractor(Extractor): class HtmlExtractor(Extractor):
"""
Extractor for HTML files.
Supported formats:
- MIME types: text/html
- File extensions: .html, .htm
- Special handling: Uses BeautifulSoup for parsing
- Dependencies: beautifulsoup4
"""
def detect(self, fileName: str, mimeType: str, headBytes: bytes) -> bool: def detect(self, fileName: str, mimeType: str, headBytes: bytes) -> bool:
return mimeType == "text/html" or (fileName or "").lower().endswith((".html", ".htm")) return mimeType == "text/html" or (fileName or "").lower().endswith((".html", ".htm"))
def getSupportedExtensions(self) -> list[str]:
    """Declare the dotted file extensions handled by the HTML extractor."""
    html_extensions = [".html", ".htm"]
    return html_extensions
def getSupportedMimeTypes(self) -> list[str]:
    """Declare the MIME types handled by the HTML extractor."""
    html_mime_types = ["text/html"]
    return html_mime_types
def extract(self, fileBytes: bytes, context: Dict[str, Any]) -> List[ContentPart]: def extract(self, fileBytes: bytes, context: Dict[str, Any]) -> List[ContentPart]:
mimeType = context.get("mimeType") or "text/html" mimeType = context.get("mimeType") or "text/html"

View file

@ -10,8 +10,26 @@ logger = logging.getLogger(__name__)
class ImageExtractor(Extractor): class ImageExtractor(Extractor):
"""
Extractor for image files.
Supported formats:
- MIME types: image/jpeg, image/png, image/gif, image/webp, image/bmp, image/tiff
- File extensions: .jpg, .jpeg, .png, .gif, .webp, .bmp, .tiff
- Special handling: GIF files are converted to PNG during extraction
"""
def detect(self, fileName: str, mimeType: str, headBytes: bytes) -> bool: def detect(self, fileName: str, mimeType: str, headBytes: bytes) -> bool:
return (mimeType or "").startswith("image/") return ((mimeType or "").startswith("image/") or
(fileName or "").lower().endswith((".jpg", ".jpeg", ".png", ".gif", ".webp", ".bmp", ".tiff")))
def getSupportedExtensions(self) -> list[str]:
    """Declare the dotted file extensions handled by the image extractor."""
    image_extensions = [
        ".jpg",
        ".jpeg",
        ".png",
        ".gif",
        ".webp",
        ".bmp",
        ".tiff",
    ]
    return image_extensions
def getSupportedMimeTypes(self) -> list[str]:
    """Declare the MIME types handled by the image extractor."""
    image_mime_types = [
        "image/jpeg",
        "image/png",
        "image/gif",
        "image/webp",
        "image/bmp",
        "image/tiff",
    ]
    return image_mime_types
def extract(self, fileBytes: bytes, context: Dict[str, Any]) -> List[ContentPart]: def extract(self, fileBytes: bytes, context: Dict[str, Any]) -> List[ContentPart]:
mimeType = context.get("mimeType") or "image/unknown" mimeType = context.get("mimeType") or "image/unknown"

View file

@ -7,8 +7,25 @@ from ..subRegistry import Extractor
class JsonExtractor(Extractor): class JsonExtractor(Extractor):
"""
Extractor for JSON files.
Supported formats:
- MIME types: application/json
- File extensions: .json
- Special handling: Validates JSON format, falls back to text if invalid
"""
def detect(self, fileName: str, mimeType: str, headBytes: bytes) -> bool: def detect(self, fileName: str, mimeType: str, headBytes: bytes) -> bool:
return mimeType == "application/json" or (fileName or "").lower().endswith(".json") return mimeType == "application/json" or (fileName or "").lower().endswith(".json")
def getSupportedExtensions(self) -> list[str]:
    """Declare the dotted file extensions handled by the JSON extractor."""
    json_extensions = [".json"]
    return json_extensions
def getSupportedMimeTypes(self) -> list[str]:
    """Declare the MIME types handled by the JSON extractor."""
    json_mime_types = ["application/json"]
    return json_mime_types
def extract(self, fileBytes: bytes, context: Dict[str, Any]) -> List[ContentPart]: def extract(self, fileBytes: bytes, context: Dict[str, Any]) -> List[ContentPart]:
mimeType = context.get("mimeType") or "application/json" mimeType = context.get("mimeType") or "application/json"

View file

@ -8,6 +8,16 @@ from ..subRegistry import Extractor
class PdfExtractor(Extractor): class PdfExtractor(Extractor):
"""
Extractor for PDF files.
Supported formats:
- MIME types: application/pdf
- File extensions: .pdf
- Special handling: Extracts text per page and embedded images
- Dependencies: PyPDF2, PyMuPDF (fitz)
"""
def __init__(self): def __init__(self):
self._loaded = False self._loaded = False
self._haveLibs = False self._haveLibs = False
@ -26,6 +36,14 @@ class PdfExtractor(Extractor):
def detect(self, fileName: str, mimeType: str, headBytes: bytes) -> bool: def detect(self, fileName: str, mimeType: str, headBytes: bytes) -> bool:
return mimeType == "application/pdf" or (fileName or "").lower().endswith(".pdf") return mimeType == "application/pdf" or (fileName or "").lower().endswith(".pdf")
def getSupportedExtensions(self) -> list[str]:
    """Declare the dotted file extensions handled by the PDF extractor."""
    pdf_extensions = [".pdf"]
    return pdf_extensions
def getSupportedMimeTypes(self) -> list[str]:
    """Declare the MIME types handled by the PDF extractor."""
    pdf_mime_types = ["application/pdf"]
    return pdf_mime_types
def extract(self, fileBytes: bytes, context: Dict[str, Any]) -> List[ContentPart]: def extract(self, fileBytes: bytes, context: Dict[str, Any]) -> List[ContentPart]:
self._load() self._load()

View file

@ -8,7 +8,15 @@ logger = logging.getLogger(__name__)
class PptxExtractor(Extractor): class PptxExtractor(Extractor):
"""Extractor for PowerPoint (.pptx) files using python-pptx library.""" """
Extractor for PowerPoint files.
Supported formats:
- MIME types: application/vnd.openxmlformats-officedocument.presentationml.presentation, application/vnd.ms-powerpoint
- File extensions: .pptx, .ppt
- Special handling: Extracts slide content, tables, and images
- Dependencies: python-pptx
"""
def __init__(self): def __init__(self):
self._loaded = False self._loaded = False
@ -31,6 +39,17 @@ class PptxExtractor(Extractor):
"application/vnd.ms-powerpoint" "application/vnd.ms-powerpoint"
]) or (fileName or "").lower().endswith((".pptx", ".ppt")) ]) or (fileName or "").lower().endswith((".pptx", ".ppt"))
def getSupportedExtensions(self) -> list[str]:
    """Declare the dotted file extensions handled by the PowerPoint extractor."""
    powerpoint_extensions = [".pptx", ".ppt"]
    return powerpoint_extensions
def getSupportedMimeTypes(self) -> list[str]:
    """Declare the MIME types handled by the PowerPoint extractor."""
    pptx_mime_type = (
        "application/vnd.openxmlformats-officedocument.presentationml.presentation"
    )
    ppt_mime_type = "application/vnd.ms-powerpoint"
    return [pptx_mime_type, ppt_mime_type]
def extract(self, fileBytes: bytes, context: Dict[str, Any]) -> List[ContentPart]: def extract(self, fileBytes: bytes, context: Dict[str, Any]) -> List[ContentPart]:
""" """
Extract content from PowerPoint files. Extract content from PowerPoint files.

View file

@ -0,0 +1,56 @@
from typing import Any, Dict, List
from modules.datamodels.datamodelExtraction import ContentPart
from ..subUtils import makeId
from ..subRegistry import Extractor
class SqlExtractor(Extractor):
    """
    Extractor for SQL files.

    Supported formats:
    - MIME types: text/x-sql, application/sql
    - File extensions: .sql, .ddl, .dml, .dcl, .tcl
    - Special handling: Treats as structured text with SQL syntax
    """

    # Single source of truth shared by detect() and the getSupported*() methods.
    _EXTENSIONS = (".sql", ".ddl", ".dml", ".dcl", ".tcl")
    _MIME_TYPES = ("text/x-sql", "application/sql")

    # Statement keywords reported as has_<keyword> flags in the metadata.
    _KEYWORDS = ("SELECT", "INSERT", "UPDATE", "DELETE", "CREATE", "DROP")

    def detect(self, fileName: str, mimeType: str, headBytes: bytes) -> bool:
        """Return True when the MIME type or the file extension denotes SQL.

        Args:
            fileName: Name of the file; may be empty or None.
            mimeType: Declared MIME type of the file.
            headBytes: Leading bytes of the file (not inspected here).
        """
        if mimeType in self._MIME_TYPES:
            return True
        return (fileName or "").lower().endswith(self._EXTENSIONS)

    def getSupportedExtensions(self) -> list[str]:
        """Return list of supported file extensions."""
        return list(self._EXTENSIONS)

    def getSupportedMimeTypes(self) -> list[str]:
        """Return list of supported MIME types."""
        return list(self._MIME_TYPES)

    def extract(self, fileBytes: bytes, context: Dict[str, Any]) -> List[ContentPart]:
        """Decode the SQL text and return it as a single ``structure`` part.

        The bytes are decoded as UTF-8 with undecodable bytes replaced. The
        metadata carries the byte size, the line count and one ``has_<keyword>``
        flag per statement keyword.

        Args:
            fileBytes: Raw file content.
            context: Extraction context; ``mimeType`` is read from it and
                defaults to ``text/x-sql`` when missing or empty.

        Returns:
            A one-element list holding the decoded SQL text.
        """
        import re  # local import: keeps this block self-contained

        mimeType = context.get("mimeType") or "text/x-sql"
        data = fileBytes.decode("utf-8", errors="replace")

        # One case-insensitive pass with word boundaries, so identifiers such as
        # "updated_at" or "dropdown" no longer raise has_update / has_drop.
        keyword_pattern = r"\b(" + "|".join(self._KEYWORDS) + r")\b"
        found = {
            match.group(1).upper()
            for match in re.finditer(keyword_pattern, data, re.IGNORECASE)
        }

        # Add SQL-specific metadata
        metadata = {
            "size": len(fileBytes),
            "file_type": "sql",
            "line_count": len(data.splitlines()),
        }
        for keyword in self._KEYWORDS:
            metadata[f"has_{keyword.lower()}"] = keyword in found

        return [ContentPart(
            id=makeId(),
            parentId=None,
            label="main",
            typeGroup="structure",
            mimeType=mimeType,
            data=data,
            metadata=metadata
        )]

View file

@ -6,8 +6,85 @@ from ..subRegistry import Extractor
class TextExtractor(Extractor): class TextExtractor(Extractor):
"""
Extractor for plain text files and code files.
Supported formats:
- MIME types: text/plain, text/markdown, text/x-python, text/x-java-source, text/javascript, etc.
- File extensions: .txt, .md, .log, .java, .js, .jsx, .ts, .tsx, .py, .config, .ini, .cfg, .conf, .properties, .yaml, .yml, .toml, .sh, .bat, .ps1, .sql, .css, .scss, .sass, .less, .xml, .json, .csv, .tsv, .rtf, .tex, .rst, .adoc, .org, .pod, .man, .1, .2, .3, .4, .5, .6, .7, .8, .9, .n, .l, .m, .r, .t, .x, .y, .z
"""
def detect(self, fileName: str, mimeType: str, headBytes: bytes) -> bool: def detect(self, fileName: str, mimeType: str, headBytes: bytes) -> bool:
return mimeType in ("text/plain", "text/markdown") # Check MIME types
if mimeType and mimeType.startswith("text/"):
return True
# Check file extensions
if fileName:
ext = fileName.lower()
return ext.endswith((
# Basic text files
".txt", ".md", ".log", ".rtf", ".tex", ".rst", ".adoc", ".org", ".pod",
# Programming languages
".java", ".js", ".jsx", ".ts", ".tsx", ".py", ".rb", ".go", ".rs", ".cpp", ".c", ".h", ".hpp", ".cc", ".cxx",
".cs", ".php", ".swift", ".kt", ".scala", ".clj", ".hs", ".ml", ".fs", ".vb", ".dart", ".r", ".m", ".pl", ".sh",
# Web technologies
".html", ".htm", ".css", ".scss", ".sass", ".less", ".vue", ".svelte",
# Configuration files
".config", ".ini", ".cfg", ".conf", ".properties", ".yaml", ".yml", ".toml", ".json", ".xml",
# Scripts and automation
".bat", ".ps1", ".psm1", ".psd1", ".vbs", ".wsf", ".cmd", ".com",
# Data files
".csv", ".tsv", ".tab", ".dat", ".data",
# Documentation
".man", ".1", ".2", ".3", ".4", ".5", ".6", ".7", ".8", ".9", ".n", ".l", ".m", ".r", ".t", ".x", ".y", ".z",
# Other text formats
".diff", ".patch", ".gitignore", ".dockerignore", ".editorconfig", ".gitattributes",
".env", ".env.local", ".env.development", ".env.production", ".env.test",
".lock", ".lockb", ".lockfile", ".pkg-lock", ".yarn-lock"
))
return False
def getSupportedExtensions(self) -> list[str]:
    """Return list of supported file extensions (with leading dots).

    Each extension appears exactly once. ".m" and ".r" belong to both the
    programming-language and the man-page groups, so the combined list is
    de-duplicated while keeping first-occurrence order.
    """
    grouped = [
        # Basic text files
        ".txt", ".md", ".log", ".rtf", ".tex", ".rst", ".adoc", ".org", ".pod",
        # Programming languages
        ".java", ".js", ".jsx", ".ts", ".tsx", ".py", ".rb", ".go", ".rs", ".cpp", ".c", ".h", ".hpp", ".cc", ".cxx",
        ".cs", ".php", ".swift", ".kt", ".scala", ".clj", ".hs", ".ml", ".fs", ".vb", ".dart", ".r", ".m", ".pl", ".sh",
        # Web technologies
        ".html", ".htm", ".css", ".scss", ".sass", ".less", ".vue", ".svelte",
        # Configuration files
        ".config", ".ini", ".cfg", ".conf", ".properties", ".yaml", ".yml", ".toml", ".json", ".xml",
        # Scripts and automation
        ".bat", ".ps1", ".psm1", ".psd1", ".vbs", ".wsf", ".cmd", ".com",
        # Data files
        ".csv", ".tsv", ".tab", ".dat", ".data",
        # Documentation
        ".man", ".1", ".2", ".3", ".4", ".5", ".6", ".7", ".8", ".9", ".n", ".l", ".m", ".r", ".t", ".x", ".y", ".z",
        # Other text formats
        ".diff", ".patch", ".gitignore", ".dockerignore", ".editorconfig", ".gitattributes",
        ".env", ".env.local", ".env.development", ".env.production", ".env.test",
        ".lock", ".lockb", ".lockfile", ".pkg-lock", ".yarn-lock",
    ]
    # dict.fromkeys drops repeats and preserves insertion order.
    return list(dict.fromkeys(grouped))
def getSupportedMimeTypes(self) -> list[str]:
    """Declare the MIME types handled by the text extractor.

    The ``text/*`` entries come first, followed by the ``application/x-*``
    aliases used for configuration and log files.
    """
    text_types = [
        "text/plain", "text/markdown", "text/x-python", "text/x-java-source",
        "text/javascript", "text/x-javascript", "text/typescript", "text/x-typescript",
        "text/x-c", "text/x-c++", "text/x-csharp", "text/x-php", "text/x-ruby",
        "text/x-go", "text/x-rust", "text/x-scala", "text/x-swift", "text/x-kotlin",
        "text/x-sql", "text/x-sh", "text/x-shellscript", "text/x-yaml", "text/x-toml",
        "text/x-ini", "text/x-config", "text/x-properties", "text/x-log",
        "text/html", "text/css", "text/x-scss", "text/x-sass", "text/x-less",
        "text/xml", "text/csv", "text/tab-separated-values", "text/rtf",
        "text/x-tex", "text/x-rst", "text/x-asciidoc", "text/x-org",
    ]
    application_types = [
        "application/x-yaml", "application/x-toml", "application/x-ini",
        "application/x-config", "application/x-properties", "application/x-log",
    ]
    return text_types + application_types
def extract(self, fileBytes: bytes, context: Dict[str, Any]) -> List[ContentPart]: def extract(self, fileBytes: bytes, context: Dict[str, Any]) -> List[ContentPart]:
fileName = context.get("fileName") fileName = context.get("fileName")

View file

@ -8,6 +8,16 @@ from ..subRegistry import Extractor
class XlsxExtractor(Extractor): class XlsxExtractor(Extractor):
"""
Extractor for Microsoft Excel spreadsheets.
Supported formats:
- MIME types: application/vnd.openxmlformats-officedocument.spreadsheetml.sheet
- File extensions: .xlsx, .xlsm
- Special handling: Extracts all sheets as CSV data
- Dependencies: openpyxl
"""
def __init__(self): def __init__(self):
self._loaded = False self._loaded = False
self._haveLibs = False self._haveLibs = False
@ -26,6 +36,14 @@ class XlsxExtractor(Extractor):
def detect(self, fileName: str, mimeType: str, headBytes: bytes) -> bool: def detect(self, fileName: str, mimeType: str, headBytes: bytes) -> bool:
mt = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" mt = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
return mimeType == mt or (fileName or "").lower().endswith((".xlsx", ".xlsm")) return mimeType == mt or (fileName or "").lower().endswith((".xlsx", ".xlsm"))
def getSupportedExtensions(self) -> list[str]:
    """Declare the dotted file extensions handled by the Excel extractor."""
    excel_extensions = [".xlsx", ".xlsm"]
    return excel_extensions
def getSupportedMimeTypes(self) -> list[str]:
    """Declare the MIME types handled by the Excel extractor."""
    xlsx_mime_type = (
        "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
    )
    return [xlsx_mime_type]
def extract(self, fileBytes: bytes, context: Dict[str, Any]) -> List[ContentPart]: def extract(self, fileBytes: bytes, context: Dict[str, Any]) -> List[ContentPart]:
self._load() self._load()

View file

@ -7,8 +7,25 @@ from ..subRegistry import Extractor
class XmlExtractor(Extractor): class XmlExtractor(Extractor):
"""
Extractor for XML files.
Supported formats:
- MIME types: application/xml
- File extensions: .xml, .rss, .atom
- Special handling: Uses ElementTree for parsing
"""
def detect(self, fileName: str, mimeType: str, headBytes: bytes) -> bool: def detect(self, fileName: str, mimeType: str, headBytes: bytes) -> bool:
return mimeType == "application/xml" or (fileName or "").lower().endswith((".xml", ".rss", ".atom")) return mimeType == "application/xml" or (fileName or "").lower().endswith((".xml", ".rss", ".atom"))
def getSupportedExtensions(self) -> list[str]:
    """Declare the dotted file extensions handled by the XML extractor."""
    xml_extensions = [".xml", ".rss", ".atom"]
    return xml_extensions
def getSupportedMimeTypes(self) -> list[str]:
    """Declare the MIME types handled by the XML extractor."""
    xml_mime_types = ["application/xml"]
    return xml_mime_types
def extract(self, fileBytes: bytes, context: Dict[str, Any]) -> List[ContentPart]: def extract(self, fileBytes: bytes, context: Dict[str, Any]) -> List[ContentPart]:
mimeType = context.get("mimeType") or "application/xml" mimeType = context.get("mimeType") or "application/xml"

View file

@ -7,11 +7,31 @@ logger = logging.getLogger(__name__)
class Extractor: class Extractor:
"""
Base class for all document extractors.
Each extractor should implement:
- detect(): Check if this extractor can handle the given file
- extract(): Extract content from the file
- getSupportedExtensions(): Return supported file extensions
- getSupportedMimeTypes(): Return supported MIME types
"""
def detect(self, fileName: str, mimeType: str, headBytes: bytes) -> bool: def detect(self, fileName: str, mimeType: str, headBytes: bytes) -> bool:
"""Check if this extractor can handle the given file."""
return False return False
def extract(self, fileBytes: bytes, context: Dict[str, Any]) -> list[ContentPart]: def extract(self, fileBytes: bytes, context: Dict[str, Any]) -> list[ContentPart]:
"""Extract content from the file bytes."""
raise NotImplementedError raise NotImplementedError
def getSupportedExtensions(self) -> list[str]:
    """Return the supported file extensions, each including its leading dot.

    The base implementation declares none.
    """
    no_extensions: list[str] = []
    return no_extensions
def getSupportedMimeTypes(self) -> list[str]:
    """Return the supported MIME types.

    The base implementation declares none.
    """
    no_mime_types: list[str] = []
    return no_mime_types
class Chunker: class Chunker:
@ -23,55 +43,85 @@ class ExtractorRegistry:
def __init__(self): def __init__(self):
self._map: Dict[str, Extractor] = {} self._map: Dict[str, Extractor] = {}
self._fallback: Optional[Extractor] = None self._fallback: Optional[Extractor] = None
# Register built-ins self._auto_discover_extractors()
def _auto_discover_extractors(self):
    """Auto-discover and register all extractors from the extractors directory.

    Scans the ``extractors`` directory next to this file for ``extractor*.py``
    modules, imports each one, instantiates every ``Extractor`` subclass that
    the module itself defines and passes the instance to
    ``_auto_register_extractor``. ``BinaryExtractor`` is then installed as the
    fallback.

    Nothing is raised: a module that fails to import or instantiate is skipped
    with a warning so the remaining extractors still load, and any unexpected
    error is logged together with its traceback.
    """
    try:
        import importlib
        from pathlib import Path

        # Get the extractors directory
        extractors_dir = Path(__file__).parent / "extractors"
        if not extractors_dir.exists():
            logger.error(f"Extractors directory not found: {extractors_dir}")
            return

        # Derive the package from this module so the lookup keeps working when
        # the project is imported under a different root; the historical
        # hard-coded path remains the fallback when __package__ is unset.
        if __package__:
            extractors_package = f"{__package__}.extractors"
        else:
            extractors_package = "modules.services.serviceExtraction.extractors"

        # Import all extractor modules. glob() yields entries in
        # filesystem-dependent order; sorting makes registration reproducible.
        extractor_modules = []
        for file_path in sorted(extractors_dir.glob("extractor*.py")):
            module_name = file_path.stem
            try:
                # Import the module
                module = importlib.import_module(f".{module_name}", package=extractors_package)

                # Find all extractor classes in the module
                for attr_name in dir(module):
                    if attr_name.startswith('_'):
                        continue
                    attr = getattr(module, attr_name)
                    if not (isinstance(attr, type) and issubclass(attr, Extractor)):
                        continue
                    # Skip the base class and classes merely imported into this
                    # module, so every extractor is instantiated only once.
                    if attr is Extractor or attr.__module__ != module.__name__:
                        continue

                    # Create instance and auto-register
                    extractor_instance = attr()
                    self._auto_register_extractor(extractor_instance)
                    extractor_modules.append(attr_name)

            except Exception as e:
                logger.warning(f"Failed to import {module_name}: {str(e)}")
                continue

        # Set fallback extractor
        try:
            from .extractors.extractorBinary import BinaryExtractor
            self.setFallback(BinaryExtractor())
        except Exception as e:
            logger.warning(f"Failed to set fallback extractor: {str(e)}")

        logger.info(f"ExtractorRegistry: Auto-discovered and registered {len(extractor_modules)} extractor classes: {', '.join(extractor_modules)}")
        logger.info(f"ExtractorRegistry: Total registered formats: {len(self._map)}")

    except Exception as e:
        # logger.exception records the traceback through the logging system
        # instead of printing it to stderr.
        logger.exception(f"ExtractorRegistry: Failed to auto-discover extractors: {str(e)}")
def _auto_register_extractor(self, extractor: Extractor):
    """Auto-register an extractor based on its declared supported formats.

    Every MIME type and every extension (leading dot stripped) returned by the
    extractor becomes a registry key. When a key is already held by an
    extractor of another class, the extractor declaring fewer formats in total
    keeps it. A dedicated extractor (for example CSV) therefore wins over a
    broad one (for example plain text) that lists the same format, whatever
    the registration order. On a tie the existing registration stays.

    Errors are logged and swallowed so one faulty extractor cannot abort
    discovery of the others.

    Args:
        extractor: Instance whose declared formats should be registered.
    """
    extractor_name = extractor.__class__.__name__

    def declared_count(candidate) -> int:
        # Tolerate registered objects that lack the declaration methods.
        mimes = getattr(candidate, "getSupportedMimeTypes", list)()
        exts = getattr(candidate, "getSupportedExtensions", list)()
        return len(mimes) + len(exts)

    try:
        mime_types = extractor.getSupportedMimeTypes()
        extensions = extractor.getSupportedExtensions()
        own_count = len(mime_types) + len(extensions)

        def claim(key: str) -> bool:
            # Register the key unless a more specific extractor already owns it.
            incumbent = self._map.get(key)
            if incumbent is not None and incumbent.__class__ is not extractor.__class__:
                if declared_count(incumbent) <= own_count:
                    logger.debug(
                        f"Keeping {incumbent.__class__.__name__} for '{key}' "
                        f"(more specific than {extractor_name})"
                    )
                    return False
            self.register(key, extractor)
            return True

        # Register MIME types
        for mime_type in mime_types:
            if claim(mime_type):
                logger.debug(f"Registered MIME type: {mime_type} → {extractor_name}")

        # Register file extensions
        for ext in extensions:
            # Remove leading dot for registry key
            ext_key = ext.lstrip('.')
            if claim(ext_key):
                logger.debug(f"Registered extension: .{ext_key} → {extractor_name}")

    except Exception as e:
        logger.error(f"Failed to auto-register {extractor_name}: {str(e)}")
def register(self, key: str, extractor: Extractor): def register(self, key: str, extractor: Extractor):
self._map[key] = extractor self._map[key] = extractor
@ -88,6 +138,43 @@ class ExtractorRegistry:
if ext in self._map: if ext in self._map:
return self._map[ext] return self._map[ext]
return self._fallback return self._fallback
def getAllSupportedFormats(self) -> Dict[str, Dict[str, list[str]]]:
    """
    Get all supported formats from all registered extractors.

    The registry maps many keys (MIME types and extensions) to the same
    extractor, so the result is grouped by extractor class name and every
    extractor is reported once.

    Returns:
        Dictionary with format information:
        {
            "extensions": {
                "extractor_name": [".ext1", ".ext2", ...]
            },
            "mime_types": {
                "extractor_name": ["mime/type1", "mime/type2", ...]
            }
        }
        Extractors that declare nothing are omitted. When a fallback is set,
        its lists are stored under the key "fallback" even if they are empty.
    """
    formats: Dict[str, Dict[str, list[str]]] = {"extensions": {}, "mime_types": {}}
    reported = set()

    # Get formats from registered extractors, once per extractor class
    for extractor in self._map.values():
        extractor_name = extractor.__class__.__name__
        if extractor_name in reported:
            continue
        reported.add(extractor_name)

        if hasattr(extractor, 'getSupportedExtensions'):
            extensions = extractor.getSupportedExtensions()
            if extensions:
                formats["extensions"][extractor_name] = extensions
        if hasattr(extractor, 'getSupportedMimeTypes'):
            mime_types = extractor.getSupportedMimeTypes()
            if mime_types:
                formats["mime_types"][extractor_name] = mime_types

    # Add fallback extractor info
    if self._fallback and hasattr(self._fallback, 'getSupportedExtensions'):
        formats["extensions"]["fallback"] = self._fallback.getSupportedExtensions()
    if self._fallback and hasattr(self._fallback, 'getSupportedMimeTypes'):
        formats["mime_types"]["fallback"] = self._fallback.getSupportedMimeTypes()

    return formats
class ChunkerRegistry: class ChunkerRegistry:

View file

@ -51,7 +51,24 @@ async def process_documents_and_generate_summary():
return False return False
# Find all supported document files # Find all supported document files
supported_extensions = ["*.pdf", "*.jpg", "*.jpeg", "*.png", "*.gif", "*.docx", "*.xlsx", "*.pptx", "*.ppt", "*.txt", "*.md", "*.html", "*.csv"] supported_extensions = [
# Document formats
"*.pdf", "*.docx", "*.xlsx", "*.pptx", "*.ppt",
# Image formats
"*.jpg", "*.jpeg", "*.png", "*.gif", "*.webp", "*.bmp", "*.tiff",
# Text and code files
"*.txt", "*.md", "*.log", "*.rtf", "*.tex", "*.rst", "*.adoc", "*.org", "*.pod",
"*.java", "*.js", "*.jsx", "*.ts", "*.tsx", "*.py", "*.rb", "*.go", "*.rs", "*.cpp", "*.c", "*.h", "*.hpp", "*.cc", "*.cxx",
"*.cs", "*.php", "*.swift", "*.kt", "*.scala", "*.clj", "*.hs", "*.ml", "*.fs", "*.vb", "*.dart", "*.r", "*.m", "*.pl", "*.sh",
"*.html", "*.htm", "*.css", "*.scss", "*.sass", "*.less", "*.vue", "*.svelte",
"*.config", "*.ini", "*.cfg", "*.conf", "*.properties", "*.yaml", "*.yml", "*.toml", "*.json", "*.xml",
"*.bat", "*.ps1", "*.psm1", "*.psd1", "*.vbs", "*.wsf", "*.cmd", "*.com",
"*.csv", "*.tsv", "*.tab", "*.dat", "*.data",
"*.man", "*.1", "*.2", "*.3", "*.4", "*.5", "*.6", "*.7", "*.8", "*.9", "*.n", "*.l", "*.m", "*.r", "*.t", "*.x", "*.y", "*.z",
"*.diff", "*.patch", "*.gitignore", "*.dockerignore", "*.editorconfig", "*.gitattributes",
"*.env", "*.env.local", "*.env.development", "*.env.production", "*.env.test",
"*.lock", "*.lockb", "*.lockfile", "*.pkg-lock", "*.yarn-lock"
]
document_files = [] document_files = []
for ext in supported_extensions: for ext in supported_extensions:
document_files.extend(list(testdata_path.glob(ext))) document_files.extend(list(testdata_path.glob(ext)))
@ -164,6 +181,8 @@ async def process_documents_and_generate_summary():
mime_type = "text/html" mime_type = "text/html"
elif doc_file.suffix.lower() == '.csv': elif doc_file.suffix.lower() == '.csv':
mime_type = "text/csv" mime_type = "text/csv"
elif doc_file.suffix.lower() == '.json':
mime_type = "application/json"
elif doc_file.suffix.lower() in ['.txt', '.md']: elif doc_file.suffix.lower() in ['.txt', '.md']:
mime_type = "text/plain" mime_type = "text/plain"
@ -199,7 +218,7 @@ async def process_documents_and_generate_summary():
# Run a single end-to-end test to avoid the loop issue # Run a single end-to-end test to avoid the loop issue
logger.info("🧪 Running single end-to-end test...") logger.info("🧪 Running single end-to-end test...")
userPrompt = "Analyze these documents and create a comprehensive summary for all input documents, each input document in a separate chapter summarized in 10-20 sentences." userPrompt = "Analyze the document containing mails for customer use cases. Can you create one file for each email in plain text format?"
# userPrompt = "Analyze these documents and create a fitting image for the content" # userPrompt = "Analyze these documents and create a fitting image for the content"
@ -215,8 +234,8 @@ async def process_documents_and_generate_summary():
prompt=userPrompt, prompt=userPrompt,
documents=documents, documents=documents,
options=ai_options, options=ai_options,
outputFormat="docx", outputFormat="txt",
title="Formulaire" title="Kunden und Use Cases"
) )
logger.info(f"✅ End-to-end test completed successfully") logger.info(f"✅ End-to-end test completed successfully")

117
test_extractor_formats.py Normal file
View file

@ -0,0 +1,117 @@
#!/usr/bin/env python3
"""
Test script to demonstrate enhanced extractor format support.
Shows all supported file extensions and MIME types for each extractor.
"""
import sys
import os
from pathlib import Path
# Add the gateway module to the path
sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'modules'))
from modules.services.serviceExtraction.subRegistry import ExtractorRegistry
def test_extractor_formats():
    """Test and display all supported formats from extractors.

    Builds an ExtractorRegistry, prints the declared extensions and MIME types,
    lists each registered extractor once, and shows which extractor the
    registry resolves for a set of sample file names and MIME types.
    """
    print("🔍 Testing Plug-and-Play Extractor System")
    print("=" * 60)

    # Create registry
    registry = ExtractorRegistry()

    # Get all supported formats
    formats = registry.getAllSupportedFormats()

    print("\n📋 Supported File Extensions by Extractor:")
    print("-" * 50)
    for extractor_name, extensions in formats["extensions"].items():
        if extensions:
            print(f" {extractor_name:20} → {', '.join(extensions)}")
        else:
            print(f" {extractor_name:20} → (all extensions - fallback)")

    print("\n📋 Supported MIME Types by Extractor:")
    print("-" * 50)
    for extractor_name, mime_types in formats["mime_types"].items():
        if mime_types:
            print(f" {extractor_name:20} → {', '.join(mime_types)}")
        else:
            print(f" {extractor_name:20} → (all MIME types - fallback)")

    # Test individual extractors
    print("\n🧪 Testing Individual Extractors:")
    print("-" * 50)

    # The registry maps many keys to the same extractor instance; track the
    # instances already printed so each extractor is listed only once.
    printed_ids = set()
    for extractor in registry._map.values():
        if id(extractor) in printed_ids:
            continue
        printed_ids.add(id(extractor))
        if hasattr(extractor, 'getSupportedExtensions') and hasattr(extractor, 'getSupportedMimeTypes'):
            extensions = extractor.getSupportedExtensions()
            mime_types = extractor.getSupportedMimeTypes()
            print(f"\n {extractor.__class__.__name__}:")
            print(f" Extensions: {extensions}")
            print(f" MIME Types: {mime_types}")

    # Test detection with various file types
    print("\n🔬 Testing File Detection:")
    print("-" * 50)
    test_files = [
        # Document formats
        ("document.pdf", "application/pdf"),
        ("spreadsheet.xlsx", "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"),
        ("presentation.pptx", "application/vnd.openxmlformats-officedocument.presentationml.presentation"),
        ("document.docx", "application/vnd.openxmlformats-officedocument.wordprocessingml.document"),
        # Text and code files
        ("readme.txt", "text/plain"),
        ("readme.md", "text/markdown"),
        ("app.log", "text/plain"),
        ("Main.java", "text/x-java-source"),
        ("script.js", "text/javascript"),
        ("component.tsx", "text/typescript"),
        ("main.py", "text/x-python"),
        ("config.yaml", "text/x-yaml"),
        ("package.json", "application/json"),
        ("data.csv", "text/csv"),
        ("config.xml", "application/xml"),
        ("webpage.html", "text/html"),
        ("styles.css", "text/css"),
        ("script.sh", "text/x-sh"),
        ("Dockerfile", "text/plain"),
        (".gitignore", "text/plain"),
        ("app.config", "text/plain"),
        ("database.sql", "text/x-sql"),
        ("schema.ddl", "application/sql"),
        # Images
        ("image.png", "image/png"),
        ("photo.jpg", "image/jpeg"),
        # Unknown
        ("unknown.xyz", "application/octet-stream")
    ]

    for filename, mime_type in test_files:
        extractor = registry.resolve(mime_type, filename)
        if extractor:
            print(f" {filename:25} ({mime_type:50}) → {extractor.__class__.__name__}")
        else:
            print(f" {filename:25} ({mime_type:50}) → No extractor found")

    print("\n✅ Plug-and-Play extractor system test completed!")
    print("\nKey improvements:")
    print(" • 🔌 TRUE PLUG-AND-PLAY: Just add extractor file, it auto-registers!")
    print(" • 📋 No more manual registration of file types")
    print(" • 🔍 Auto-discovery scans extractors directory")
    print(" • 📝 Each extractor declares its own supported formats")
    print(" • 🚀 Easy to add new file types - just create new extractor")
    print(" • 🧹 Clean, maintainable code with no redundancy")
    print("\nTo add a new file type:")
    print(" 1. Create extractorXyz.py in extractors/ directory")
    print(" 2. Implement Extractor interface with getSupportedExtensions()")
    print(" 3. That's it! No registry changes needed!")


if __name__ == "__main__":
    test_extractor_formats()