AI system tested for all file types

2025-10-13 22:03:28 +02:00 · 2025-10-13 22:03:28 +02:00 · 0c357dc8a9
commit 0c357dc8a9
parent 2e471ca3f7
15 changed files with 588 additions and 52 deletions
--- a/modules/services/serviceExtraction/extractors/extractorBinary.py
+++ b/modules/services/serviceExtraction/extractors/extractorBinary.py
@ -7,9 +7,29 @@ from ..subRegistry import Extractor


 class BinaryExtractor(Extractor):
+    """
+    Fallback extractor for unsupported file types.
+    
+    This extractor handles any file type that doesn't match other extractors.
+    It encodes the file as base64 and marks it as binary data.
+    
+    Supported formats:
+    - All file types (fallback)
+    - MIME types: application/octet-stream (default)
+    - File extensions: All (fallback)
+    """
+    
    def detect(self, fileName: str, mimeType: str, headBytes: bytes) -> bool:
        return True
    
+    def getSupportedExtensions(self) -> list[str]:
+        """Return list of supported file extensions (all)."""
+        return []  # Accepts all extensions as fallback
+    
+    def getSupportedMimeTypes(self) -> list[str]:
+        """Return list of supported MIME types (all)."""
+        return []  # Accepts all MIME types as fallback
+
    def extract(self, fileBytes: bytes, context: Dict[str, Any]) -> List[ContentPart]:
        mimeType = context.get("mimeType") or "application/octet-stream"
        return [ContentPart(
--- a/modules/services/serviceExtraction/extractors/extractorCsv.py
+++ b/modules/services/serviceExtraction/extractors/extractorCsv.py
@ -6,9 +6,26 @@ from ..subRegistry import Extractor


 class CsvExtractor(Extractor):
+    """
+    Extractor for CSV files.
+    
+    Supported formats:
+    - MIME types: text/csv
+    - File extensions: .csv
+    - Special handling: Treats as table data
+    """
+    
    def detect(self, fileName: str, mimeType: str, headBytes: bytes) -> bool:
        return mimeType == "text/csv" or (fileName or "").lower().endswith(".csv")
    
+    def getSupportedExtensions(self) -> list[str]:
+        """Return list of supported file extensions."""
+        return [".csv"]
+    
+    def getSupportedMimeTypes(self) -> list[str]:
+        """Return list of supported MIME types."""
+        return ["text/csv"]
+
    def extract(self, fileBytes: bytes, context: Dict[str, Any]) -> List[ContentPart]:
        fileName = context.get("fileName")
        mimeType = context.get("mimeType") or "text/csv"
--- a/modules/services/serviceExtraction/extractors/extractorDocx.py
+++ b/modules/services/serviceExtraction/extractors/extractorDocx.py
@ -7,6 +7,16 @@ from ..subRegistry import Extractor


 class DocxExtractor(Extractor):
+    """
+    Extractor for Microsoft Word documents.
+    
+    Supported formats:
+    - MIME types: application/vnd.openxmlformats-officedocument.wordprocessingml.document
+    - File extensions: .docx
+    - Special handling: Extracts paragraphs and tables (converts tables to CSV)
+    - Dependencies: python-docx
+    """
+    
    def __init__(self):
        self._loaded = False
        self._haveLibs = False
@ -25,6 +35,14 @@ class DocxExtractor(Extractor):
    def detect(self, fileName: str, mimeType: str, headBytes: bytes) -> bool:
        return mimeType == "application/vnd.openxmlformats-officedocument.wordprocessingml.document" or (fileName or "").lower().endswith(".docx")
    
+    def getSupportedExtensions(self) -> list[str]:
+        """Return list of supported file extensions."""
+        return [".docx"]
+    
+    def getSupportedMimeTypes(self) -> list[str]:
+        """Return list of supported MIME types."""
+        return ["application/vnd.openxmlformats-officedocument.wordprocessingml.document"]
+
    def extract(self, fileBytes: bytes, context: Dict[str, Any]) -> List[ContentPart]:
        self._load()
        parts: List[ContentPart] = []
--- a/modules/services/serviceExtraction/extractors/extractorHtml.py
+++ b/modules/services/serviceExtraction/extractors/extractorHtml.py
@ -7,9 +7,27 @@ from ..subRegistry import Extractor


 class HtmlExtractor(Extractor):
+    """
+    Extractor for HTML files.
+    
+    Supported formats:
+    - MIME types: text/html
+    - File extensions: .html, .htm
+    - Special handling: Uses BeautifulSoup for parsing
+    - Dependencies: beautifulsoup4
+    """
+    
    def detect(self, fileName: str, mimeType: str, headBytes: bytes) -> bool:
        return mimeType == "text/html" or (fileName or "").lower().endswith((".html", ".htm"))
    
+    def getSupportedExtensions(self) -> list[str]:
+        """Return list of supported file extensions."""
+        return [".html", ".htm"]
+    
+    def getSupportedMimeTypes(self) -> list[str]:
+        """Return list of supported MIME types."""
+        return ["text/html"]
+
    def extract(self, fileBytes: bytes, context: Dict[str, Any]) -> List[ContentPart]:
        mimeType = context.get("mimeType") or "text/html"
        text = fileBytes.decode("utf-8", errors="replace")
--- a/modules/services/serviceExtraction/extractors/extractorImage.py
+++ b/modules/services/serviceExtraction/extractors/extractorImage.py
@ -10,8 +10,26 @@ logger = logging.getLogger(__name__)


 class ImageExtractor(Extractor):
+    """
+    Extractor for image files.
+    
+    Supported formats:
+    - MIME types: image/jpeg, image/png, image/gif, image/webp, image/bmp, image/tiff
+    - File extensions: .jpg, .jpeg, .png, .gif, .webp, .bmp, .tiff
+    - Special handling: GIF files are converted to PNG during extraction
+    """
+    
    def detect(self, fileName: str, mimeType: str, headBytes: bytes) -> bool:
-        return (mimeType or "").startswith("image/")
+        return ((mimeType or "").startswith("image/") or 
+                (fileName or "").lower().endswith((".jpg", ".jpeg", ".png", ".gif", ".webp", ".bmp", ".tiff")))
+    
+    def getSupportedExtensions(self) -> list[str]:
+        """Return list of supported file extensions."""
+        return [".jpg", ".jpeg", ".png", ".gif", ".webp", ".bmp", ".tiff"]
+    
+    def getSupportedMimeTypes(self) -> list[str]:
+        """Return list of supported MIME types."""
+        return ["image/jpeg", "image/png", "image/gif", "image/webp", "image/bmp", "image/tiff"]

    def extract(self, fileBytes: bytes, context: Dict[str, Any]) -> List[ContentPart]:
        mimeType = context.get("mimeType") or "image/unknown"
--- a/modules/services/serviceExtraction/extractors/extractorJson.py
+++ b/modules/services/serviceExtraction/extractors/extractorJson.py
@ -7,9 +7,26 @@ from ..subRegistry import Extractor


 class JsonExtractor(Extractor):
+    """
+    Extractor for JSON files.
+    
+    Supported formats:
+    - MIME types: application/json
+    - File extensions: .json
+    - Special handling: Validates JSON format, falls back to text if invalid
+    """
+    
    def detect(self, fileName: str, mimeType: str, headBytes: bytes) -> bool:
        return mimeType == "application/json" or (fileName or "").lower().endswith(".json")
    
+    def getSupportedExtensions(self) -> list[str]:
+        """Return list of supported file extensions."""
+        return [".json"]
+    
+    def getSupportedMimeTypes(self) -> list[str]:
+        """Return list of supported MIME types."""
+        return ["application/json"]
+
    def extract(self, fileBytes: bytes, context: Dict[str, Any]) -> List[ContentPart]:
        mimeType = context.get("mimeType") or "application/json"
        text = fileBytes.decode("utf-8", errors="replace")
--- a/modules/services/serviceExtraction/extractors/extractorPdf.py
+++ b/modules/services/serviceExtraction/extractors/extractorPdf.py
@ -8,6 +8,16 @@ from ..subRegistry import Extractor


 class PdfExtractor(Extractor):
+    """
+    Extractor for PDF files.
+    
+    Supported formats:
+    - MIME types: application/pdf
+    - File extensions: .pdf
+    - Special handling: Extracts text per page and embedded images
+    - Dependencies: PyPDF2, PyMuPDF (fitz)
+    """
+    
    def __init__(self):
        self._loaded = False
        self._haveLibs = False
@ -27,6 +37,14 @@ class PdfExtractor(Extractor):
    def detect(self, fileName: str, mimeType: str, headBytes: bytes) -> bool:
        return mimeType == "application/pdf" or (fileName or "").lower().endswith(".pdf")
    
+    def getSupportedExtensions(self) -> list[str]:
+        """Return list of supported file extensions."""
+        return [".pdf"]
+    
+    def getSupportedMimeTypes(self) -> list[str]:
+        """Return list of supported MIME types."""
+        return ["application/pdf"]
+
    def extract(self, fileBytes: bytes, context: Dict[str, Any]) -> List[ContentPart]:
        self._load()
        parts: List[ContentPart] = []
--- a/modules/services/serviceExtraction/extractors/extractorPptx.py
+++ b/modules/services/serviceExtraction/extractors/extractorPptx.py
@ -8,7 +8,15 @@ logger = logging.getLogger(__name__)


 class PptxExtractor(Extractor):
-    """Extractor for PowerPoint (.pptx) files using python-pptx library."""
+    """
+    Extractor for PowerPoint files.
+    
+    Supported formats:
+    - MIME types: application/vnd.openxmlformats-officedocument.presentationml.presentation, application/vnd.ms-powerpoint
+    - File extensions: .pptx, .ppt
+    - Special handling: Extracts slide content, tables, and images
+    - Dependencies: python-pptx
+    """
    
    def __init__(self):
        self._loaded = False
@ -31,6 +39,17 @@ class PptxExtractor(Extractor):
            "application/vnd.ms-powerpoint"
        ]) or (fileName or "").lower().endswith((".pptx", ".ppt"))
    
+    def getSupportedExtensions(self) -> list[str]:
+        """Return list of supported file extensions."""
+        return [".pptx", ".ppt"]
+    
+    def getSupportedMimeTypes(self) -> list[str]:
+        """Return list of supported MIME types."""
+        return [
+            "application/vnd.openxmlformats-officedocument.presentationml.presentation",
+            "application/vnd.ms-powerpoint"
+        ]
+    
    def extract(self, fileBytes: bytes, context: Dict[str, Any]) -> List[ContentPart]:
        """
        Extract content from PowerPoint files.
--- a/modules/services/serviceExtraction/extractors/extractorSql.py
+++ b/modules/services/serviceExtraction/extractors/extractorSql.py
@ -0,0 +1,56 @@
+from typing import Any, Dict, List
+
+from modules.datamodels.datamodelExtraction import ContentPart
+from ..subUtils import makeId
+from ..subRegistry import Extractor
+
+
+class SqlExtractor(Extractor):
+    """
+    Extractor for SQL files.
+    
+    Supported formats:
+    - MIME types: text/x-sql, application/sql
+    - File extensions: .sql, .ddl, .dml, .dcl, .tcl
+    - Special handling: Treats as structured text with SQL syntax
+    """
+    
+    def detect(self, fileName: str, mimeType: str, headBytes: bytes) -> bool:
+        return (mimeType in ("text/x-sql", "application/sql") or 
+                (fileName or "").lower().endswith((".sql", ".ddl", ".dml", ".dcl", ".tcl")))
+    
+    def getSupportedExtensions(self) -> list[str]:
+        """Return list of supported file extensions."""
+        return [".sql", ".ddl", ".dml", ".dcl", ".tcl"]
+    
+    def getSupportedMimeTypes(self) -> list[str]:
+        """Return list of supported MIME types."""
+        return ["text/x-sql", "application/sql"]
+
+    def extract(self, fileBytes: bytes, context: Dict[str, Any]) -> List[ContentPart]:
+        fileName = context.get("fileName")
+        mimeType = context.get("mimeType") or "text/x-sql"
+        data = fileBytes.decode("utf-8", errors="replace")
+        
+        # Add SQL-specific metadata
+        metadata = {
+            "size": len(fileBytes),
+            "file_type": "sql",
+            "line_count": len(data.splitlines()),
+            "has_select": "SELECT" in data.upper(),
+            "has_insert": "INSERT" in data.upper(),
+            "has_update": "UPDATE" in data.upper(),
+            "has_delete": "DELETE" in data.upper(),
+            "has_create": "CREATE" in data.upper(),
+            "has_drop": "DROP" in data.upper()
+        }
+        
+        return [ContentPart(
+            id=makeId(),
+            parentId=None,
+            label="main",
+            typeGroup="structure",
+            mimeType=mimeType,
+            data=data,
+            metadata=metadata
+        )]
--- a/modules/services/serviceExtraction/extractors/extractorText.py
+++ b/modules/services/serviceExtraction/extractors/extractorText.py
@ -6,8 +6,85 @@ from ..subRegistry import Extractor


 class TextExtractor(Extractor):
+    """
+    Extractor for plain text files and code files.
+    
+    Supported formats:
+    - MIME types: text/plain, text/markdown, text/x-python, text/x-java-source, text/javascript, etc.
+    - File extensions: .txt, .md, .log, .java, .js, .jsx, .ts, .tsx, .py, .config, .ini, .cfg, .conf, .properties, .yaml, .yml, .toml, .sh, .bat, .ps1, .sql, .css, .scss, .sass, .less, .xml, .json, .csv, .tsv, .rtf, .tex, .rst, .adoc, .org, .pod, .man, .1, .2, .3, .4, .5, .6, .7, .8, .9, .n, .l, .m, .r, .t, .x, .y, .z
+    """
+    
    def detect(self, fileName: str, mimeType: str, headBytes: bytes) -> bool:
-        return mimeType in ("text/plain", "text/markdown")
+        # Check MIME types
+        if mimeType and mimeType.startswith("text/"):
+            return True
+        
+        # Check file extensions
+        if fileName:
+            ext = fileName.lower()
+            return ext.endswith((
+                # Basic text files
+                ".txt", ".md", ".log", ".rtf", ".tex", ".rst", ".adoc", ".org", ".pod",
+                # Programming languages
+                ".java", ".js", ".jsx", ".ts", ".tsx", ".py", ".rb", ".go", ".rs", ".cpp", ".c", ".h", ".hpp", ".cc", ".cxx",
+                ".cs", ".php", ".swift", ".kt", ".scala", ".clj", ".hs", ".ml", ".fs", ".vb", ".dart", ".r", ".m", ".pl", ".sh",
+                # Web technologies
+                ".html", ".htm", ".css", ".scss", ".sass", ".less", ".vue", ".svelte",
+                # Configuration files
+                ".config", ".ini", ".cfg", ".conf", ".properties", ".yaml", ".yml", ".toml", ".json", ".xml",
+                # Scripts and automation
+                ".bat", ".ps1", ".psm1", ".psd1", ".vbs", ".wsf", ".cmd", ".com",
+                # Data files
+                ".csv", ".tsv", ".tab", ".dat", ".data",
+                # Documentation
+                ".man", ".1", ".2", ".3", ".4", ".5", ".6", ".7", ".8", ".9", ".n", ".l", ".m", ".r", ".t", ".x", ".y", ".z",
+                # Other text formats
+                ".diff", ".patch", ".gitignore", ".dockerignore", ".editorconfig", ".gitattributes",
+                ".env", ".env.local", ".env.development", ".env.production", ".env.test",
+                ".lock", ".lockb", ".lockfile", ".pkg-lock", ".yarn-lock"
+            ))
+        
+        return False
+    
+    def getSupportedExtensions(self) -> list[str]:
+        """Return list of supported file extensions."""
+        return [
+            # Basic text files
+            ".txt", ".md", ".log", ".rtf", ".tex", ".rst", ".adoc", ".org", ".pod",
+            # Programming languages
+            ".java", ".js", ".jsx", ".ts", ".tsx", ".py", ".rb", ".go", ".rs", ".cpp", ".c", ".h", ".hpp", ".cc", ".cxx",
+            ".cs", ".php", ".swift", ".kt", ".scala", ".clj", ".hs", ".ml", ".fs", ".vb", ".dart", ".r", ".m", ".pl", ".sh",
+            # Web technologies
+            ".html", ".htm", ".css", ".scss", ".sass", ".less", ".vue", ".svelte",
+            # Configuration files
+            ".config", ".ini", ".cfg", ".conf", ".properties", ".yaml", ".yml", ".toml", ".json", ".xml",
+            # Scripts and automation
+            ".bat", ".ps1", ".psm1", ".psd1", ".vbs", ".wsf", ".cmd", ".com",
+            # Data files
+            ".csv", ".tsv", ".tab", ".dat", ".data",
+            # Documentation
+            ".man", ".1", ".2", ".3", ".4", ".5", ".6", ".7", ".8", ".9", ".n", ".l", ".m", ".r", ".t", ".x", ".y", ".z",
+            # Other text formats
+            ".diff", ".patch", ".gitignore", ".dockerignore", ".editorconfig", ".gitattributes",
+            ".env", ".env.local", ".env.development", ".env.production", ".env.test",
+            ".lock", ".lockb", ".lockfile", ".pkg-lock", ".yarn-lock"
+        ]
+    
+    def getSupportedMimeTypes(self) -> list[str]:
+        """Return list of supported MIME types."""
+        return [
+            "text/plain", "text/markdown", "text/x-python", "text/x-java-source", 
+            "text/javascript", "text/x-javascript", "text/typescript", "text/x-typescript",
+            "text/x-c", "text/x-c++", "text/x-csharp", "text/x-php", "text/x-ruby",
+            "text/x-go", "text/x-rust", "text/x-scala", "text/x-swift", "text/x-kotlin",
+            "text/x-sql", "text/x-sh", "text/x-shellscript", "text/x-yaml", "text/x-toml",
+            "text/x-ini", "text/x-config", "text/x-properties", "text/x-log",
+            "text/html", "text/css", "text/x-scss", "text/x-sass", "text/x-less",
+            "text/xml", "text/csv", "text/tab-separated-values", "text/rtf",
+            "text/x-tex", "text/x-rst", "text/x-asciidoc", "text/x-org",
+            "application/x-yaml", "application/x-toml", "application/x-ini",
+            "application/x-config", "application/x-properties", "application/x-log"
+        ]

    def extract(self, fileBytes: bytes, context: Dict[str, Any]) -> List[ContentPart]:
        fileName = context.get("fileName")
--- a/modules/services/serviceExtraction/extractors/extractorXlsx.py
+++ b/modules/services/serviceExtraction/extractors/extractorXlsx.py
@ -8,6 +8,16 @@ from ..subRegistry import Extractor


 class XlsxExtractor(Extractor):
+    """
+    Extractor for Microsoft Excel spreadsheets.
+    
+    Supported formats:
+    - MIME types: application/vnd.openxmlformats-officedocument.spreadsheetml.sheet
+    - File extensions: .xlsx, .xlsm
+    - Special handling: Extracts all sheets as CSV data
+    - Dependencies: openpyxl
+    """
+    
    def __init__(self):
        self._loaded = False
        self._haveLibs = False
@ -27,6 +37,14 @@ class XlsxExtractor(Extractor):
        mt = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
        return mimeType == mt or (fileName or "").lower().endswith((".xlsx", ".xlsm"))
    
+    def getSupportedExtensions(self) -> list[str]:
+        """Return list of supported file extensions."""
+        return [".xlsx", ".xlsm"]
+    
+    def getSupportedMimeTypes(self) -> list[str]:
+        """Return list of supported MIME types."""
+        return ["application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"]
+
    def extract(self, fileBytes: bytes, context: Dict[str, Any]) -> List[ContentPart]:
        self._load()
        parts: List[ContentPart] = []
--- a/modules/services/serviceExtraction/extractors/extractorXml.py
+++ b/modules/services/serviceExtraction/extractors/extractorXml.py
@ -7,9 +7,26 @@ from ..subRegistry import Extractor


 class XmlExtractor(Extractor):
+    """
+    Extractor for XML files.
+    
+    Supported formats:
+    - MIME types: application/xml
+    - File extensions: .xml, .rss, .atom
+    - Special handling: Uses ElementTree for parsing
+    """
+    
    def detect(self, fileName: str, mimeType: str, headBytes: bytes) -> bool:
        return mimeType == "application/xml" or (fileName or "").lower().endswith((".xml", ".rss", ".atom"))
    
+    def getSupportedExtensions(self) -> list[str]:
+        """Return list of supported file extensions."""
+        return [".xml", ".rss", ".atom"]
+    
+    def getSupportedMimeTypes(self) -> list[str]:
+        """Return list of supported MIME types."""
+        return ["application/xml"]
+
    def extract(self, fileBytes: bytes, context: Dict[str, Any]) -> List[ContentPart]:
        mimeType = context.get("mimeType") or "application/xml"
        text = fileBytes.decode("utf-8", errors="replace")
--- a/modules/services/serviceExtraction/subRegistry.py
+++ b/modules/services/serviceExtraction/subRegistry.py
@ -7,12 +7,32 @@ logger = logging.getLogger(__name__)


 class Extractor:
+    """
+    Base class for all document extractors.
+    
+    Each extractor should implement:
+    - detect(): Check if this extractor can handle the given file
+    - extract(): Extract content from the file
+    - getSupportedExtensions(): Return supported file extensions
+    - getSupportedMimeTypes(): Return supported MIME types
+    """
+    
    def detect(self, fileName: str, mimeType: str, headBytes: bytes) -> bool:
+        """Check if this extractor can handle the given file."""
        return False

    def extract(self, fileBytes: bytes, context: Dict[str, Any]) -> list[ContentPart]:
+        """Extract content from the file bytes."""
        raise NotImplementedError
    
+    def getSupportedExtensions(self) -> list[str]:
+        """Return list of supported file extensions (including dots)."""
+        return []
+    
+    def getSupportedMimeTypes(self) -> list[str]:
+        """Return list of supported MIME types."""
+        return []
+

 class Chunker:
    def chunk(self, part: ContentPart, options: Dict[str, Any]) -> list[Dict[str, Any]]:
@ -23,56 +43,86 @@ class ExtractorRegistry:
    def __init__(self):
        self._map: Dict[str, Extractor] = {}
        self._fallback: Optional[Extractor] = None
-        # Register built-ins
+        self._auto_discover_extractors()
+    
+    def _auto_discover_extractors(self):
+        """Auto-discover and register all extractors from the extractors directory."""
        try:
-            from .extractors.extractorText import TextExtractor
-            from .extractors.extractorCsv import CsvExtractor
-            from .extractors.extractorJson import JsonExtractor
-            from .extractors.extractorXml import XmlExtractor
-            from .extractors.extractorHtml import HtmlExtractor
-            from .extractors.extractorPdf import PdfExtractor
-            from .extractors.extractorDocx import DocxExtractor
-            from .extractors.extractorXlsx import XlsxExtractor
-            from .extractors.extractorPptx import PptxExtractor
-            from .extractors.extractorImage import ImageExtractor
-            from .extractors.extractorBinary import BinaryExtractor
-            self.register("text/plain", TextExtractor())
-            self.register("text/markdown", TextExtractor())
-            self.register("text/csv", CsvExtractor())
-            self.register("application/json", JsonExtractor())
-            self.register("application/xml", XmlExtractor())
-            self.register("text/html", HtmlExtractor())
-            self.register("application/pdf", PdfExtractor())
-            self.register("application/vnd.openxmlformats-officedocument.wordprocessingml.document", DocxExtractor())
-            self.register("application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", XlsxExtractor())
-            self.register("application/vnd.openxmlformats-officedocument.presentationml.presentation", PptxExtractor())
-            self.register("application/vnd.ms-powerpoint", PptxExtractor())
-            # images
-            self.register("image/jpeg", ImageExtractor())
-            self.register("image/png", ImageExtractor())
-            self.register("image/gif", ImageExtractor())
-            # extension fallbacks
-            self.register("txt", TextExtractor())
-            self.register("md", TextExtractor())
-            self.register("csv", CsvExtractor())
-            self.register("json", JsonExtractor())
-            self.register("xml", XmlExtractor())
-            self.register("html", HtmlExtractor())
-            self.register("htm", HtmlExtractor())
-            self.register("pdf", PdfExtractor())
-            self.register("docx", DocxExtractor())
-            self.register("xlsx", XlsxExtractor())
-            self.register("xlsm", XlsxExtractor())
-            self.register("pptx", PptxExtractor())
-            self.register("ppt", PptxExtractor())
-            # fallback
-            self.setFallback(BinaryExtractor())
-            logger.info(f"ExtractorRegistry: Successfully registered {len(self._map)} extractors")
+            import os
+            import importlib
+            from pathlib import Path
+            
+            # Get the extractors directory
+            current_dir = Path(__file__).parent
+            extractors_dir = current_dir / "extractors"
+            
+            if not extractors_dir.exists():
+                logger.error(f"Extractors directory not found: {extractors_dir}")
+                return
+            
+            # Import all extractor modules
+            extractor_modules = []
+            for file_path in extractors_dir.glob("extractor*.py"):
+                if file_path.name == "__init__.py":
+                    continue
+                
+                module_name = file_path.stem
+                try:
+                    # Import the module
+                    module = importlib.import_module(f".{module_name}", package="modules.services.serviceExtraction.extractors")
+                    
+                    # Find all extractor classes in the module
+                    for attr_name in dir(module):
+                        attr = getattr(module, attr_name)
+                        if (isinstance(attr, type) and 
+                            issubclass(attr, Extractor) and 
+                            attr != Extractor and
+                            not attr_name.startswith('_')):
+                            
+                            # Create instance and auto-register
+                            extractor_instance = attr()
+                            self._auto_register_extractor(extractor_instance)
+                            extractor_modules.append(attr_name)
+                            
+                except Exception as e:
+                    logger.warning(f"Failed to import {module_name}: {str(e)}")
+                    continue
+            
+            # Set fallback extractor
+            try:
+                from .extractors.extractorBinary import BinaryExtractor
+                self.setFallback(BinaryExtractor())
+            except Exception as e:
+                logger.warning(f"Failed to set fallback extractor: {str(e)}")
+            
+            logger.info(f"ExtractorRegistry: Auto-discovered and registered {len(extractor_modules)} extractor classes: {', '.join(extractor_modules)}")
+            logger.info(f"ExtractorRegistry: Total registered formats: {len(self._map)}")
+            
        except Exception as e:
-            logger.error(f"ExtractorRegistry: Failed to register extractors: {str(e)}")
+            logger.error(f"ExtractorRegistry: Failed to auto-discover extractors: {str(e)}")
            import traceback
            traceback.print_exc()
    
+    def _auto_register_extractor(self, extractor: Extractor):
+        """Auto-register an extractor based on its declared supported formats."""
+        try:
+            # Register MIME types
+            mime_types = extractor.getSupportedMimeTypes()
+            for mime_type in mime_types:
+                self.register(mime_type, extractor)
+                logger.debug(f"Registered MIME type: {mime_type} → {extractor.__class__.__name__}")
+            
+            # Register file extensions
+            extensions = extractor.getSupportedExtensions()
+            for ext in extensions:
+                # Remove leading dot for registry key
+                ext_key = ext.lstrip('.')
+                self.register(ext_key, extractor)
+                logger.debug(f"Registered extension: .{ext_key} → {extractor.__class__.__name__}")
+                
+        except Exception as e:
+            logger.error(f"Failed to auto-register {extractor.__class__.__name__}: {str(e)}")
+
    def register(self, key: str, extractor: Extractor):
        self._map[key] = extractor

@ -89,6 +139,43 @@ class ExtractorRegistry:
                return self._map[ext]
        return self._fallback
    
+    def getAllSupportedFormats(self) -> Dict[str, Dict[str, list[str]]]:
+        """
+        Get all supported formats from all registered extractors.
+        
+        Returns:
+            Dictionary with format information:
+            {
+                "extensions": {
+                    "extractor_name": [".ext1", ".ext2", ...]
+                },
+                "mime_types": {
+                    "extractor_name": ["mime/type1", "mime/type2", ...]
+                }
+            }
+        """
+        formats = {"extensions": {}, "mime_types": {}}
+        
+        # Get formats from registered extractors
+        for key, extractor in self._map.items():
+            if hasattr(extractor, 'getSupportedExtensions'):
+                extensions = extractor.getSupportedExtensions()
+                if extensions:
+                    formats["extensions"][key] = extensions
+            
+            if hasattr(extractor, 'getSupportedMimeTypes'):
+                mime_types = extractor.getSupportedMimeTypes()
+                if mime_types:
+                    formats["mime_types"][key] = mime_types
+        
+        # Add fallback extractor info
+        if self._fallback and hasattr(self._fallback, 'getSupportedExtensions'):
+            formats["extensions"]["fallback"] = self._fallback.getSupportedExtensions()
+        if self._fallback and hasattr(self._fallback, 'getSupportedMimeTypes'):
+            formats["mime_types"]["fallback"] = self._fallback.getSupportedMimeTypes()
+        
+        return formats
+

 class ChunkerRegistry:
    def __init__(self):
--- a/test_document_processing.py
+++ b/test_document_processing.py
@ -51,7 +51,24 @@ async def process_documents_and_generate_summary():
                return False
    
    # Find all supported document files
-    supported_extensions = ["*.pdf", "*.jpg", "*.jpeg", "*.png", "*.gif", "*.docx", "*.xlsx", "*.pptx", "*.ppt", "*.txt", "*.md", "*.html", "*.csv"]
+    supported_extensions = [
+        # Document formats
+        "*.pdf", "*.docx", "*.xlsx", "*.pptx", "*.ppt",
+        # Image formats
+        "*.jpg", "*.jpeg", "*.png", "*.gif", "*.webp", "*.bmp", "*.tiff",
+        # Text and code files
+        "*.txt", "*.md", "*.log", "*.rtf", "*.tex", "*.rst", "*.adoc", "*.org", "*.pod",
+        "*.java", "*.js", "*.jsx", "*.ts", "*.tsx", "*.py", "*.rb", "*.go", "*.rs", "*.cpp", "*.c", "*.h", "*.hpp", "*.cc", "*.cxx",
+        "*.cs", "*.php", "*.swift", "*.kt", "*.scala", "*.clj", "*.hs", "*.ml", "*.fs", "*.vb", "*.dart", "*.r", "*.m", "*.pl", "*.sh",
+        "*.html", "*.htm", "*.css", "*.scss", "*.sass", "*.less", "*.vue", "*.svelte",
+        "*.config", "*.ini", "*.cfg", "*.conf", "*.properties", "*.yaml", "*.yml", "*.toml", "*.json", "*.xml",
+        "*.bat", "*.ps1", "*.psm1", "*.psd1", "*.vbs", "*.wsf", "*.cmd", "*.com",
+        "*.csv", "*.tsv", "*.tab", "*.dat", "*.data",
+        "*.man", "*.1", "*.2", "*.3", "*.4", "*.5", "*.6", "*.7", "*.8", "*.9", "*.n", "*.l", "*.m", "*.r", "*.t", "*.x", "*.y", "*.z",
+        "*.diff", "*.patch", "*.gitignore", "*.dockerignore", "*.editorconfig", "*.gitattributes",
+        "*.env", "*.env.local", "*.env.development", "*.env.production", "*.env.test",
+        "*.lock", "*.lockb", "*.lockfile", "*.pkg-lock", "*.yarn-lock"
+    ]
    document_files = []
    for ext in supported_extensions:
        document_files.extend(list(testdata_path.glob(ext)))
@ -164,6 +181,8 @@ async def process_documents_and_generate_summary():
                mime_type = "text/html"
            elif doc_file.suffix.lower() == '.csv':
                mime_type = "text/csv"
+            elif doc_file.suffix.lower() == '.json':
+                mime_type = "application/json"
            elif doc_file.suffix.lower() in ['.txt', '.md']:
                mime_type = "text/plain"
            
@ -199,7 +218,7 @@ async def process_documents_and_generate_summary():
        # Run a single end-to-end test to avoid the loop issue
        logger.info("🧪 Running single end-to-end test...")
        
-        userPrompt = "Analyze these documents and create a comprehensive summary for all input documents, each input document in a separate chapter summarized in 10-20 sentences."
+        userPrompt = "Analyze the document containing mails for customer use cases. Can you create one file for each email in plain text format?"

        # userPrompt = "Analyze these documents and create a fitting image for the content"

@ -215,8 +234,8 @@ async def process_documents_and_generate_summary():
                prompt=userPrompt,
                documents=documents,
                options=ai_options,
-                outputFormat="docx",
-                title="Formulaire"
+                outputFormat="txt",
+                title="Kunden und Use Cases"
            )
            
            logger.info(f"✅ End-to-end test completed successfully")
--- a/test_extractor_formats.py
+++ b/test_extractor_formats.py
@ -0,0 +1,117 @@
+#!/usr/bin/env python3
+"""
+Test script to demonstrate enhanced extractor format support.
+Shows all supported file extensions and MIME types for each extractor.
+"""
+
+import sys
+import os
+from pathlib import Path
+
+# Add the gateway module to the path
+sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'modules'))
+
+from modules.services.serviceExtraction.subRegistry import ExtractorRegistry
+
+def test_extractor_formats():
+    """Test and display all supported formats from extractors."""
+    print("🔍 Testing Plug-and-Play Extractor System")
+    print("=" * 60)
+    
+    # Create registry
+    registry = ExtractorRegistry()
+    
+    # Get all supported formats
+    formats = registry.getAllSupportedFormats()
+    
+    print("\n📋 Supported File Extensions by Extractor:")
+    print("-" * 50)
+    for extractor_name, extensions in formats["extensions"].items():
+        if extensions:
+            print(f"  {extractor_name:20} → {', '.join(extensions)}")
+        else:
+            print(f"  {extractor_name:20} → (all extensions - fallback)")
+    
+    print("\n📋 Supported MIME Types by Extractor:")
+    print("-" * 50)
+    for extractor_name, mime_types in formats["mime_types"].items():
+        if mime_types:
+            print(f"  {extractor_name:20} → {', '.join(mime_types)}")
+        else:
+            print(f"  {extractor_name:20} → (all MIME types - fallback)")
+    
+    # Test individual extractors
+    print("\n🧪 Testing Individual Extractors:")
+    print("-" * 50)
+    
+    # Get all registered extractors
+    for key, extractor in registry._map.items():
+        if hasattr(extractor, 'getSupportedExtensions') and hasattr(extractor, 'getSupportedMimeTypes'):
+            extensions = extractor.getSupportedExtensions()
+            mime_types = extractor.getSupportedMimeTypes()
+            print(f"\n  {extractor.__class__.__name__}:")
+            print(f"    Extensions: {extensions}")
+            print(f"    MIME Types: {mime_types}")
+    
+    # Test detection with various file types
+    print("\n🔬 Testing File Detection:")
+    print("-" * 50)
+    
+    test_files = [
+        # Document formats
+        ("document.pdf", "application/pdf"),
+        ("spreadsheet.xlsx", "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"),
+        ("presentation.pptx", "application/vnd.openxmlformats-officedocument.presentationml.presentation"),
+        ("document.docx", "application/vnd.openxmlformats-officedocument.wordprocessingml.document"),
+        
+        # Text and code files
+        ("readme.txt", "text/plain"),
+        ("readme.md", "text/markdown"),
+        ("app.log", "text/plain"),
+        ("Main.java", "text/x-java-source"),
+        ("script.js", "text/javascript"),
+        ("component.tsx", "text/typescript"),
+        ("main.py", "text/x-python"),
+        ("config.yaml", "text/x-yaml"),
+        ("package.json", "application/json"),
+        ("data.csv", "text/csv"),
+        ("config.xml", "application/xml"),
+        ("webpage.html", "text/html"),
+        ("styles.css", "text/css"),
+        ("script.sh", "text/x-sh"),
+        ("Dockerfile", "text/plain"),
+        (".gitignore", "text/plain"),
+        ("app.config", "text/plain"),
+        ("database.sql", "text/x-sql"),
+        ("schema.ddl", "application/sql"),
+        
+        # Images
+        ("image.png", "image/png"),
+        ("photo.jpg", "image/jpeg"),
+        
+        # Unknown
+        ("unknown.xyz", "application/octet-stream")
+    ]
+    
+    for filename, mime_type in test_files:
+        extractor = registry.resolve(mime_type, filename)
+        if extractor:
+            print(f"  {filename:25} ({mime_type:50}) → {extractor.__class__.__name__}")
+        else:
+            print(f"  {filename:25} ({mime_type:50}) → No extractor found")
+    
+    print("\n✅ Plug-and-Play extractor system test completed!")
+    print("\nKey improvements:")
+    print("  • 🔌 TRUE PLUG-AND-PLAY: Just add extractor file, it auto-registers!")
+    print("  • 📋 No more manual registration of file types")
+    print("  • 🔍 Auto-discovery scans extractors directory")
+    print("  • 📝 Each extractor declares its own supported formats")
+    print("  • 🚀 Easy to add new file types - just create new extractor")
+    print("  • 🧹 Clean, maintainable code with no redundancy")
+    print("\nTo add a new file type:")
+    print("  1. Create extractorXyz.py in extractors/ directory")
+    print("  2. Implement Extractor interface with getSupportedExtensions()")
+    print("  3. That's it! No registry changes needed!")
+
+if __name__ == "__main__":
+    test_extractor_formats()