AI system tested for all file types

2025-10-13 22:03:28 +02:00 · 2025-10-13 22:03:28 +02:00 · 0c357dc8a9
commit 0c357dc8a9
parent 2e471ca3f7
15 changed files with 588 additions and 52 deletions
--- a/modules/services/serviceExtraction/extractors/extractorBinary.py
+++ b/modules/services/serviceExtraction/extractors/extractorBinary.py
@ -7,8 +7,28 @@ from ..subRegistry import Extractor
 class BinaryExtractor(Extractor):
    """
    Fallback extractor for unsupported file types.
    This extractor handles any file type that doesn't match other extractors.
    It encodes the file as base64 and marks it as binary data.
    Supported formats:
    - All file types (fallback)
    - MIME types: application/octet-stream (default)
    - File extensions: All (fallback)
    """
    def detect(self, fileName: str, mimeType: str, headBytes: bytes) -> bool:
        """Accept every file: this extractor is the catch-all fallback."""
        # None of the arguments are inspected; the answer is unconditional.
        return True
    def getSupportedExtensions(self) -> list[str]:
        """Declare no specific extensions; the fallback is not tied to any suffix."""
        # Empty on purpose: detect() accepts every file, so nothing is listed here.
        declared: list[str] = []
        return declared
    def getSupportedMimeTypes(self) -> list[str]:
        """Declare no specific MIME types; the fallback is not tied to any type."""
        # Empty on purpose: detect() accepts every file, so nothing is listed here.
        declared: list[str] = []
        return declared
    def extract(self, fileBytes: bytes, context: Dict[str, Any]) -> List[ContentPart]:
        mimeType = context.get("mimeType") or "application/octet-stream"
--- a/modules/services/serviceExtraction/extractors/extractorCsv.py
+++ b/modules/services/serviceExtraction/extractors/extractorCsv.py
@ -6,8 +6,25 @@ from ..subRegistry import Extractor
 class CsvExtractor(Extractor):
    """
    Extractor for CSV files.
    Supported formats:
    - MIME types: text/csv
    - File extensions: .csv
    - Special handling: Treats as table data
    """
    def detect(self, fileName: str, mimeType: str, headBytes: bytes) -> bool:
        """Return True for the text/csv MIME type or a file name ending in .csv (case-insensitive)."""
        if mimeType == "text/csv":
            return True
        # A missing file name is treated as an empty string.
        lowered_name = (fileName or "").lower()
        return lowered_name.endswith(".csv")
    def getSupportedExtensions(self) -> list[str]:
        """List the file extensions (with leading dot) this extractor declares."""
        extensions = [".csv"]
        return extensions
    def getSupportedMimeTypes(self) -> list[str]:
        """List the MIME types this extractor declares."""
        mime_types = ["text/csv"]
        return mime_types
    def extract(self, fileBytes: bytes, context: Dict[str, Any]) -> List[ContentPart]:
        fileName = context.get("fileName")
--- a/modules/services/serviceExtraction/extractors/extractorDocx.py
+++ b/modules/services/serviceExtraction/extractors/extractorDocx.py
@ -7,6 +7,16 @@ from ..subRegistry import Extractor
 class DocxExtractor(Extractor):
    """
    Extractor for Microsoft Word documents.
    Supported formats:
    - MIME types: application/vnd.openxmlformats-officedocument.wordprocessingml.document
    - File extensions: .docx
    - Special handling: Extracts paragraphs and tables (converts tables to CSV)
    - Dependencies: python-docx
    """
    def __init__(self):
        self._loaded = False
        self._haveLibs = False
@ -24,6 +34,14 @@ class DocxExtractor(Extractor):
    def detect(self, fileName: str, mimeType: str, headBytes: bytes) -> bool:
        """Return True for the Word (OOXML) MIME type or a file name ending in .docx (case-insensitive)."""
        docx_mime = "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
        if mimeType == docx_mime:
            return True
        # A missing file name is treated as an empty string.
        lowered_name = (fileName or "").lower()
        return lowered_name.endswith(".docx")
    def getSupportedExtensions(self) -> list[str]:
        """List the file extensions (with leading dot) this extractor declares."""
        extensions = [".docx"]
        return extensions
    def getSupportedMimeTypes(self) -> list[str]:
        """List the MIME types this extractor declares."""
        mime_types = ["application/vnd.openxmlformats-officedocument.wordprocessingml.document"]
        return mime_types
    def extract(self, fileBytes: bytes, context: Dict[str, Any]) -> List[ContentPart]:
        self._load()
--- a/modules/services/serviceExtraction/extractors/extractorHtml.py
+++ b/modules/services/serviceExtraction/extractors/extractorHtml.py
@ -7,8 +7,26 @@ from ..subRegistry import Extractor
 class HtmlExtractor(Extractor):
    """
    Extractor for HTML files.
    Supported formats:
    - MIME types: text/html
    - File extensions: .html, .htm
    - Special handling: Uses BeautifulSoup for parsing
    - Dependencies: beautifulsoup4
    """
    def detect(self, fileName: str, mimeType: str, headBytes: bytes) -> bool:
        """Return True for the text/html MIME type or a file name ending in .html/.htm (case-insensitive)."""
        if mimeType == "text/html":
            return True
        # A missing file name is treated as an empty string.
        lowered_name = (fileName or "").lower()
        return lowered_name.endswith((".html", ".htm"))
    def getSupportedExtensions(self) -> list[str]:
        """List the file extensions (with leading dot) this extractor declares."""
        extensions = [".html", ".htm"]
        return extensions
    def getSupportedMimeTypes(self) -> list[str]:
        """List the MIME types this extractor declares."""
        mime_types = ["text/html"]
        return mime_types
    def extract(self, fileBytes: bytes, context: Dict[str, Any]) -> List[ContentPart]:
        mimeType = context.get("mimeType") or "text/html"
--- a/modules/services/serviceExtraction/extractors/extractorImage.py
+++ b/modules/services/serviceExtraction/extractors/extractorImage.py
@ -10,8 +10,26 @@ logger = logging.getLogger(__name__)
 class ImageExtractor(Extractor):
    """
    Extractor for image files.
    Supported formats:
    - MIME types: image/jpeg, image/png, image/gif, image/webp, image/bmp, image/tiff
    - File extensions: .jpg, .jpeg, .png, .gif, .webp, .bmp, .tiff
    - Special handling: GIF files are converted to PNG during extraction
    """
    def detect(self, fileName: str, mimeType: str, headBytes: bytes) -> bool:
-        return (mimeType or "").startswith("image/")
+        return ((mimeType or "").startswith("image/") or 
                (fileName or "").lower().endswith((".jpg", ".jpeg", ".png", ".gif", ".webp", ".bmp", ".tiff")))
    def getSupportedExtensions(self) -> list[str]:
        """List the image file extensions (with leading dot) this extractor declares."""
        extensions = [
            ".jpg", ".jpeg",
            ".png",
            ".gif",
            ".webp",
            ".bmp",
            ".tiff",
        ]
        return extensions
    def getSupportedMimeTypes(self) -> list[str]:
        """List the image MIME types this extractor declares."""
        mime_types = [
            "image/jpeg",
            "image/png",
            "image/gif",
            "image/webp",
            "image/bmp",
            "image/tiff",
        ]
        return mime_types
    def extract(self, fileBytes: bytes, context: Dict[str, Any]) -> List[ContentPart]:
        mimeType = context.get("mimeType") or "image/unknown"
--- a/modules/services/serviceExtraction/extractors/extractorJson.py
+++ b/modules/services/serviceExtraction/extractors/extractorJson.py
@ -7,8 +7,25 @@ from ..subRegistry import Extractor
 class JsonExtractor(Extractor):
    """
    Extractor for JSON files.
    Supported formats:
    - MIME types: application/json
    - File extensions: .json
    - Special handling: Validates JSON format, falls back to text if invalid
    """
    def detect(self, fileName: str, mimeType: str, headBytes: bytes) -> bool:
        """Return True for the application/json MIME type or a file name ending in .json (case-insensitive)."""
        if mimeType == "application/json":
            return True
        # A missing file name is treated as an empty string.
        lowered_name = (fileName or "").lower()
        return lowered_name.endswith(".json")
    def getSupportedExtensions(self) -> list[str]:
        """List the file extensions (with leading dot) this extractor declares."""
        extensions = [".json"]
        return extensions
    def getSupportedMimeTypes(self) -> list[str]:
        """List the MIME types this extractor declares."""
        mime_types = ["application/json"]
        return mime_types
    def extract(self, fileBytes: bytes, context: Dict[str, Any]) -> List[ContentPart]:
        mimeType = context.get("mimeType") or "application/json"
--- a/modules/services/serviceExtraction/extractors/extractorPdf.py
+++ b/modules/services/serviceExtraction/extractors/extractorPdf.py
@ -8,6 +8,16 @@ from ..subRegistry import Extractor
 class PdfExtractor(Extractor):
    """
    Extractor for PDF files.
    Supported formats:
    - MIME types: application/pdf
    - File extensions: .pdf
    - Special handling: Extracts text per page and embedded images
    - Dependencies: PyPDF2, PyMuPDF (fitz)
    """
    def __init__(self):
        self._loaded = False
        self._haveLibs = False
@ -26,6 +36,14 @@ class PdfExtractor(Extractor):
    def detect(self, fileName: str, mimeType: str, headBytes: bytes) -> bool:
        """Return True for the application/pdf MIME type or a file name ending in .pdf (case-insensitive)."""
        if mimeType == "application/pdf":
            return True
        # A missing file name is treated as an empty string.
        lowered_name = (fileName or "").lower()
        return lowered_name.endswith(".pdf")
    def getSupportedExtensions(self) -> list[str]:
        """List the file extensions (with leading dot) this extractor declares."""
        extensions = [".pdf"]
        return extensions
    def getSupportedMimeTypes(self) -> list[str]:
        """List the MIME types this extractor declares."""
        mime_types = ["application/pdf"]
        return mime_types
    def extract(self, fileBytes: bytes, context: Dict[str, Any]) -> List[ContentPart]:
        self._load()
--- a/modules/services/serviceExtraction/extractors/extractorPptx.py
+++ b/modules/services/serviceExtraction/extractors/extractorPptx.py
@ -8,7 +8,15 @@ logger = logging.getLogger(__name__)
 class PptxExtractor(Extractor):
-    """Extractor for PowerPoint (.pptx) files using python-pptx library."""
+    """
    Extractor for PowerPoint files.
    Supported formats:
    - MIME types: application/vnd.openxmlformats-officedocument.presentationml.presentation, application/vnd.ms-powerpoint
    - File extensions: .pptx, .ppt
    - Special handling: Extracts slide content, tables, and images
    - Dependencies: python-pptx
    """
    def __init__(self):
        self._loaded = False
@ -31,6 +39,17 @@ class PptxExtractor(Extractor):
            "application/vnd.ms-powerpoint"
        ]) or (fileName or "").lower().endswith((".pptx", ".ppt"))
    def getSupportedExtensions(self) -> list[str]:
        """List the PowerPoint file extensions (with leading dot) this extractor declares."""
        extensions = [".pptx", ".ppt"]
        return extensions
    def getSupportedMimeTypes(self) -> list[str]:
        """List the PowerPoint MIME types this extractor declares (OOXML first, then legacy)."""
        ooxml_mime = "application/vnd.openxmlformats-officedocument.presentationml.presentation"
        legacy_mime = "application/vnd.ms-powerpoint"
        return [ooxml_mime, legacy_mime]
    def extract(self, fileBytes: bytes, context: Dict[str, Any]) -> List[ContentPart]:
        """
        Extract content from PowerPoint files.
--- a/modules/services/serviceExtraction/extractors/extractorSql.py
+++ b/modules/services/serviceExtraction/extractors/extractorSql.py
@ -0,0 +1,56 @@
 from typing import Any, Dict, List
 from modules.datamodels.datamodelExtraction import ContentPart
 from ..subUtils import makeId
 from ..subRegistry import Extractor
 class SqlExtractor(Extractor):
    """
    Extractor for SQL files.
    Supported formats:
    - MIME types: text/x-sql, application/sql
    - File extensions: .sql, .ddl, .dml, .dcl, .tcl
    - Special handling: Treats as structured text with SQL syntax

    The whole file is returned as a single "structure" content part whose
    metadata carries the byte size, the line count and one ``has_<keyword>``
    flag per statement keyword in ``_KEYWORDS``.
    """

    # Statement keywords probed for in the file text; each one produces a
    # "has_<keyword>" metadata flag, in this order.
    _KEYWORDS = ("SELECT", "INSERT", "UPDATE", "DELETE", "CREATE", "DROP")

    # NOTE(review): ".tcl" is also the usual suffix of Tcl scripts — confirm
    # that routing those files to this extractor is intended.
    _EXTENSIONS = (".sql", ".ddl", ".dml", ".dcl", ".tcl")
    _MIME_TYPES = ("text/x-sql", "application/sql")

    def detect(self, fileName: str, mimeType: str, headBytes: bytes) -> bool:
        """Return True for a declared SQL MIME type or file extension (case-insensitive)."""
        if mimeType in self._MIME_TYPES:
            return True
        # A missing file name is treated as an empty string.
        return (fileName or "").lower().endswith(self._EXTENSIONS)

    def getSupportedExtensions(self) -> list[str]:
        """Return list of supported file extensions."""
        return list(self._EXTENSIONS)

    def getSupportedMimeTypes(self) -> list[str]:
        """Return list of supported MIME types."""
        return list(self._MIME_TYPES)

    def extract(self, fileBytes: bytes, context: Dict[str, Any]) -> List[ContentPart]:
        """
        Decode the file as UTF-8 text and wrap it in one content part.

        Args:
            fileBytes: Raw file content. Undecodable bytes are replaced rather
                than raising.
            context: Extraction context; only "mimeType" is used here
                (defaults to "text/x-sql" when missing or empty).

        Returns:
            A single-element list with the decoded text and its metadata.
        """
        mimeType = context.get("mimeType") or "text/x-sql"
        data = fileBytes.decode("utf-8", errors="replace")

        # Upper-case the text once instead of once per keyword.
        upperData = data.upper()

        metadata = {
            "size": len(fileBytes),
            "file_type": "sql",
            "line_count": len(data.splitlines()),
        }
        # Plain substring test: a keyword inside a longer identifier
        # (e.g. "updated_at") also sets its flag.
        for keyword in self._KEYWORDS:
            metadata[f"has_{keyword.lower()}"] = keyword in upperData

        return [ContentPart(
            id=makeId(),
            parentId=None,
            label="main",
            typeGroup="structure",
            mimeType=mimeType,
            data=data,
            metadata=metadata
        )]
--- a/modules/services/serviceExtraction/extractors/extractorText.py
+++ b/modules/services/serviceExtraction/extractors/extractorText.py
@ -6,8 +6,85 @@ from ..subRegistry import Extractor
 class TextExtractor(Extractor):
    """
    Extractor for plain text files and code files.
    Supported formats:
    - MIME types: text/plain, text/markdown, text/x-python, text/x-java-source, text/javascript, etc.
    - File extensions: .txt, .md, .log, .java, .js, .jsx, .ts, .tsx, .py, .config, .ini, .cfg, .conf, .properties, .yaml, .yml, .toml, .sh, .bat, .ps1, .sql, .css, .scss, .sass, .less, .xml, .json, .csv, .tsv, .rtf, .tex, .rst, .adoc, .org, .pod, .man, .1, .2, .3, .4, .5, .6, .7, .8, .9, .n, .l, .m, .r, .t, .x, .y, .z
    """
    def detect(self, fileName: str, mimeType: str, headBytes: bytes) -> bool:
-        return mimeType in ("text/plain", "text/markdown")
+        # Check MIME types
        if mimeType and mimeType.startswith("text/"):
            return True
        # Check file extensions
        if fileName:
            ext = fileName.lower()
            return ext.endswith((
                # Basic text files
                ".txt", ".md", ".log", ".rtf", ".tex", ".rst", ".adoc", ".org", ".pod",
                # Programming languages
                ".java", ".js", ".jsx", ".ts", ".tsx", ".py", ".rb", ".go", ".rs", ".cpp", ".c", ".h", ".hpp", ".cc", ".cxx",
                ".cs", ".php", ".swift", ".kt", ".scala", ".clj", ".hs", ".ml", ".fs", ".vb", ".dart", ".r", ".m", ".pl", ".sh",
                # Web technologies
                ".html", ".htm", ".css", ".scss", ".sass", ".less", ".vue", ".svelte",
                # Configuration files
                ".config", ".ini", ".cfg", ".conf", ".properties", ".yaml", ".yml", ".toml", ".json", ".xml",
                # Scripts and automation
                ".bat", ".ps1", ".psm1", ".psd1", ".vbs", ".wsf", ".cmd", ".com",
                # Data files
                ".csv", ".tsv", ".tab", ".dat", ".data",
                # Documentation
                ".man", ".1", ".2", ".3", ".4", ".5", ".6", ".7", ".8", ".9", ".n", ".l", ".m", ".r", ".t", ".x", ".y", ".z",
                # Other text formats
                ".diff", ".patch", ".gitignore", ".dockerignore", ".editorconfig", ".gitattributes",
                ".env", ".env.local", ".env.development", ".env.production", ".env.test",
                ".lock", ".lockb", ".lockfile", ".pkg-lock", ".yarn-lock"
            ))
        return False
    def getSupportedExtensions(self) -> list[str]:
        """
        Return the file extensions (with leading dot) declared by this extractor.

        Every extension is listed exactly once; ".m" and ".r" are listed only
        under programming languages, not again under the documentation
        (man-page style) suffixes.
        """
        return [
            # Basic text files
            ".txt", ".md", ".log", ".rtf", ".tex", ".rst", ".adoc", ".org", ".pod",
            # Programming languages
            ".java", ".js", ".jsx", ".ts", ".tsx", ".py", ".rb", ".go", ".rs", ".cpp", ".c", ".h", ".hpp", ".cc", ".cxx",
            ".cs", ".php", ".swift", ".kt", ".scala", ".clj", ".hs", ".ml", ".fs", ".vb", ".dart", ".r", ".m", ".pl", ".sh",
            # Web technologies
            ".html", ".htm", ".css", ".scss", ".sass", ".less", ".vue", ".svelte",
            # Configuration files
            ".config", ".ini", ".cfg", ".conf", ".properties", ".yaml", ".yml", ".toml", ".json", ".xml",
            # Scripts and automation
            ".bat", ".ps1", ".psm1", ".psd1", ".vbs", ".wsf", ".cmd", ".com",
            # Data files
            ".csv", ".tsv", ".tab", ".dat", ".data",
            # Documentation (".m" and ".r" already appear above)
            ".man", ".1", ".2", ".3", ".4", ".5", ".6", ".7", ".8", ".9", ".n", ".l", ".t", ".x", ".y", ".z",
            # Other text formats
            ".diff", ".patch", ".gitignore", ".dockerignore", ".editorconfig", ".gitattributes",
            ".env", ".env.local", ".env.development", ".env.production", ".env.test",
            ".lock", ".lockb", ".lockfile", ".pkg-lock", ".yarn-lock"
        ]
    def getSupportedMimeTypes(self) -> list[str]:
        """List the MIME types this extractor declares, grouped by kind."""
        mime_types = [
            # Plain text and markup
            "text/plain", "text/markdown",
            # Source code
            "text/x-python", "text/x-java-source",
            "text/javascript", "text/x-javascript", "text/typescript", "text/x-typescript",
            "text/x-c", "text/x-c++", "text/x-csharp", "text/x-php", "text/x-ruby",
            "text/x-go", "text/x-rust", "text/x-scala", "text/x-swift", "text/x-kotlin",
            "text/x-sql", "text/x-sh", "text/x-shellscript",
            # Configuration and logs
            "text/x-yaml", "text/x-toml",
            "text/x-ini", "text/x-config", "text/x-properties", "text/x-log",
            # Web and stylesheets
            "text/html", "text/css", "text/x-scss", "text/x-sass", "text/x-less",
            # Data and documents
            "text/xml", "text/csv", "text/tab-separated-values", "text/rtf",
            "text/x-tex", "text/x-rst", "text/x-asciidoc", "text/x-org",
            # Non-text/* aliases
            "application/x-yaml", "application/x-toml", "application/x-ini",
            "application/x-config", "application/x-properties", "application/x-log",
        ]
        return mime_types
    def extract(self, fileBytes: bytes, context: Dict[str, Any]) -> List[ContentPart]:
        fileName = context.get("fileName")
--- a/modules/services/serviceExtraction/extractors/extractorXlsx.py
+++ b/modules/services/serviceExtraction/extractors/extractorXlsx.py
@ -8,6 +8,16 @@ from ..subRegistry import Extractor
 class XlsxExtractor(Extractor):
    """
    Extractor for Microsoft Excel spreadsheets.
    Supported formats:
    - MIME types: application/vnd.openxmlformats-officedocument.spreadsheetml.sheet
    - File extensions: .xlsx, .xlsm
    - Special handling: Extracts all sheets as CSV data
    - Dependencies: openpyxl
    """
    def __init__(self):
        self._loaded = False
        self._haveLibs = False
@ -26,6 +36,14 @@ class XlsxExtractor(Extractor):
    def detect(self, fileName: str, mimeType: str, headBytes: bytes) -> bool:
        """Return True for the Excel (OOXML) MIME type or a file name ending in .xlsx/.xlsm (case-insensitive)."""
        if mimeType == "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet":
            return True
        # A missing file name is treated as an empty string.
        lowered_name = (fileName or "").lower()
        return lowered_name.endswith((".xlsx", ".xlsm"))
    def getSupportedExtensions(self) -> list[str]:
        """List the file extensions (with leading dot) this extractor declares."""
        extensions = [".xlsx", ".xlsm"]
        return extensions
    def getSupportedMimeTypes(self) -> list[str]:
        """List the MIME types this extractor declares."""
        mime_types = ["application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"]
        return mime_types
    def extract(self, fileBytes: bytes, context: Dict[str, Any]) -> List[ContentPart]:
        self._load()
--- a/modules/services/serviceExtraction/extractors/extractorXml.py
+++ b/modules/services/serviceExtraction/extractors/extractorXml.py
@ -7,8 +7,25 @@ from ..subRegistry import Extractor
 class XmlExtractor(Extractor):
    """
    Extractor for XML files.
    Supported formats:
    - MIME types: application/xml
    - File extensions: .xml, .rss, .atom
    - Special handling: Uses ElementTree for parsing
    """
    def detect(self, fileName: str, mimeType: str, headBytes: bytes) -> bool:
        """Return True for the application/xml MIME type or a file name ending in .xml/.rss/.atom (case-insensitive)."""
        if mimeType == "application/xml":
            return True
        # A missing file name is treated as an empty string.
        lowered_name = (fileName or "").lower()
        return lowered_name.endswith((".xml", ".rss", ".atom"))
    def getSupportedExtensions(self) -> list[str]:
        """List the file extensions (with leading dot) this extractor declares."""
        extensions = [".xml", ".rss", ".atom"]
        return extensions
    def getSupportedMimeTypes(self) -> list[str]:
        """List the MIME types this extractor declares."""
        mime_types = ["application/xml"]
        return mime_types
    def extract(self, fileBytes: bytes, context: Dict[str, Any]) -> List[ContentPart]:
        mimeType = context.get("mimeType") or "application/xml"
--- a/modules/services/serviceExtraction/subRegistry.py
+++ b/modules/services/serviceExtraction/subRegistry.py
@ -7,11 +7,31 @@ logger = logging.getLogger(__name__)
 class Extractor:
    """
    Common interface for document extractors.

    The defaults defined here describe an extractor that matches nothing:
    detect() answers False and both format lists are empty. extract() has no
    default and raises NotImplementedError until a subclass overrides it.
    """

    def detect(self, fileName: str, mimeType: str, headBytes: bytes) -> bool:
        """Tell whether this extractor handles the given file; the base answer is False."""
        can_handle = False
        return can_handle

    def extract(self, fileBytes: bytes, context: Dict[str, Any]) -> list[ContentPart]:
        """Turn the raw file bytes into content parts; must be overridden."""
        raise NotImplementedError()

    def getSupportedExtensions(self) -> list[str]:
        """File extensions (leading dot included) this extractor declares; none by default."""
        extensions: list[str] = []
        return extensions

    def getSupportedMimeTypes(self) -> list[str]:
        """MIME types this extractor declares; none by default."""
        mime_types: list[str] = []
        return mime_types
 class Chunker:
@ -23,55 +43,85 @@ class ExtractorRegistry:
    def __init__(self):
        self._map: Dict[str, Extractor] = {}
        self._fallback: Optional[Extractor] = None
-        # Register built-ins
+        self._auto_discover_extractors()
    def _auto_discover_extractors(self):
        """Auto-discover and register all extractors from the extractors directory."""
        try:
-            from .extractors.extractorText import TextExtractor
+            import os
-            from .extractors.extractorCsv import CsvExtractor
+            import importlib
-            from .extractors.extractorJson import JsonExtractor
+            from pathlib import Path
-            from .extractors.extractorXml import XmlExtractor
+            
-            from .extractors.extractorHtml import HtmlExtractor
+            # Get the extractors directory
-            from .extractors.extractorPdf import PdfExtractor
+            current_dir = Path(__file__).parent
-            from .extractors.extractorDocx import DocxExtractor
+            extractors_dir = current_dir / "extractors"
-            from .extractors.extractorXlsx import XlsxExtractor
+            
-            from .extractors.extractorPptx import PptxExtractor
+            if not extractors_dir.exists():
-            from .extractors.extractorImage import ImageExtractor
+                logger.error(f"Extractors directory not found: {extractors_dir}")
-            from .extractors.extractorBinary import BinaryExtractor
+                return
-            self.register("text/plain", TextExtractor())
+            
-            self.register("text/markdown", TextExtractor())
+            # Import all extractor modules
-            self.register("text/csv", CsvExtractor())
+            extractor_modules = []
-            self.register("application/json", JsonExtractor())
+            for file_path in extractors_dir.glob("extractor*.py"):
-            self.register("application/xml", XmlExtractor())
+                if file_path.name == "__init__.py":
-            self.register("text/html", HtmlExtractor())
+                    continue
-            self.register("application/pdf", PdfExtractor())
+                
-            self.register("application/vnd.openxmlformats-officedocument.wordprocessingml.document", DocxExtractor())
+                module_name = file_path.stem
-            self.register("application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", XlsxExtractor())
+                try:
-            self.register("application/vnd.openxmlformats-officedocument.presentationml.presentation", PptxExtractor())
+                    # Import the module
-            self.register("application/vnd.ms-powerpoint", PptxExtractor())
+                    module = importlib.import_module(f".{module_name}", package="modules.services.serviceExtraction.extractors")
-            # images
+                    
-            self.register("image/jpeg", ImageExtractor())
+                    # Find all extractor classes in the module
-            self.register("image/png", ImageExtractor())
+                    for attr_name in dir(module):
-            self.register("image/gif", ImageExtractor())
+                        attr = getattr(module, attr_name)
-            # extension fallbacks
+                        if (isinstance(attr, type) and 
-            self.register("txt", TextExtractor())
+                            issubclass(attr, Extractor) and 
-            self.register("md", TextExtractor())
+                            attr != Extractor and
-            self.register("csv", CsvExtractor())
+                            not attr_name.startswith('_')):
-            self.register("json", JsonExtractor())
+                            
-            self.register("xml", XmlExtractor())
+                            # Create instance and auto-register
-            self.register("html", HtmlExtractor())
+                            extractor_instance = attr()
-            self.register("htm", HtmlExtractor())
+                            self._auto_register_extractor(extractor_instance)
-            self.register("pdf", PdfExtractor())
+                            extractor_modules.append(attr_name)
-            self.register("docx", DocxExtractor())
+                            
-            self.register("xlsx", XlsxExtractor())
+                except Exception as e:
-            self.register("xlsm", XlsxExtractor())
+                    logger.warning(f"Failed to import {module_name}: {str(e)}")
-            self.register("pptx", PptxExtractor())
+                    continue
-            self.register("ppt", PptxExtractor())
+            
-            # fallback
+            # Set fallback extractor
-            self.setFallback(BinaryExtractor())
+            try:
-            logger.info(f"ExtractorRegistry: Successfully registered {len(self._map)} extractors")
+                from .extractors.extractorBinary import BinaryExtractor
                self.setFallback(BinaryExtractor())
            except Exception as e:
                logger.warning(f"Failed to set fallback extractor: {str(e)}")
            logger.info(f"ExtractorRegistry: Auto-discovered and registered {len(extractor_modules)} extractor classes: {', '.join(extractor_modules)}")
            logger.info(f"ExtractorRegistry: Total registered formats: {len(self._map)}")
        except Exception as e:
-            logger.error(f"ExtractorRegistry: Failed to register extractors: {str(e)}")
+            logger.error(f"ExtractorRegistry: Failed to auto-discover extractors: {str(e)}")
            import traceback
            traceback.print_exc()
    def _auto_register_extractor(self, extractor: Extractor):
        """
        Register an extractor under every MIME type and extension it declares.

        Extension keys are stored without their leading dot(s), e.g. ".csv" is
        registered as "csv".

        Several extractors may declare the same key (for example a broad text
        extractor and a dedicated CSV extractor both declaring ".csv"). In that
        case the extractor declaring fewer formats in total keeps the key, and
        on a tie the one registered first keeps it. This makes the result
        independent of the order in which extractor modules were discovered.

        Args:
            extractor: Extractor instance providing getSupportedMimeTypes()
                and getSupportedExtensions().

        Errors are logged, not raised, so one faulty extractor does not abort
        registration of the others.
        """
        name = extractor.__class__.__name__
        try:
            mime_types = list(extractor.getSupportedMimeTypes())
            extensions = list(extractor.getSupportedExtensions())
            # Total number of declared formats: a smaller number means a more
            # specialised extractor.
            breadth = len(mime_types) + len(extensions)

            # (registry key, label used in the debug log)
            candidates = [(mime_type, f"MIME type: {mime_type}") for mime_type in mime_types]
            for ext in extensions:
                # Remove leading dot for registry key
                ext_key = ext.lstrip('.')
                candidates.append((ext_key, f"extension: .{ext_key}"))

            for key, label in candidates:
                current = self._map.get(key)
                if current is not None and current is not extractor:
                    current_breadth = (len(current.getSupportedMimeTypes())
                                       + len(current.getSupportedExtensions()))
                    if current_breadth <= breadth:
                        # The existing owner is at least as specialised; keep it.
                        logger.debug(
                            f"Kept {label} → {current.__class__.__name__} (not replaced by {name})"
                        )
                        continue
                self.register(key, extractor)
                logger.debug(f"Registered {label} → {name}")
        except Exception as e:
            # logger.exception records the traceback along with the message.
            logger.exception(f"Failed to auto-register {name}: {str(e)}")
    def register(self, key: str, extractor: Extractor):
        self._map[key] = extractor
@ -88,6 +138,43 @@ class ExtractorRegistry:
            if ext in self._map:
                return self._map[ext]
        return self._fallback
    def getAllSupportedFormats(self) -> Dict[str, Dict[str, list[str]]]:
        """
        Get all supported formats from all registered extractors.

        The result is keyed by extractor class name. An extractor registered
        under several MIME types/extensions therefore appears once, not once
        per registry key.

        Returns:
            Dictionary with format information:
            {
                "extensions": {
                    "extractor_name": [".ext1", ".ext2", ...]
                },
                "mime_types": {
                    "extractor_name": ["mime/type1", "mime/type2", ...]
                }
            }
            Extractors declaring an empty list are omitted from that section.
            If a fallback extractor is set, it is reported under the name
            "fallback" in both sections, even when its lists are empty.
        """
        formats: Dict[str, Dict[str, list[str]]] = {"extensions": {}, "mime_types": {}}

        # self._map maps each MIME type / extension key to an extractor, so the
        # same extractor shows up many times; report it under its class name.
        for extractor in self._map.values():
            name = extractor.__class__.__name__
            if hasattr(extractor, 'getSupportedExtensions'):
                extensions = extractor.getSupportedExtensions()
                if extensions:
                    formats["extensions"][name] = extensions
            if hasattr(extractor, 'getSupportedMimeTypes'):
                mime_types = extractor.getSupportedMimeTypes()
                if mime_types:
                    formats["mime_types"][name] = mime_types

        # Add fallback extractor info
        fallback = self._fallback
        if fallback and hasattr(fallback, 'getSupportedExtensions'):
            formats["extensions"]["fallback"] = fallback.getSupportedExtensions()
        if fallback and hasattr(fallback, 'getSupportedMimeTypes'):
            formats["mime_types"]["fallback"] = fallback.getSupportedMimeTypes()
        return formats
 class ChunkerRegistry:
--- a/test_document_processing.py
+++ b/test_document_processing.py
@ -51,7 +51,24 @@ async def process_documents_and_generate_summary():
                return False
    # Find all supported document files
-    supported_extensions = ["*.pdf", "*.jpg", "*.jpeg", "*.png", "*.gif", "*.docx", "*.xlsx", "*.pptx", "*.ppt", "*.txt", "*.md", "*.html", "*.csv"]
+    supported_extensions = [
        # Document formats
        "*.pdf", "*.docx", "*.xlsx", "*.pptx", "*.ppt",
        # Image formats
        "*.jpg", "*.jpeg", "*.png", "*.gif", "*.webp", "*.bmp", "*.tiff",
        # Text and code files
        "*.txt", "*.md", "*.log", "*.rtf", "*.tex", "*.rst", "*.adoc", "*.org", "*.pod",
        "*.java", "*.js", "*.jsx", "*.ts", "*.tsx", "*.py", "*.rb", "*.go", "*.rs", "*.cpp", "*.c", "*.h", "*.hpp", "*.cc", "*.cxx",
        "*.cs", "*.php", "*.swift", "*.kt", "*.scala", "*.clj", "*.hs", "*.ml", "*.fs", "*.vb", "*.dart", "*.r", "*.m", "*.pl", "*.sh",
        "*.html", "*.htm", "*.css", "*.scss", "*.sass", "*.less", "*.vue", "*.svelte",
        "*.config", "*.ini", "*.cfg", "*.conf", "*.properties", "*.yaml", "*.yml", "*.toml", "*.json", "*.xml",
        "*.bat", "*.ps1", "*.psm1", "*.psd1", "*.vbs", "*.wsf", "*.cmd", "*.com",
        "*.csv", "*.tsv", "*.tab", "*.dat", "*.data",
        "*.man", "*.1", "*.2", "*.3", "*.4", "*.5", "*.6", "*.7", "*.8", "*.9", "*.n", "*.l", "*.m", "*.r", "*.t", "*.x", "*.y", "*.z",
        "*.diff", "*.patch", "*.gitignore", "*.dockerignore", "*.editorconfig", "*.gitattributes",
        "*.env", "*.env.local", "*.env.development", "*.env.production", "*.env.test",
        "*.lock", "*.lockb", "*.lockfile", "*.pkg-lock", "*.yarn-lock"
    ]
    document_files = []
    for ext in supported_extensions:
        document_files.extend(list(testdata_path.glob(ext)))
@ -164,6 +181,8 @@ async def process_documents_and_generate_summary():
                mime_type = "text/html"
            elif doc_file.suffix.lower() == '.csv':
                mime_type = "text/csv"
            elif doc_file.suffix.lower() == '.json':
                mime_type = "application/json"
            elif doc_file.suffix.lower() in ['.txt', '.md']:
                mime_type = "text/plain"
@ -199,7 +218,7 @@ async def process_documents_and_generate_summary():
        # Run a single end-to-end test to avoid the loop issue
        logger.info("🧪 Running single end-to-end test...")
-        userPrompt = "Analyze these documents and create a comprehensive summary for all input documents, each input document in a separate chapter summarized in 10-20 sentences."
+        userPrompt = "Analyze the document containing mails for customer use cases. Can you create one file for each email in plain text format?"
        # userPrompt = "Analyze these documents and create a fitting image for the content"
@ -215,8 +234,8 @@ async def process_documents_and_generate_summary():
                prompt=userPrompt,
                documents=documents,
                options=ai_options,
-                outputFormat="docx",
+                outputFormat="txt",
-                title="Formulaire"
+                title="Kunden und Use Cases"
            )
            logger.info(f"✅ End-to-end test completed successfully")
--- a/test_extractor_formats.py
+++ b/test_extractor_formats.py
@ -0,0 +1,117 @@
 #!/usr/bin/env python3
 """
 Test script to demonstrate enhanced extractor format support.
 Shows all supported file extensions and MIME types for each extractor.
 """
 import sys
 import os
 from pathlib import Path
 # Add the gateway's 'modules' directory (located next to this script) to the import path
 sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'modules'))
 from modules.services.serviceExtraction.subRegistry import ExtractorRegistry
 def test_extractor_formats():
    """Print a console report of the formats known to the extractor registry.

    The report has four parts, printed in this order:

    1. File extensions per extractor, from ``getAllSupportedFormats()``.
    2. MIME types per extractor, from the same call.
    3. The raw extension / MIME-type lists of every registered extractor
       that exposes both ``getSupportedExtensions`` and
       ``getSupportedMimeTypes``.
    4. The extractor class that ``registry.resolve()`` returns for a fixed
       list of sample file names and MIME types.

    Nothing is returned and nothing is asserted; the function only prints.
    """
    thin_rule = "-" * 50

    def _print_format_table(heading, format_key, fallback_note):
        # One line per extractor; an empty list is shown as the fallback note.
        print(heading)
        print(thin_rule)
        for extractor_name, entries in formats[format_key].items():
            rendered = ', '.join(entries) if entries else fallback_note
            print(f"  {extractor_name:20} → {rendered}")

    print("🔍 Testing Plug-and-Play Extractor System")
    print("=" * 60)

    # Build the registry and ask it for everything it supports.
    registry = ExtractorRegistry()
    formats = registry.getAllSupportedFormats()

    _print_format_table(
        "\n📋 Supported File Extensions by Extractor:",
        "extensions",
        "(all extensions - fallback)",
    )
    _print_format_table(
        "\n📋 Supported MIME Types by Extractor:",
        "mime_types",
        "(all MIME types - fallback)",
    )

    # Per-extractor view: query each registered instance directly.
    print("\n🧪 Testing Individual Extractors:")
    print(thin_rule)
    for _key, candidate in registry._map.items():
        # Skip extractors that do not expose both format-listing methods.
        if not hasattr(candidate, 'getSupportedExtensions') or not hasattr(candidate, 'getSupportedMimeTypes'):
            continue
        declared_extensions = candidate.getSupportedExtensions()
        declared_mime_types = candidate.getSupportedMimeTypes()
        print(f"\n  {candidate.__class__.__name__}:")
        print(f"    Extensions: {declared_extensions}")
        print(f"    MIME Types: {declared_mime_types}")

    # Resolution view: which extractor is picked for sample (name, MIME) pairs.
    print("\n🔬 Testing File Detection:")
    print(thin_rule)
    detection_cases = [
        # Document formats
        ("document.pdf", "application/pdf"),
        ("spreadsheet.xlsx", "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"),
        ("presentation.pptx", "application/vnd.openxmlformats-officedocument.presentationml.presentation"),
        ("document.docx", "application/vnd.openxmlformats-officedocument.wordprocessingml.document"),
        # Text and code files
        ("readme.txt", "text/plain"),
        ("readme.md", "text/markdown"),
        ("app.log", "text/plain"),
        ("Main.java", "text/x-java-source"),
        ("script.js", "text/javascript"),
        ("component.tsx", "text/typescript"),
        ("main.py", "text/x-python"),
        ("config.yaml", "text/x-yaml"),
        ("package.json", "application/json"),
        ("data.csv", "text/csv"),
        ("config.xml", "application/xml"),
        ("webpage.html", "text/html"),
        ("styles.css", "text/css"),
        ("script.sh", "text/x-sh"),
        ("Dockerfile", "text/plain"),
        (".gitignore", "text/plain"),
        ("app.config", "text/plain"),
        ("database.sql", "text/x-sql"),
        ("schema.ddl", "application/sql"),
        # Images
        ("image.png", "image/png"),
        ("photo.jpg", "image/jpeg"),
        # Unknown
        ("unknown.xyz", "application/octet-stream"),
    ]
    for filename, mime_type in detection_cases:
        resolved = registry.resolve(mime_type, filename)
        outcome = resolved.__class__.__name__ if resolved else "No extractor found"
        print(f"  {filename:25} ({mime_type:50}) → {outcome}")

    # Closing summary, emitted one line per print call.
    closing_lines = (
        "\n✅ Plug-and-Play extractor system test completed!",
        "\nKey improvements:",
        "  • 🔌 TRUE PLUG-AND-PLAY: Just add extractor file, it auto-registers!",
        "  • 📋 No more manual registration of file types",
        "  • 🔍 Auto-discovery scans extractors directory",
        "  • 📝 Each extractor declares its own supported formats",
        "  • 🚀 Easy to add new file types - just create new extractor",
        "  • 🧹 Clean, maintainable code with no redundancy",
        "\nTo add a new file type:",
        "  1. Create extractorXyz.py in extractors/ directory",
        "  2. Implement Extractor interface with getSupportedExtensions()",
        "  3. That's it! No registry changes needed!",
    )
    for line in closing_lines:
        print(line)
 # Script entry point: print the extractor format report when this file is
 # executed directly rather than imported.
 if __name__ == "__main__":
    test_extractor_formats()