From 0c357dc8a9549d403915ac7c621992beb2be60c2 Mon Sep 17 00:00:00 2001 From: ValueOn AG Date: Mon, 13 Oct 2025 22:03:28 +0200 Subject: [PATCH] AI system tested for all file types --- .../extractors/extractorBinary.py | 20 ++ .../extractors/extractorCsv.py | 17 ++ .../extractors/extractorDocx.py | 18 ++ .../extractors/extractorHtml.py | 18 ++ .../extractors/extractorImage.py | 20 +- .../extractors/extractorJson.py | 17 ++ .../extractors/extractorPdf.py | 18 ++ .../extractors/extractorPptx.py | 21 ++- .../extractors/extractorSql.py | 56 ++++++ .../extractors/extractorText.py | 79 +++++++- .../extractors/extractorXlsx.py | 18 ++ .../extractors/extractorXml.py | 17 ++ .../services/serviceExtraction/subRegistry.py | 177 +++++++++++++----- test_document_processing.py | 27 ++- test_extractor_formats.py | 117 ++++++++++++ 15 files changed, 588 insertions(+), 52 deletions(-) create mode 100644 modules/services/serviceExtraction/extractors/extractorSql.py create mode 100644 test_extractor_formats.py diff --git a/modules/services/serviceExtraction/extractors/extractorBinary.py b/modules/services/serviceExtraction/extractors/extractorBinary.py index e6667fda..8a52986c 100644 --- a/modules/services/serviceExtraction/extractors/extractorBinary.py +++ b/modules/services/serviceExtraction/extractors/extractorBinary.py @@ -7,8 +7,28 @@ from ..subRegistry import Extractor class BinaryExtractor(Extractor): + """ + Fallback extractor for unsupported file types. + + This extractor handles any file type that doesn't match other extractors. + It encodes the file as base64 and marks it as binary data. + + Supported formats: + - All file types (fallback) + - MIME types: application/octet-stream (default) + - File extensions: All (fallback) + """ + def detect(self, fileName: str, mimeType: str, headBytes: bytes) -> bool: return True + + def getSupportedExtensions(self) -> list[str]: + """Return list of supported file extensions (all).""" + return [] # Accepts all extensions as fallback + + def getSupportedMimeTypes(self) -> list[str]: + """Return list of supported MIME types (all).""" + return [] # Accepts all MIME types as fallback def extract(self, fileBytes: bytes, context: Dict[str, Any]) -> List[ContentPart]: mimeType = context.get("mimeType") or "application/octet-stream" diff --git a/modules/services/serviceExtraction/extractors/extractorCsv.py b/modules/services/serviceExtraction/extractors/extractorCsv.py index 27233979..fb1c642e 100644 --- a/modules/services/serviceExtraction/extractors/extractorCsv.py +++ b/modules/services/serviceExtraction/extractors/extractorCsv.py @@ -6,8 +6,25 @@ from ..subRegistry import Extractor class CsvExtractor(Extractor): + """ + Extractor for CSV files. + + Supported formats: + - MIME types: text/csv + - File extensions: .csv + - Special handling: Treats as table data + """ + def detect(self, fileName: str, mimeType: str, headBytes: bytes) -> bool: return mimeType == "text/csv" or (fileName or "").lower().endswith(".csv") + + def getSupportedExtensions(self) -> list[str]: + """Return list of supported file extensions.""" + return [".csv"] + + def getSupportedMimeTypes(self) -> list[str]: + """Return list of supported MIME types.""" + return ["text/csv"] def extract(self, fileBytes: bytes, context: Dict[str, Any]) -> List[ContentPart]: fileName = context.get("fileName") diff --git a/modules/services/serviceExtraction/extractors/extractorDocx.py b/modules/services/serviceExtraction/extractors/extractorDocx.py index 51384ffd..bce9f04b 100644 --- a/modules/services/serviceExtraction/extractors/extractorDocx.py +++ b/modules/services/serviceExtraction/extractors/extractorDocx.py @@ -7,6 +7,16 @@ from ..subRegistry import Extractor class DocxExtractor(Extractor): + """ + Extractor for Microsoft Word documents. + + Supported formats: + - MIME types: application/vnd.openxmlformats-officedocument.wordprocessingml.document + - File extensions: .docx + - Special handling: Extracts paragraphs and tables (converts tables to CSV) + - Dependencies: python-docx + """ + def __init__(self): self._loaded = False self._haveLibs = False @@ -24,6 +34,14 @@ class DocxExtractor(Extractor): def detect(self, fileName: str, mimeType: str, headBytes: bytes) -> bool: return mimeType == "application/vnd.openxmlformats-officedocument.wordprocessingml.document" or (fileName or "").lower().endswith(".docx") + + def getSupportedExtensions(self) -> list[str]: + """Return list of supported file extensions.""" + return [".docx"] + + def getSupportedMimeTypes(self) -> list[str]: + """Return list of supported MIME types.""" + return ["application/vnd.openxmlformats-officedocument.wordprocessingml.document"] def extract(self, fileBytes: bytes, context: Dict[str, Any]) -> List[ContentPart]: self._load() diff --git a/modules/services/serviceExtraction/extractors/extractorHtml.py b/modules/services/serviceExtraction/extractors/extractorHtml.py index 09da02f4..730df49c 100644 --- a/modules/services/serviceExtraction/extractors/extractorHtml.py +++ b/modules/services/serviceExtraction/extractors/extractorHtml.py @@ -7,8 +7,26 @@ from ..subRegistry import Extractor class HtmlExtractor(Extractor): + """ + Extractor for HTML files. + + Supported formats: + - MIME types: text/html + - File extensions: .html, .htm + - Special handling: Uses BeautifulSoup for parsing + - Dependencies: beautifulsoup4 + """ + def detect(self, fileName: str, mimeType: str, headBytes: bytes) -> bool: return mimeType == "text/html" or (fileName or "").lower().endswith((".html", ".htm")) + + def getSupportedExtensions(self) -> list[str]: + """Return list of supported file extensions.""" + return [".html", ".htm"] + + def getSupportedMimeTypes(self) -> list[str]: + """Return list of supported MIME types.""" + return ["text/html"] def extract(self, fileBytes: bytes, context: Dict[str, Any]) -> List[ContentPart]: mimeType = context.get("mimeType") or "text/html" diff --git a/modules/services/serviceExtraction/extractors/extractorImage.py b/modules/services/serviceExtraction/extractors/extractorImage.py index 3f94459c..578e0148 100644 --- a/modules/services/serviceExtraction/extractors/extractorImage.py +++ b/modules/services/serviceExtraction/extractors/extractorImage.py @@ -10,8 +10,26 @@ logger = logging.getLogger(__name__) class ImageExtractor(Extractor): + """ + Extractor for image files. + + Supported formats: + - MIME types: image/jpeg, image/png, image/gif, image/webp, image/bmp, image/tiff + - File extensions: .jpg, .jpeg, .png, .gif, .webp, .bmp, .tiff + - Special handling: GIF files are converted to PNG during extraction + """ + def detect(self, fileName: str, mimeType: str, headBytes: bytes) -> bool: - return (mimeType or "").startswith("image/") + return ((mimeType or "").startswith("image/") or + (fileName or "").lower().endswith((".jpg", ".jpeg", ".png", ".gif", ".webp", ".bmp", ".tiff"))) + + def getSupportedExtensions(self) -> list[str]: + """Return list of supported file extensions.""" + return [".jpg", ".jpeg", ".png", ".gif", ".webp", ".bmp", ".tiff"] + + def getSupportedMimeTypes(self) -> list[str]: + """Return list of supported MIME types.""" + return ["image/jpeg", "image/png", "image/gif", "image/webp", "image/bmp", "image/tiff"] def extract(self, fileBytes: bytes, context: Dict[str, Any]) -> List[ContentPart]: mimeType = context.get("mimeType") or "image/unknown" diff --git a/modules/services/serviceExtraction/extractors/extractorJson.py b/modules/services/serviceExtraction/extractors/extractorJson.py index 86eac791..04ab1c10 100644 --- a/modules/services/serviceExtraction/extractors/extractorJson.py +++ b/modules/services/serviceExtraction/extractors/extractorJson.py @@ -7,8 +7,25 @@ from ..subRegistry import Extractor class JsonExtractor(Extractor): + """ + Extractor for JSON files. + + Supported formats: + - MIME types: application/json + - File extensions: .json + - Special handling: Validates JSON format, falls back to text if invalid + """ + def detect(self, fileName: str, mimeType: str, headBytes: bytes) -> bool: return mimeType == "application/json" or (fileName or "").lower().endswith(".json") + + def getSupportedExtensions(self) -> list[str]: + """Return list of supported file extensions.""" + return [".json"] + + def getSupportedMimeTypes(self) -> list[str]: + """Return list of supported MIME types.""" + return ["application/json"] def extract(self, fileBytes: bytes, context: Dict[str, Any]) -> List[ContentPart]: mimeType = context.get("mimeType") or "application/json" diff --git a/modules/services/serviceExtraction/extractors/extractorPdf.py b/modules/services/serviceExtraction/extractors/extractorPdf.py index 59c88dc7..4f0290ec 100644 --- a/modules/services/serviceExtraction/extractors/extractorPdf.py +++ b/modules/services/serviceExtraction/extractors/extractorPdf.py @@ -8,6 +8,16 @@ from ..subRegistry import Extractor class PdfExtractor(Extractor): + """ + Extractor for PDF files. + + Supported formats: + - MIME types: application/pdf + - File extensions: .pdf + - Special handling: Extracts text per page and embedded images + - Dependencies: PyPDF2, PyMuPDF (fitz) + """ + def __init__(self): self._loaded = False self._haveLibs = False @@ -26,6 +36,14 @@ class PdfExtractor(Extractor): def detect(self, fileName: str, mimeType: str, headBytes: bytes) -> bool: return mimeType == "application/pdf" or (fileName or "").lower().endswith(".pdf") + + def getSupportedExtensions(self) -> list[str]: + """Return list of supported file extensions.""" + return [".pdf"] + + def getSupportedMimeTypes(self) -> list[str]: + """Return list of supported MIME types.""" + return ["application/pdf"] def extract(self, fileBytes: bytes, context: Dict[str, Any]) -> List[ContentPart]: self._load() diff --git a/modules/services/serviceExtraction/extractors/extractorPptx.py b/modules/services/serviceExtraction/extractors/extractorPptx.py index 096b7925..1a5a7ff8 100644 --- a/modules/services/serviceExtraction/extractors/extractorPptx.py +++ b/modules/services/serviceExtraction/extractors/extractorPptx.py @@ -8,7 +8,15 @@ logger = logging.getLogger(__name__) class PptxExtractor(Extractor): - """Extractor for PowerPoint (.pptx) files using python-pptx library.""" + """ + Extractor for PowerPoint files. + + Supported formats: + - MIME types: application/vnd.openxmlformats-officedocument.presentationml.presentation, application/vnd.ms-powerpoint + - File extensions: .pptx, .ppt + - Special handling: Extracts slide content, tables, and images + - Dependencies: python-pptx + """ def __init__(self): self._loaded = False @@ -31,6 +39,17 @@ class PptxExtractor(Extractor): "application/vnd.ms-powerpoint" ]) or (fileName or "").lower().endswith((".pptx", ".ppt")) + def getSupportedExtensions(self) -> list[str]: + """Return list of supported file extensions.""" + return [".pptx", ".ppt"] + + def getSupportedMimeTypes(self) -> list[str]: + """Return list of supported MIME types.""" + return [ + "application/vnd.openxmlformats-officedocument.presentationml.presentation", + "application/vnd.ms-powerpoint" + ] + def extract(self, fileBytes: bytes, context: Dict[str, Any]) -> List[ContentPart]: """ Extract content from PowerPoint files. diff --git a/modules/services/serviceExtraction/extractors/extractorSql.py b/modules/services/serviceExtraction/extractors/extractorSql.py new file mode 100644 index 00000000..c751d7ca --- /dev/null +++ b/modules/services/serviceExtraction/extractors/extractorSql.py @@ -0,0 +1,56 @@ +from typing import Any, Dict, List + +from modules.datamodels.datamodelExtraction import ContentPart +from ..subUtils import makeId +from ..subRegistry import Extractor + + +class SqlExtractor(Extractor): + """ + Extractor for SQL files. + + Supported formats: + - MIME types: text/x-sql, application/sql + - File extensions: .sql, .ddl, .dml, .dcl, .tcl + - Special handling: Treats as structured text with SQL syntax + """ + + def detect(self, fileName: str, mimeType: str, headBytes: bytes) -> bool: + return (mimeType in ("text/x-sql", "application/sql") or + (fileName or "").lower().endswith((".sql", ".ddl", ".dml", ".dcl", ".tcl"))) + + def getSupportedExtensions(self) -> list[str]: + """Return list of supported file extensions.""" + return [".sql", ".ddl", ".dml", ".dcl", ".tcl"] + + def getSupportedMimeTypes(self) -> list[str]: + """Return list of supported MIME types.""" + return ["text/x-sql", "application/sql"] + + def extract(self, fileBytes: bytes, context: Dict[str, Any]) -> List[ContentPart]: + fileName = context.get("fileName") + mimeType = context.get("mimeType") or "text/x-sql" + data = fileBytes.decode("utf-8", errors="replace") + + # Add SQL-specific metadata + metadata = { + "size": len(fileBytes), + "file_type": "sql", + "line_count": len(data.splitlines()), + "has_select": "SELECT" in data.upper(), + "has_insert": "INSERT" in data.upper(), + "has_update": "UPDATE" in data.upper(), + "has_delete": "DELETE" in data.upper(), + "has_create": "CREATE" in data.upper(), + "has_drop": "DROP" in data.upper() + } + + return [ContentPart( + id=makeId(), + parentId=None, + label="main", + typeGroup="structure", + mimeType=mimeType, + data=data, + metadata=metadata + )] diff --git a/modules/services/serviceExtraction/extractors/extractorText.py b/modules/services/serviceExtraction/extractors/extractorText.py index a6d92bc1..3cd0ebdf 100644 --- a/modules/services/serviceExtraction/extractors/extractorText.py +++ b/modules/services/serviceExtraction/extractors/extractorText.py @@ -6,8 +6,85 @@ from ..subRegistry import Extractor class TextExtractor(Extractor): + """ + Extractor for plain text files and code files. + + Supported formats: + - MIME types: text/plain, text/markdown, text/x-python, text/x-java-source, text/javascript, etc. + - File extensions: .txt, .md, .log, .java, .js, .jsx, .ts, .tsx, .py, .config, .ini, .cfg, .conf, .properties, .yaml, .yml, .toml, .sh, .bat, .ps1, .sql, .css, .scss, .sass, .less, .xml, .json, .csv, .tsv, .rtf, .tex, .rst, .adoc, .org, .pod, .man, .1, .2, .3, .4, .5, .6, .7, .8, .9, .n, .l, .m, .r, .t, .x, .y, .z + """ + def detect(self, fileName: str, mimeType: str, headBytes: bytes) -> bool: - return mimeType in ("text/plain", "text/markdown") + # Check MIME types + if mimeType and mimeType.startswith("text/"): + return True + + # Check file extensions + if fileName: + ext = fileName.lower() + return ext.endswith(( + # Basic text files + ".txt", ".md", ".log", ".rtf", ".tex", ".rst", ".adoc", ".org", ".pod", + # Programming languages + ".java", ".js", ".jsx", ".ts", ".tsx", ".py", ".rb", ".go", ".rs", ".cpp", ".c", ".h", ".hpp", ".cc", ".cxx", + ".cs", ".php", ".swift", ".kt", ".scala", ".clj", ".hs", ".ml", ".fs", ".vb", ".dart", ".r", ".m", ".pl", ".sh", + # Web technologies + ".html", ".htm", ".css", ".scss", ".sass", ".less", ".vue", ".svelte", + # Configuration files + ".config", ".ini", ".cfg", ".conf", ".properties", ".yaml", ".yml", ".toml", ".json", ".xml", + # Scripts and automation + ".bat", ".ps1", ".psm1", ".psd1", ".vbs", ".wsf", ".cmd", ".com", + # Data files + ".csv", ".tsv", ".tab", ".dat", ".data", + # Documentation + ".man", ".1", ".2", ".3", ".4", ".5", ".6", ".7", ".8", ".9", ".n", ".l", ".m", ".r", ".t", ".x", ".y", ".z", + # Other text formats + ".diff", ".patch", ".gitignore", ".dockerignore", ".editorconfig", ".gitattributes", + ".env", ".env.local", ".env.development", ".env.production", ".env.test", + ".lock", ".lockb", ".lockfile", ".pkg-lock", ".yarn-lock" + )) + + return False + + def getSupportedExtensions(self) -> list[str]: + """Return list of supported file extensions.""" + return [ + # Basic text files + ".txt", ".md", ".log", ".rtf", ".tex", ".rst", ".adoc", ".org", ".pod", + # Programming languages + ".java", ".js", ".jsx", ".ts", ".tsx", ".py", ".rb", ".go", ".rs", ".cpp", ".c", ".h", ".hpp", ".cc", ".cxx", + ".cs", ".php", ".swift", ".kt", ".scala", ".clj", ".hs", ".ml", ".fs", ".vb", ".dart", ".r", ".m", ".pl", ".sh", + # Web technologies + ".html", ".htm", ".css", ".scss", ".sass", ".less", ".vue", ".svelte", + # Configuration files + ".config", ".ini", ".cfg", ".conf", ".properties", ".yaml", ".yml", ".toml", ".json", ".xml", + # Scripts and automation + ".bat", ".ps1", ".psm1", ".psd1", ".vbs", ".wsf", ".cmd", ".com", + # Data files + ".csv", ".tsv", ".tab", ".dat", ".data", + # Documentation + ".man", ".1", ".2", ".3", ".4", ".5", ".6", ".7", ".8", ".9", ".n", ".l", ".m", ".r", ".t", ".x", ".y", ".z", + # Other text formats + ".diff", ".patch", ".gitignore", ".dockerignore", ".editorconfig", ".gitattributes", + ".env", ".env.local", ".env.development", ".env.production", ".env.test", + ".lock", ".lockb", ".lockfile", ".pkg-lock", ".yarn-lock" + ] + + def getSupportedMimeTypes(self) -> list[str]: + """Return list of supported MIME types.""" + return [ + "text/plain", "text/markdown", "text/x-python", "text/x-java-source", + "text/javascript", "text/x-javascript", "text/typescript", "text/x-typescript", + "text/x-c", "text/x-c++", "text/x-csharp", "text/x-php", "text/x-ruby", + "text/x-go", "text/x-rust", "text/x-scala", "text/x-swift", "text/x-kotlin", + "text/x-sql", "text/x-sh", "text/x-shellscript", "text/x-yaml", "text/x-toml", + "text/x-ini", "text/x-config", "text/x-properties", "text/x-log", + "text/html", "text/css", "text/x-scss", "text/x-sass", "text/x-less", + "text/xml", "text/csv", "text/tab-separated-values", "text/rtf", + "text/x-tex", "text/x-rst", "text/x-asciidoc", "text/x-org", + "application/x-yaml", "application/x-toml", "application/x-ini", + "application/x-config", "application/x-properties", "application/x-log" + ] def extract(self, fileBytes: bytes, context: Dict[str, Any]) -> List[ContentPart]: fileName = context.get("fileName") diff --git a/modules/services/serviceExtraction/extractors/extractorXlsx.py b/modules/services/serviceExtraction/extractors/extractorXlsx.py index ea6396a2..af346419 100644 --- a/modules/services/serviceExtraction/extractors/extractorXlsx.py +++ b/modules/services/serviceExtraction/extractors/extractorXlsx.py @@ -8,6 +8,16 @@ from ..subRegistry import Extractor class XlsxExtractor(Extractor): + """ + Extractor for Microsoft Excel spreadsheets. + + Supported formats: + - MIME types: application/vnd.openxmlformats-officedocument.spreadsheetml.sheet + - File extensions: .xlsx, .xlsm + - Special handling: Extracts all sheets as CSV data + - Dependencies: openpyxl + """ + def __init__(self): self._loaded = False self._haveLibs = False @@ -26,6 +36,14 @@ class XlsxExtractor(Extractor): def detect(self, fileName: str, mimeType: str, headBytes: bytes) -> bool: mt = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" return mimeType == mt or (fileName or "").lower().endswith((".xlsx", ".xlsm")) + + def getSupportedExtensions(self) -> list[str]: + """Return list of supported file extensions.""" + return [".xlsx", ".xlsm"] + + def getSupportedMimeTypes(self) -> list[str]: + """Return list of supported MIME types.""" + return ["application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"] def extract(self, fileBytes: bytes, context: Dict[str, Any]) -> List[ContentPart]: self._load() diff --git a/modules/services/serviceExtraction/extractors/extractorXml.py b/modules/services/serviceExtraction/extractors/extractorXml.py index 5aabea35..c7d034ad 100644 --- a/modules/services/serviceExtraction/extractors/extractorXml.py +++ b/modules/services/serviceExtraction/extractors/extractorXml.py @@ -7,8 +7,25 @@ from ..subRegistry import Extractor class XmlExtractor(Extractor): + """ + Extractor for XML files. + + Supported formats: + - MIME types: application/xml + - File extensions: .xml, .rss, .atom + - Special handling: Uses ElementTree for parsing + """ + def detect(self, fileName: str, mimeType: str, headBytes: bytes) -> bool: return mimeType == "application/xml" or (fileName or "").lower().endswith((".xml", ".rss", ".atom")) + + def getSupportedExtensions(self) -> list[str]: + """Return list of supported file extensions.""" + return [".xml", ".rss", ".atom"] + + def getSupportedMimeTypes(self) -> list[str]: + """Return list of supported MIME types.""" + return ["application/xml"] def extract(self, fileBytes: bytes, context: Dict[str, Any]) -> List[ContentPart]: mimeType = context.get("mimeType") or "application/xml" diff --git a/modules/services/serviceExtraction/subRegistry.py b/modules/services/serviceExtraction/subRegistry.py index ae994bbf..eb2ece4d 100644 --- a/modules/services/serviceExtraction/subRegistry.py +++ b/modules/services/serviceExtraction/subRegistry.py @@ -7,11 +7,31 @@ logger = logging.getLogger(__name__) class Extractor: + """ + Base class for all document extractors. + + Each extractor should implement: + - detect(): Check if this extractor can handle the given file + - extract(): Extract content from the file + - getSupportedExtensions(): Return supported file extensions + - getSupportedMimeTypes(): Return supported MIME types + """ + def detect(self, fileName: str, mimeType: str, headBytes: bytes) -> bool: + """Check if this extractor can handle the given file.""" return False def extract(self, fileBytes: bytes, context: Dict[str, Any]) -> list[ContentPart]: + """Extract content from the file bytes.""" raise NotImplementedError + + def getSupportedExtensions(self) -> list[str]: + """Return list of supported file extensions (including dots).""" + return [] + + def getSupportedMimeTypes(self) -> list[str]: + """Return list of supported MIME types.""" + return [] class Chunker: @@ -23,55 +43,85 @@ class ExtractorRegistry: def __init__(self): self._map: Dict[str, Extractor] = {} self._fallback: Optional[Extractor] = None - # Register built-ins + self._auto_discover_extractors() + + def _auto_discover_extractors(self): + """Auto-discover and register all extractors from the extractors directory.""" try: - from .extractors.extractorText import TextExtractor - from .extractors.extractorCsv import CsvExtractor - from .extractors.extractorJson import JsonExtractor - from .extractors.extractorXml import XmlExtractor - from .extractors.extractorHtml import HtmlExtractor - from .extractors.extractorPdf import PdfExtractor - from .extractors.extractorDocx import DocxExtractor - from .extractors.extractorXlsx import XlsxExtractor - from .extractors.extractorPptx import PptxExtractor - from .extractors.extractorImage import ImageExtractor - from .extractors.extractorBinary import BinaryExtractor - self.register("text/plain", TextExtractor()) - self.register("text/markdown", TextExtractor()) - self.register("text/csv", CsvExtractor()) - self.register("application/json", JsonExtractor()) - self.register("application/xml", XmlExtractor()) - self.register("text/html", HtmlExtractor()) - self.register("application/pdf", PdfExtractor()) - self.register("application/vnd.openxmlformats-officedocument.wordprocessingml.document", DocxExtractor()) - self.register("application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", XlsxExtractor()) - self.register("application/vnd.openxmlformats-officedocument.presentationml.presentation", PptxExtractor()) - self.register("application/vnd.ms-powerpoint", PptxExtractor()) - # images - self.register("image/jpeg", ImageExtractor()) - self.register("image/png", ImageExtractor()) - self.register("image/gif", ImageExtractor()) - # extension fallbacks - self.register("txt", TextExtractor()) - self.register("md", TextExtractor()) - self.register("csv", CsvExtractor()) - self.register("json", JsonExtractor()) - self.register("xml", XmlExtractor()) - self.register("html", HtmlExtractor()) - self.register("htm", HtmlExtractor()) - self.register("pdf", PdfExtractor()) - self.register("docx", DocxExtractor()) - self.register("xlsx", XlsxExtractor()) - self.register("xlsm", XlsxExtractor()) - self.register("pptx", PptxExtractor()) - self.register("ppt", PptxExtractor()) - # fallback - self.setFallback(BinaryExtractor()) - logger.info(f"ExtractorRegistry: Successfully registered {len(self._map)} extractors") + import os + import importlib + from pathlib import Path + + # Get the extractors directory + current_dir = Path(__file__).parent + extractors_dir = current_dir / "extractors" + + if not extractors_dir.exists(): + logger.error(f"Extractors directory not found: {extractors_dir}") + return + + # Import all extractor modules + extractor_modules = [] + for file_path in extractors_dir.glob("extractor*.py"): + if file_path.name == "__init__.py": + continue + + module_name = file_path.stem + try: + # Import the module + module = importlib.import_module(f".{module_name}", package="modules.services.serviceExtraction.extractors") + + # Find all extractor classes in the module + for attr_name in dir(module): + attr = getattr(module, attr_name) + if (isinstance(attr, type) and + issubclass(attr, Extractor) and + attr != Extractor and + not attr_name.startswith('_')): + + # Create instance and auto-register + extractor_instance = attr() + self._auto_register_extractor(extractor_instance) + extractor_modules.append(attr_name) + + except Exception as e: + logger.warning(f"Failed to import {module_name}: {str(e)}") + continue + + # Set fallback extractor + try: + from .extractors.extractorBinary import BinaryExtractor + self.setFallback(BinaryExtractor()) + except Exception as e: + logger.warning(f"Failed to set fallback extractor: {str(e)}") + + logger.info(f"ExtractorRegistry: Auto-discovered and registered {len(extractor_modules)} extractor classes: {', '.join(extractor_modules)}") + logger.info(f"ExtractorRegistry: Total registered formats: {len(self._map)}") + except Exception as e: - logger.error(f"ExtractorRegistry: Failed to register extractors: {str(e)}") + logger.error(f"ExtractorRegistry: Failed to auto-discover extractors: {str(e)}") import traceback traceback.print_exc() + + def _auto_register_extractor(self, extractor: Extractor): + """Auto-register an extractor based on its declared supported formats.""" + try: + # Register MIME types + mime_types = extractor.getSupportedMimeTypes() + for mime_type in mime_types: + self.register(mime_type, extractor) + logger.debug(f"Registered MIME type: {mime_type} โ†’ {extractor.__class__.__name__}") + + # Register file extensions + extensions = extractor.getSupportedExtensions() + for ext in extensions: + # Remove leading dot for registry key + ext_key = ext.lstrip('.') + self.register(ext_key, extractor) + logger.debug(f"Registered extension: .{ext_key} โ†’ {extractor.__class__.__name__}") + + except Exception as e: + logger.error(f"Failed to auto-register {extractor.__class__.__name__}: {str(e)}") def register(self, key: str, extractor: Extractor): self._map[key] = extractor @@ -88,6 +138,43 @@ class ExtractorRegistry: if ext in self._map: return self._map[ext] return self._fallback + + def getAllSupportedFormats(self) -> Dict[str, Dict[str, list[str]]]: + """ + Get all supported formats from all registered extractors. + + Returns: + Dictionary with format information: + { + "extensions": { + "extractor_name": [".ext1", ".ext2", ...] + }, + "mime_types": { + "extractor_name": ["mime/type1", "mime/type2", ...] + } + } + """ + formats = {"extensions": {}, "mime_types": {}} + + # Get formats from registered extractors + for key, extractor in self._map.items(): + if hasattr(extractor, 'getSupportedExtensions'): + extensions = extractor.getSupportedExtensions() + if extensions: + formats["extensions"][key] = extensions + + if hasattr(extractor, 'getSupportedMimeTypes'): + mime_types = extractor.getSupportedMimeTypes() + if mime_types: + formats["mime_types"][key] = mime_types + + # Add fallback extractor info + if self._fallback and hasattr(self._fallback, 'getSupportedExtensions'): + formats["extensions"]["fallback"] = self._fallback.getSupportedExtensions() + if self._fallback and hasattr(self._fallback, 'getSupportedMimeTypes'): + formats["mime_types"]["fallback"] = self._fallback.getSupportedMimeTypes() + + return formats class ChunkerRegistry: diff --git a/test_document_processing.py b/test_document_processing.py index 777b0ddf..41e3a3a2 100644 --- a/test_document_processing.py +++ b/test_document_processing.py @@ -51,7 +51,24 @@ async def process_documents_and_generate_summary(): return False # Find all supported document files - supported_extensions = ["*.pdf", "*.jpg", "*.jpeg", "*.png", "*.gif", "*.docx", "*.xlsx", "*.pptx", "*.ppt", "*.txt", "*.md", "*.html", "*.csv"] + supported_extensions = [ + # Document formats + "*.pdf", "*.docx", "*.xlsx", "*.pptx", "*.ppt", + # Image formats + "*.jpg", "*.jpeg", "*.png", "*.gif", "*.webp", "*.bmp", "*.tiff", + # Text and code files + "*.txt", "*.md", "*.log", "*.rtf", "*.tex", "*.rst", "*.adoc", "*.org", "*.pod", + "*.java", "*.js", "*.jsx", "*.ts", "*.tsx", "*.py", "*.rb", "*.go", "*.rs", "*.cpp", "*.c", "*.h", "*.hpp", "*.cc", "*.cxx", + "*.cs", "*.php", "*.swift", "*.kt", "*.scala", "*.clj", "*.hs", "*.ml", "*.fs", "*.vb", "*.dart", "*.r", "*.m", "*.pl", "*.sh", + "*.html", "*.htm", "*.css", "*.scss", "*.sass", "*.less", "*.vue", "*.svelte", + "*.config", "*.ini", "*.cfg", "*.conf", "*.properties", "*.yaml", "*.yml", "*.toml", "*.json", "*.xml", + "*.bat", "*.ps1", "*.psm1", "*.psd1", "*.vbs", "*.wsf", "*.cmd", "*.com", + "*.csv", "*.tsv", "*.tab", "*.dat", "*.data", + "*.man", "*.1", "*.2", "*.3", "*.4", "*.5", "*.6", "*.7", "*.8", "*.9", "*.n", "*.l", "*.m", "*.r", "*.t", "*.x", "*.y", "*.z", + "*.diff", "*.patch", "*.gitignore", "*.dockerignore", "*.editorconfig", "*.gitattributes", + "*.env", "*.env.local", "*.env.development", "*.env.production", "*.env.test", + "*.lock", "*.lockb", "*.lockfile", "*.pkg-lock", "*.yarn-lock" + ] document_files = [] for ext in supported_extensions: document_files.extend(list(testdata_path.glob(ext))) @@ -164,6 +181,8 @@ async def process_documents_and_generate_summary(): mime_type = "text/html" elif doc_file.suffix.lower() == '.csv': mime_type = "text/csv" + elif doc_file.suffix.lower() == '.json': + mime_type = "application/json" elif doc_file.suffix.lower() in ['.txt', '.md']: mime_type = "text/plain" @@ -199,7 +218,7 @@ async def process_documents_and_generate_summary(): # Run a single end-to-end test to avoid the loop issue logger.info("๐Ÿงช Running single end-to-end test...") - userPrompt = "Analyze these documents and create a comprehensive summary for all input documents, each input document in a separate chapter summarized in 10-20 sentences." + userPrompt = "Analyze the document containing mails for customer use cases. Can you create one file for each email in plain text format?" # userPrompt = "Analyze these documents and create a fitting image for the content" @@ -215,8 +234,8 @@ async def process_documents_and_generate_summary(): prompt=userPrompt, documents=documents, options=ai_options, - outputFormat="docx", - title="Formulaire" + outputFormat="txt", + title="Kunden und Use Cases" ) logger.info(f"โœ… End-to-end test completed successfully") diff --git a/test_extractor_formats.py b/test_extractor_formats.py new file mode 100644 index 00000000..201622ff --- /dev/null +++ b/test_extractor_formats.py @@ -0,0 +1,117 @@ +#!/usr/bin/env python3 +""" +Test script to demonstrate enhanced extractor format support. +Shows all supported file extensions and MIME types for each extractor. +""" + +import sys +import os +from pathlib import Path + +# Add the gateway module to the path +sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'modules')) + +from modules.services.serviceExtraction.subRegistry import ExtractorRegistry + +def test_extractor_formats(): + """Test and display all supported formats from extractors.""" + print("๐Ÿ” Testing Plug-and-Play Extractor System") + print("=" * 60) + + # Create registry + registry = ExtractorRegistry() + + # Get all supported formats + formats = registry.getAllSupportedFormats() + + print("\n๐Ÿ“‹ Supported File Extensions by Extractor:") + print("-" * 50) + for extractor_name, extensions in formats["extensions"].items(): + if extensions: + print(f" {extractor_name:20} โ†’ {', '.join(extensions)}") + else: + print(f" {extractor_name:20} โ†’ (all extensions - fallback)") + + print("\n๐Ÿ“‹ Supported MIME Types by Extractor:") + print("-" * 50) + for extractor_name, mime_types in formats["mime_types"].items(): + if mime_types: + print(f" {extractor_name:20} โ†’ {', '.join(mime_types)}") + else: + print(f" {extractor_name:20} โ†’ (all MIME types - fallback)") + + # Test individual extractors + print("\n๐Ÿงช Testing Individual Extractors:") + print("-" * 50) + + # Get all registered extractors + for key, extractor in registry._map.items(): + if hasattr(extractor, 'getSupportedExtensions') and hasattr(extractor, 'getSupportedMimeTypes'): + extensions = extractor.getSupportedExtensions() + mime_types = extractor.getSupportedMimeTypes() + print(f"\n {extractor.__class__.__name__}:") + print(f" Extensions: {extensions}") + print(f" MIME Types: {mime_types}") + + # Test detection with various file types + print("\n๐Ÿ”ฌ Testing File Detection:") + print("-" * 50) + + test_files = [ + # Document formats + ("document.pdf", "application/pdf"), + ("spreadsheet.xlsx", "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"), + ("presentation.pptx", "application/vnd.openxmlformats-officedocument.presentationml.presentation"), + ("document.docx", "application/vnd.openxmlformats-officedocument.wordprocessingml.document"), + + # Text and code files + ("readme.txt", "text/plain"), + ("readme.md", "text/markdown"), + ("app.log", "text/plain"), + ("Main.java", "text/x-java-source"), + ("script.js", "text/javascript"), + ("component.tsx", "text/typescript"), + ("main.py", "text/x-python"), + ("config.yaml", "text/x-yaml"), + ("package.json", "application/json"), + ("data.csv", "text/csv"), + ("config.xml", "application/xml"), + ("webpage.html", "text/html"), + ("styles.css", "text/css"), + ("script.sh", "text/x-sh"), + ("Dockerfile", "text/plain"), + (".gitignore", "text/plain"), + ("app.config", "text/plain"), + ("database.sql", "text/x-sql"), + ("schema.ddl", "application/sql"), + + # Images + ("image.png", "image/png"), + ("photo.jpg", "image/jpeg"), + + # Unknown + ("unknown.xyz", "application/octet-stream") + ] + + for filename, mime_type in test_files: + extractor = registry.resolve(mime_type, filename) + if extractor: + print(f" {filename:25} ({mime_type:50}) โ†’ {extractor.__class__.__name__}") + else: + print(f" {filename:25} ({mime_type:50}) โ†’ No extractor found") + + print("\nโœ… Plug-and-Play extractor system test completed!") + print("\nKey improvements:") + print(" โ€ข ๐Ÿ”Œ TRUE PLUG-AND-PLAY: Just add extractor file, it auto-registers!") + print(" โ€ข ๐Ÿ“‹ No more manual registration of file types") + print(" โ€ข ๐Ÿ” Auto-discovery scans extractors directory") + print(" โ€ข ๐Ÿ“ Each extractor declares its own supported formats") + print(" โ€ข ๐Ÿš€ Easy to add new file types - just create new extractor") + print(" โ€ข ๐Ÿงน Clean, maintainable code with no redundancy") + print("\nTo add a new file type:") + print(" 1. Create extractorXyz.py in extractors/ directory") + print(" 2. Implement Extractor interface with getSupportedExtensions()") + print(" 3. That's it! No registry changes needed!") + +if __name__ == "__main__": + test_extractor_formats()