From 0c357dc8a9549d403915ac7c621992beb2be60c2 Mon Sep 17 00:00:00 2001
From: ValueOn AG
Date: Mon, 13 Oct 2025 22:03:28 +0200
Subject: [PATCH] AI system tested for all file types
---
.../extractors/extractorBinary.py | 20 ++
.../extractors/extractorCsv.py | 17 ++
.../extractors/extractorDocx.py | 18 ++
.../extractors/extractorHtml.py | 18 ++
.../extractors/extractorImage.py | 20 +-
.../extractors/extractorJson.py | 17 ++
.../extractors/extractorPdf.py | 18 ++
.../extractors/extractorPptx.py | 21 ++-
.../extractors/extractorSql.py | 56 ++++++
.../extractors/extractorText.py | 79 +++++++-
.../extractors/extractorXlsx.py | 18 ++
.../extractors/extractorXml.py | 17 ++
.../services/serviceExtraction/subRegistry.py | 177 +++++++++++++-----
test_document_processing.py | 27 ++-
test_extractor_formats.py | 117 ++++++++++++
15 files changed, 588 insertions(+), 52 deletions(-)
create mode 100644 modules/services/serviceExtraction/extractors/extractorSql.py
create mode 100644 test_extractor_formats.py
diff --git a/modules/services/serviceExtraction/extractors/extractorBinary.py b/modules/services/serviceExtraction/extractors/extractorBinary.py
index e6667fda..8a52986c 100644
--- a/modules/services/serviceExtraction/extractors/extractorBinary.py
+++ b/modules/services/serviceExtraction/extractors/extractorBinary.py
@@ -7,8 +7,28 @@ from ..subRegistry import Extractor
class BinaryExtractor(Extractor):
+ """
+ Fallback extractor for unsupported file types.
+
+ This extractor handles any file type that doesn't match other extractors.
+ It encodes the file as base64 and marks it as binary data.
+
+ Supported formats:
+ - All file types (fallback)
+ - MIME types: application/octet-stream (default)
+ - File extensions: All (fallback)
+ """
+
def detect(self, fileName: str, mimeType: str, headBytes: bytes) -> bool:
return True
+
+ def getSupportedExtensions(self) -> list[str]:
+ """Return list of supported file extensions (all)."""
+ return [] # Accepts all extensions as fallback
+
+ def getSupportedMimeTypes(self) -> list[str]:
+ """Return list of supported MIME types (all)."""
+ return [] # Accepts all MIME types as fallback
def extract(self, fileBytes: bytes, context: Dict[str, Any]) -> List[ContentPart]:
mimeType = context.get("mimeType") or "application/octet-stream"
diff --git a/modules/services/serviceExtraction/extractors/extractorCsv.py b/modules/services/serviceExtraction/extractors/extractorCsv.py
index 27233979..fb1c642e 100644
--- a/modules/services/serviceExtraction/extractors/extractorCsv.py
+++ b/modules/services/serviceExtraction/extractors/extractorCsv.py
@@ -6,8 +6,25 @@ from ..subRegistry import Extractor
class CsvExtractor(Extractor):
+ """
+ Extractor for CSV files.
+
+ Supported formats:
+ - MIME types: text/csv
+ - File extensions: .csv
+ - Special handling: Treats as table data
+ """
+
def detect(self, fileName: str, mimeType: str, headBytes: bytes) -> bool:
return mimeType == "text/csv" or (fileName or "").lower().endswith(".csv")
+
+ def getSupportedExtensions(self) -> list[str]:
+ """Return list of supported file extensions."""
+ return [".csv"]
+
+ def getSupportedMimeTypes(self) -> list[str]:
+ """Return list of supported MIME types."""
+ return ["text/csv"]
def extract(self, fileBytes: bytes, context: Dict[str, Any]) -> List[ContentPart]:
fileName = context.get("fileName")
diff --git a/modules/services/serviceExtraction/extractors/extractorDocx.py b/modules/services/serviceExtraction/extractors/extractorDocx.py
index 51384ffd..bce9f04b 100644
--- a/modules/services/serviceExtraction/extractors/extractorDocx.py
+++ b/modules/services/serviceExtraction/extractors/extractorDocx.py
@@ -7,6 +7,16 @@ from ..subRegistry import Extractor
class DocxExtractor(Extractor):
+ """
+ Extractor for Microsoft Word documents.
+
+ Supported formats:
+ - MIME types: application/vnd.openxmlformats-officedocument.wordprocessingml.document
+ - File extensions: .docx
+ - Special handling: Extracts paragraphs and tables (converts tables to CSV)
+ - Dependencies: python-docx
+ """
+
def __init__(self):
self._loaded = False
self._haveLibs = False
@@ -24,6 +34,14 @@ class DocxExtractor(Extractor):
def detect(self, fileName: str, mimeType: str, headBytes: bytes) -> bool:
return mimeType == "application/vnd.openxmlformats-officedocument.wordprocessingml.document" or (fileName or "").lower().endswith(".docx")
+
+ def getSupportedExtensions(self) -> list[str]:
+ """Return list of supported file extensions."""
+ return [".docx"]
+
+ def getSupportedMimeTypes(self) -> list[str]:
+ """Return list of supported MIME types."""
+ return ["application/vnd.openxmlformats-officedocument.wordprocessingml.document"]
def extract(self, fileBytes: bytes, context: Dict[str, Any]) -> List[ContentPart]:
self._load()
diff --git a/modules/services/serviceExtraction/extractors/extractorHtml.py b/modules/services/serviceExtraction/extractors/extractorHtml.py
index 09da02f4..730df49c 100644
--- a/modules/services/serviceExtraction/extractors/extractorHtml.py
+++ b/modules/services/serviceExtraction/extractors/extractorHtml.py
@@ -7,8 +7,26 @@ from ..subRegistry import Extractor
class HtmlExtractor(Extractor):
+ """
+ Extractor for HTML files.
+
+ Supported formats:
+ - MIME types: text/html
+ - File extensions: .html, .htm
+ - Special handling: Uses BeautifulSoup for parsing
+ - Dependencies: beautifulsoup4
+ """
+
def detect(self, fileName: str, mimeType: str, headBytes: bytes) -> bool:
return mimeType == "text/html" or (fileName or "").lower().endswith((".html", ".htm"))
+
+ def getSupportedExtensions(self) -> list[str]:
+ """Return list of supported file extensions."""
+ return [".html", ".htm"]
+
+ def getSupportedMimeTypes(self) -> list[str]:
+ """Return list of supported MIME types."""
+ return ["text/html"]
def extract(self, fileBytes: bytes, context: Dict[str, Any]) -> List[ContentPart]:
mimeType = context.get("mimeType") or "text/html"
diff --git a/modules/services/serviceExtraction/extractors/extractorImage.py b/modules/services/serviceExtraction/extractors/extractorImage.py
index 3f94459c..578e0148 100644
--- a/modules/services/serviceExtraction/extractors/extractorImage.py
+++ b/modules/services/serviceExtraction/extractors/extractorImage.py
@@ -10,8 +10,26 @@ logger = logging.getLogger(__name__)
class ImageExtractor(Extractor):
+ """
+ Extractor for image files.
+
+ Supported formats:
+ - MIME types: image/jpeg, image/png, image/gif, image/webp, image/bmp, image/tiff
+ - File extensions: .jpg, .jpeg, .png, .gif, .webp, .bmp, .tiff
+ - Special handling: GIF files are converted to PNG during extraction
+ """
+
def detect(self, fileName: str, mimeType: str, headBytes: bytes) -> bool:
- return (mimeType or "").startswith("image/")
+ return ((mimeType or "").startswith("image/") or
+ (fileName or "").lower().endswith((".jpg", ".jpeg", ".png", ".gif", ".webp", ".bmp", ".tiff")))
+
+ def getSupportedExtensions(self) -> list[str]:
+ """Return list of supported file extensions."""
+ return [".jpg", ".jpeg", ".png", ".gif", ".webp", ".bmp", ".tiff"]
+
+ def getSupportedMimeTypes(self) -> list[str]:
+ """Return list of supported MIME types."""
+ return ["image/jpeg", "image/png", "image/gif", "image/webp", "image/bmp", "image/tiff"]
def extract(self, fileBytes: bytes, context: Dict[str, Any]) -> List[ContentPart]:
mimeType = context.get("mimeType") or "image/unknown"
diff --git a/modules/services/serviceExtraction/extractors/extractorJson.py b/modules/services/serviceExtraction/extractors/extractorJson.py
index 86eac791..04ab1c10 100644
--- a/modules/services/serviceExtraction/extractors/extractorJson.py
+++ b/modules/services/serviceExtraction/extractors/extractorJson.py
@@ -7,8 +7,25 @@ from ..subRegistry import Extractor
class JsonExtractor(Extractor):
+ """
+ Extractor for JSON files.
+
+ Supported formats:
+ - MIME types: application/json
+ - File extensions: .json
+ - Special handling: Validates JSON format, falls back to text if invalid
+ """
+
def detect(self, fileName: str, mimeType: str, headBytes: bytes) -> bool:
return mimeType == "application/json" or (fileName or "").lower().endswith(".json")
+
+ def getSupportedExtensions(self) -> list[str]:
+ """Return list of supported file extensions."""
+ return [".json"]
+
+ def getSupportedMimeTypes(self) -> list[str]:
+ """Return list of supported MIME types."""
+ return ["application/json"]
def extract(self, fileBytes: bytes, context: Dict[str, Any]) -> List[ContentPart]:
mimeType = context.get("mimeType") or "application/json"
diff --git a/modules/services/serviceExtraction/extractors/extractorPdf.py b/modules/services/serviceExtraction/extractors/extractorPdf.py
index 59c88dc7..4f0290ec 100644
--- a/modules/services/serviceExtraction/extractors/extractorPdf.py
+++ b/modules/services/serviceExtraction/extractors/extractorPdf.py
@@ -8,6 +8,16 @@ from ..subRegistry import Extractor
class PdfExtractor(Extractor):
+ """
+ Extractor for PDF files.
+
+ Supported formats:
+ - MIME types: application/pdf
+ - File extensions: .pdf
+ - Special handling: Extracts text per page and embedded images
+ - Dependencies: PyPDF2, PyMuPDF (fitz)
+ """
+
def __init__(self):
self._loaded = False
self._haveLibs = False
@@ -26,6 +36,14 @@ class PdfExtractor(Extractor):
def detect(self, fileName: str, mimeType: str, headBytes: bytes) -> bool:
return mimeType == "application/pdf" or (fileName or "").lower().endswith(".pdf")
+
+ def getSupportedExtensions(self) -> list[str]:
+ """Return list of supported file extensions."""
+ return [".pdf"]
+
+ def getSupportedMimeTypes(self) -> list[str]:
+ """Return list of supported MIME types."""
+ return ["application/pdf"]
def extract(self, fileBytes: bytes, context: Dict[str, Any]) -> List[ContentPart]:
self._load()
diff --git a/modules/services/serviceExtraction/extractors/extractorPptx.py b/modules/services/serviceExtraction/extractors/extractorPptx.py
index 096b7925..1a5a7ff8 100644
--- a/modules/services/serviceExtraction/extractors/extractorPptx.py
+++ b/modules/services/serviceExtraction/extractors/extractorPptx.py
@@ -8,7 +8,15 @@ logger = logging.getLogger(__name__)
class PptxExtractor(Extractor):
- """Extractor for PowerPoint (.pptx) files using python-pptx library."""
+ """
+ Extractor for PowerPoint files.
+
+ Supported formats:
+ - MIME types: application/vnd.openxmlformats-officedocument.presentationml.presentation, application/vnd.ms-powerpoint
+ - File extensions: .pptx, .ppt
+ - Special handling: Extracts slide content, tables, and images
+ - Dependencies: python-pptx
+ """
def __init__(self):
self._loaded = False
@@ -31,6 +39,17 @@ class PptxExtractor(Extractor):
"application/vnd.ms-powerpoint"
]) or (fileName or "").lower().endswith((".pptx", ".ppt"))
+ def getSupportedExtensions(self) -> list[str]:
+ """Return list of supported file extensions."""
+ return [".pptx", ".ppt"]
+
+ def getSupportedMimeTypes(self) -> list[str]:
+ """Return list of supported MIME types."""
+ return [
+ "application/vnd.openxmlformats-officedocument.presentationml.presentation",
+ "application/vnd.ms-powerpoint"
+ ]
+
def extract(self, fileBytes: bytes, context: Dict[str, Any]) -> List[ContentPart]:
"""
Extract content from PowerPoint files.
diff --git a/modules/services/serviceExtraction/extractors/extractorSql.py b/modules/services/serviceExtraction/extractors/extractorSql.py
new file mode 100644
index 00000000..c751d7ca
--- /dev/null
+++ b/modules/services/serviceExtraction/extractors/extractorSql.py
@@ -0,0 +1,56 @@
+from typing import Any, Dict, List
+
+from modules.datamodels.datamodelExtraction import ContentPart
+from ..subUtils import makeId
+from ..subRegistry import Extractor
+
+
+class SqlExtractor(Extractor):
+ """
+ Extractor for SQL files.
+
+ Supported formats:
+ - MIME types: text/x-sql, application/sql
+ - File extensions: .sql, .ddl, .dml, .dcl, .tcl
+ - Special handling: Treats as structured text with SQL syntax
+ """
+
+ def detect(self, fileName: str, mimeType: str, headBytes: bytes) -> bool:
+ return (mimeType in ("text/x-sql", "application/sql") or
+ (fileName or "").lower().endswith((".sql", ".ddl", ".dml", ".dcl", ".tcl")))
+
+ def getSupportedExtensions(self) -> list[str]:
+ """Return list of supported file extensions."""
+ return [".sql", ".ddl", ".dml", ".dcl", ".tcl"]
+
+ def getSupportedMimeTypes(self) -> list[str]:
+ """Return list of supported MIME types."""
+ return ["text/x-sql", "application/sql"]
+
+ def extract(self, fileBytes: bytes, context: Dict[str, Any]) -> List[ContentPart]:
+ fileName = context.get("fileName")
+ mimeType = context.get("mimeType") or "text/x-sql"
+ data = fileBytes.decode("utf-8", errors="replace")
+
+ # Add SQL-specific metadata
+ metadata = {
+ "size": len(fileBytes),
+ "file_type": "sql",
+ "line_count": len(data.splitlines()),
+ "has_select": "SELECT" in data.upper(),
+ "has_insert": "INSERT" in data.upper(),
+ "has_update": "UPDATE" in data.upper(),
+ "has_delete": "DELETE" in data.upper(),
+ "has_create": "CREATE" in data.upper(),
+ "has_drop": "DROP" in data.upper()
+ }
+
+ return [ContentPart(
+ id=makeId(),
+ parentId=None,
+ label="main",
+ typeGroup="structure",
+ mimeType=mimeType,
+ data=data,
+ metadata=metadata
+ )]
diff --git a/modules/services/serviceExtraction/extractors/extractorText.py b/modules/services/serviceExtraction/extractors/extractorText.py
index a6d92bc1..3cd0ebdf 100644
--- a/modules/services/serviceExtraction/extractors/extractorText.py
+++ b/modules/services/serviceExtraction/extractors/extractorText.py
@@ -6,8 +6,85 @@ from ..subRegistry import Extractor
class TextExtractor(Extractor):
+ """
+ Extractor for plain text files and code files.
+
+ Supported formats:
+ - MIME types: any text/* type is accepted by detect(); getSupportedMimeTypes() lists the explicitly declared types (text/plain, text/markdown, text/x-python, text/x-java-source, text/javascript, ...)
+ - File extensions: .txt, .md, .log, source code (.py, .java, .js, .ts, ...), configuration (.ini, .yaml, .toml, ...), data (.csv, .tsv, ...), man-page sections (.1-.9) and more; getSupportedExtensions() holds the authoritative list
+ """
+
def detect(self, fileName: str, mimeType: str, headBytes: bytes) -> bool:
- return mimeType in ("text/plain", "text/markdown")
+ # Check MIME types
+ if mimeType and mimeType.startswith("text/"):
+ return True
+
+ # Check file extensions
+ if fileName:
+ ext = fileName.lower()
+ return ext.endswith((
+ # Basic text files
+ ".txt", ".md", ".log", ".rtf", ".tex", ".rst", ".adoc", ".org", ".pod",
+ # Programming languages
+ ".java", ".js", ".jsx", ".ts", ".tsx", ".py", ".rb", ".go", ".rs", ".cpp", ".c", ".h", ".hpp", ".cc", ".cxx",
+ ".cs", ".php", ".swift", ".kt", ".scala", ".clj", ".hs", ".ml", ".fs", ".vb", ".dart", ".r", ".m", ".pl", ".sh",
+ # Web technologies
+ ".html", ".htm", ".css", ".scss", ".sass", ".less", ".vue", ".svelte",
+ # Configuration files
+ ".config", ".ini", ".cfg", ".conf", ".properties", ".yaml", ".yml", ".toml", ".json", ".xml",
+ # Scripts and automation
+ ".bat", ".ps1", ".psm1", ".psd1", ".vbs", ".wsf", ".cmd", ".com",
+ # Data files
+ ".csv", ".tsv", ".tab", ".dat", ".data",
+ # Documentation
+ ".man", ".1", ".2", ".3", ".4", ".5", ".6", ".7", ".8", ".9", ".n", ".l", ".m", ".r", ".t", ".x", ".y", ".z",
+ # Other text formats
+ ".diff", ".patch", ".gitignore", ".dockerignore", ".editorconfig", ".gitattributes",
+ ".env", ".env.local", ".env.development", ".env.production", ".env.test",
+ ".lock", ".lockb", ".lockfile", ".pkg-lock", ".yarn-lock"
+ ))
+
+ return False
+
+ def getSupportedExtensions(self) -> list[str]:
+ """Return list of supported file extensions."""
+ return [
+ # Basic text files
+ ".txt", ".md", ".log", ".rtf", ".tex", ".rst", ".adoc", ".org", ".pod",
+ # Programming languages
+ ".java", ".js", ".jsx", ".ts", ".tsx", ".py", ".rb", ".go", ".rs", ".cpp", ".c", ".h", ".hpp", ".cc", ".cxx",
+ ".cs", ".php", ".swift", ".kt", ".scala", ".clj", ".hs", ".ml", ".fs", ".vb", ".dart", ".r", ".m", ".pl", ".sh",
+ # Web technologies
+ ".html", ".htm", ".css", ".scss", ".sass", ".less", ".vue", ".svelte",
+ # Configuration files
+ ".config", ".ini", ".cfg", ".conf", ".properties", ".yaml", ".yml", ".toml", ".json", ".xml",
+ # Scripts and automation
+ ".bat", ".ps1", ".psm1", ".psd1", ".vbs", ".wsf", ".cmd", ".com",
+ # Data files
+ ".csv", ".tsv", ".tab", ".dat", ".data",
+ # Documentation
+ ".man", ".1", ".2", ".3", ".4", ".5", ".6", ".7", ".8", ".9", ".n", ".l", ".m", ".r", ".t", ".x", ".y", ".z",
+ # Other text formats
+ ".diff", ".patch", ".gitignore", ".dockerignore", ".editorconfig", ".gitattributes",
+ ".env", ".env.local", ".env.development", ".env.production", ".env.test",
+ ".lock", ".lockb", ".lockfile", ".pkg-lock", ".yarn-lock"
+ ]
+
+ def getSupportedMimeTypes(self) -> list[str]:
+ """Return list of supported MIME types."""
+ return [
+ "text/plain", "text/markdown", "text/x-python", "text/x-java-source",
+ "text/javascript", "text/x-javascript", "text/typescript", "text/x-typescript",
+ "text/x-c", "text/x-c++", "text/x-csharp", "text/x-php", "text/x-ruby",
+ "text/x-go", "text/x-rust", "text/x-scala", "text/x-swift", "text/x-kotlin",
+ "text/x-sql", "text/x-sh", "text/x-shellscript", "text/x-yaml", "text/x-toml",
+ "text/x-ini", "text/x-config", "text/x-properties", "text/x-log",
+ "text/html", "text/css", "text/x-scss", "text/x-sass", "text/x-less",
+ "text/xml", "text/csv", "text/tab-separated-values", "text/rtf",
+ "text/x-tex", "text/x-rst", "text/x-asciidoc", "text/x-org",
+ "application/x-yaml", "application/x-toml", "application/x-ini",
+ "application/x-config", "application/x-properties", "application/x-log"
+ ]
def extract(self, fileBytes: bytes, context: Dict[str, Any]) -> List[ContentPart]:
fileName = context.get("fileName")
diff --git a/modules/services/serviceExtraction/extractors/extractorXlsx.py b/modules/services/serviceExtraction/extractors/extractorXlsx.py
index ea6396a2..af346419 100644
--- a/modules/services/serviceExtraction/extractors/extractorXlsx.py
+++ b/modules/services/serviceExtraction/extractors/extractorXlsx.py
@@ -8,6 +8,16 @@ from ..subRegistry import Extractor
class XlsxExtractor(Extractor):
+ """
+ Extractor for Microsoft Excel spreadsheets.
+
+ Supported formats:
+ - MIME types: application/vnd.openxmlformats-officedocument.spreadsheetml.sheet
+ - File extensions: .xlsx, .xlsm
+ - Special handling: Extracts all sheets as CSV data
+ - Dependencies: openpyxl
+ """
+
def __init__(self):
self._loaded = False
self._haveLibs = False
@@ -26,6 +36,14 @@ class XlsxExtractor(Extractor):
def detect(self, fileName: str, mimeType: str, headBytes: bytes) -> bool:
mt = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
return mimeType == mt or (fileName or "").lower().endswith((".xlsx", ".xlsm"))
+
+ def getSupportedExtensions(self) -> list[str]:
+ """Return list of supported file extensions."""
+ return [".xlsx", ".xlsm"]
+
+ def getSupportedMimeTypes(self) -> list[str]:
+ """Return list of supported MIME types."""
+ return ["application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"]
def extract(self, fileBytes: bytes, context: Dict[str, Any]) -> List[ContentPart]:
self._load()
diff --git a/modules/services/serviceExtraction/extractors/extractorXml.py b/modules/services/serviceExtraction/extractors/extractorXml.py
index 5aabea35..c7d034ad 100644
--- a/modules/services/serviceExtraction/extractors/extractorXml.py
+++ b/modules/services/serviceExtraction/extractors/extractorXml.py
@@ -7,8 +7,25 @@ from ..subRegistry import Extractor
class XmlExtractor(Extractor):
+ """
+ Extractor for XML files.
+
+ Supported formats:
+ - MIME types: application/xml
+ - File extensions: .xml, .rss, .atom
+ - Special handling: Uses ElementTree for parsing
+ """
+
def detect(self, fileName: str, mimeType: str, headBytes: bytes) -> bool:
return mimeType == "application/xml" or (fileName or "").lower().endswith((".xml", ".rss", ".atom"))
+
+ def getSupportedExtensions(self) -> list[str]:
+ """Return list of supported file extensions."""
+ return [".xml", ".rss", ".atom"]
+
+ def getSupportedMimeTypes(self) -> list[str]:
+ """Return list of supported MIME types."""
+ return ["application/xml"]
def extract(self, fileBytes: bytes, context: Dict[str, Any]) -> List[ContentPart]:
mimeType = context.get("mimeType") or "application/xml"
diff --git a/modules/services/serviceExtraction/subRegistry.py b/modules/services/serviceExtraction/subRegistry.py
index ae994bbf..eb2ece4d 100644
--- a/modules/services/serviceExtraction/subRegistry.py
+++ b/modules/services/serviceExtraction/subRegistry.py
@@ -7,11 +7,31 @@ logger = logging.getLogger(__name__)
class Extractor:
+ """
+ Base class for all document extractors.
+
+ Each extractor should implement:
+ - detect(): Check if this extractor can handle the given file
+ - extract(): Extract content from the file
+ - getSupportedExtensions(): Return supported file extensions
+ - getSupportedMimeTypes(): Return supported MIME types
+ """
+
def detect(self, fileName: str, mimeType: str, headBytes: bytes) -> bool:
+ """Check if this extractor can handle the given file."""
return False
def extract(self, fileBytes: bytes, context: Dict[str, Any]) -> list[ContentPart]:
+ """Extract content from the file bytes."""
raise NotImplementedError
+
+ def getSupportedExtensions(self) -> list[str]:
+ """Return list of supported file extensions (including dots)."""
+ return []
+
+ def getSupportedMimeTypes(self) -> list[str]:
+ """Return list of supported MIME types."""
+ return []
class Chunker:
@@ -23,55 +43,85 @@ class ExtractorRegistry:
def __init__(self):
self._map: Dict[str, Extractor] = {}
self._fallback: Optional[Extractor] = None
- # Register built-ins
+ self._auto_discover_extractors()
+
+ def _auto_discover_extractors(self):
+ """Auto-discover and register all extractors from the extractors directory."""
try:
- from .extractors.extractorText import TextExtractor
- from .extractors.extractorCsv import CsvExtractor
- from .extractors.extractorJson import JsonExtractor
- from .extractors.extractorXml import XmlExtractor
- from .extractors.extractorHtml import HtmlExtractor
- from .extractors.extractorPdf import PdfExtractor
- from .extractors.extractorDocx import DocxExtractor
- from .extractors.extractorXlsx import XlsxExtractor
- from .extractors.extractorPptx import PptxExtractor
- from .extractors.extractorImage import ImageExtractor
- from .extractors.extractorBinary import BinaryExtractor
- self.register("text/plain", TextExtractor())
- self.register("text/markdown", TextExtractor())
- self.register("text/csv", CsvExtractor())
- self.register("application/json", JsonExtractor())
- self.register("application/xml", XmlExtractor())
- self.register("text/html", HtmlExtractor())
- self.register("application/pdf", PdfExtractor())
- self.register("application/vnd.openxmlformats-officedocument.wordprocessingml.document", DocxExtractor())
- self.register("application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", XlsxExtractor())
- self.register("application/vnd.openxmlformats-officedocument.presentationml.presentation", PptxExtractor())
- self.register("application/vnd.ms-powerpoint", PptxExtractor())
- # images
- self.register("image/jpeg", ImageExtractor())
- self.register("image/png", ImageExtractor())
- self.register("image/gif", ImageExtractor())
- # extension fallbacks
- self.register("txt", TextExtractor())
- self.register("md", TextExtractor())
- self.register("csv", CsvExtractor())
- self.register("json", JsonExtractor())
- self.register("xml", XmlExtractor())
- self.register("html", HtmlExtractor())
- self.register("htm", HtmlExtractor())
- self.register("pdf", PdfExtractor())
- self.register("docx", DocxExtractor())
- self.register("xlsx", XlsxExtractor())
- self.register("xlsm", XlsxExtractor())
- self.register("pptx", PptxExtractor())
- self.register("ppt", PptxExtractor())
- # fallback
- self.setFallback(BinaryExtractor())
- logger.info(f"ExtractorRegistry: Successfully registered {len(self._map)} extractors")
+ import os
+ import importlib
+ from pathlib import Path
+
+ # Get the extractors directory
+ current_dir = Path(__file__).parent
+ extractors_dir = current_dir / "extractors"
+
+ if not extractors_dir.exists():
+ logger.error(f"Extractors directory not found: {extractors_dir}")
+ return
+
+ # Import all extractor modules
+ extractor_modules = []
+ for file_path in extractors_dir.glob("extractor*.py"):
+ if file_path.name == "__init__.py":
+ continue
+
+ module_name = file_path.stem
+ try:
+ # Import the module
+ module = importlib.import_module(f".{module_name}", package="modules.services.serviceExtraction.extractors")
+
+ # Find all extractor classes in the module
+ for attr_name in dir(module):
+ attr = getattr(module, attr_name)
+ if (isinstance(attr, type) and
+ issubclass(attr, Extractor) and
+ attr != Extractor and
+ not attr_name.startswith('_')):
+
+ # Create instance and auto-register
+ extractor_instance = attr()
+ self._auto_register_extractor(extractor_instance)
+ extractor_modules.append(attr_name)
+
+ except Exception as e:
+ logger.warning(f"Failed to import {module_name}: {str(e)}")
+ continue
+
+ # Set fallback extractor
+ try:
+ from .extractors.extractorBinary import BinaryExtractor
+ self.setFallback(BinaryExtractor())
+ except Exception as e:
+ logger.warning(f"Failed to set fallback extractor: {str(e)}")
+
+ logger.info(f"ExtractorRegistry: Auto-discovered and registered {len(extractor_modules)} extractor classes: {', '.join(extractor_modules)}")
+ logger.info(f"ExtractorRegistry: Total registered formats: {len(self._map)}")
+
except Exception as e:
- logger.error(f"ExtractorRegistry: Failed to register extractors: {str(e)}")
+ logger.error(f"ExtractorRegistry: Failed to auto-discover extractors: {str(e)}")
import traceback
traceback.print_exc()
+
+ def _auto_register_extractor(self, extractor: Extractor):
+ """Auto-register an extractor based on its declared supported formats."""
+ try:
+ # Register MIME types
+ mime_types = extractor.getSupportedMimeTypes()
+ for mime_type in mime_types:
+ self.register(mime_type, extractor)
+ logger.debug(f"Registered MIME type: {mime_type} → {extractor.__class__.__name__}")
+
+ # Register file extensions
+ extensions = extractor.getSupportedExtensions()
+ for ext in extensions:
+ # Remove leading dot for registry key
+ ext_key = ext.lstrip('.')
+ self.register(ext_key, extractor)
+ logger.debug(f"Registered extension: .{ext_key} → {extractor.__class__.__name__}")
+
+ except Exception as e:
+ logger.error(f"Failed to auto-register {extractor.__class__.__name__}: {str(e)}")
def register(self, key: str, extractor: Extractor):
self._map[key] = extractor
@@ -88,6 +138,43 @@ class ExtractorRegistry:
if ext in self._map:
return self._map[ext]
return self._fallback
+
+ def getAllSupportedFormats(self) -> Dict[str, Dict[str, list[str]]]:
+ """
+ Get all supported formats from all registered extractors.
+
+ Returns:
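+ Dictionary with format information, keyed by registry key (e.g. a MIME type or an extension without its leading dot) rather than by extractor class name; keys whose extractor declares an empty list are omitted, and a "fallback" key is added when a fallback extractor is set:
+ {
+ "extensions": {
+ "registry_key": [".ext1", ".ext2", ...]
+ },
+ "mime_types": {
+ "registry_key": ["mime/type1", "mime/type2", ...]
+ }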
+ }
+ """
+ formats = {"extensions": {}, "mime_types": {}}
+
+ # Get formats from registered extractors
+ for key, extractor in self._map.items():
+ if hasattr(extractor, 'getSupportedExtensions'):
+ extensions = extractor.getSupportedExtensions()
+ if extensions:
+ formats["extensions"][key] = extensions
+
+ if hasattr(extractor, 'getSupportedMimeTypes'):
+ mime_types = extractor.getSupportedMimeTypes()
+ if mime_types:
+ formats["mime_types"][key] = mime_types
+
+ # Add fallback extractor info
+ if self._fallback and hasattr(self._fallback, 'getSupportedExtensions'):
+ formats["extensions"]["fallback"] = self._fallback.getSupportedExtensions()
+ if self._fallback and hasattr(self._fallback, 'getSupportedMimeTypes'):
+ formats["mime_types"]["fallback"] = self._fallback.getSupportedMimeTypes()
+
+ return formats
class ChunkerRegistry:
diff --git a/test_document_processing.py b/test_document_processing.py
index 777b0ddf..41e3a3a2 100644
--- a/test_document_processing.py
+++ b/test_document_processing.py
@@ -51,7 +51,24 @@ async def process_documents_and_generate_summary():
return False
# Find all supported document files
- supported_extensions = ["*.pdf", "*.jpg", "*.jpeg", "*.png", "*.gif", "*.docx", "*.xlsx", "*.pptx", "*.ppt", "*.txt", "*.md", "*.html", "*.csv"]
+ supported_extensions = [
+ # Document formats
+ "*.pdf", "*.docx", "*.xlsx", "*.pptx", "*.ppt",
+ # Image formats
+ "*.jpg", "*.jpeg", "*.png", "*.gif", "*.webp", "*.bmp", "*.tiff",
+ # Text and code files
+ "*.txt", "*.md", "*.log", "*.rtf", "*.tex", "*.rst", "*.adoc", "*.org", "*.pod",
+ "*.java", "*.js", "*.jsx", "*.ts", "*.tsx", "*.py", "*.rb", "*.go", "*.rs", "*.cpp", "*.c", "*.h", "*.hpp", "*.cc", "*.cxx",
+ "*.cs", "*.php", "*.swift", "*.kt", "*.scala", "*.clj", "*.hs", "*.ml", "*.fs", "*.vb", "*.dart", "*.r", "*.m", "*.pl", "*.sh",
+ "*.html", "*.htm", "*.css", "*.scss", "*.sass", "*.less", "*.vue", "*.svelte",
+ "*.config", "*.ini", "*.cfg", "*.conf", "*.properties", "*.yaml", "*.yml", "*.toml", "*.json", "*.xml",
+ "*.bat", "*.ps1", "*.psm1", "*.psd1", "*.vbs", "*.wsf", "*.cmd", "*.com",
+ "*.csv", "*.tsv", "*.tab", "*.dat", "*.data",
+ "*.man", "*.1", "*.2", "*.3", "*.4", "*.5", "*.6", "*.7", "*.8", "*.9", "*.n", "*.l", "*.m", "*.r", "*.t", "*.x", "*.y", "*.z",
+ "*.diff", "*.patch", "*.gitignore", "*.dockerignore", "*.editorconfig", "*.gitattributes",
+ "*.env", "*.env.local", "*.env.development", "*.env.production", "*.env.test",
+ "*.lock", "*.lockb", "*.lockfile", "*.pkg-lock", "*.yarn-lock"
+ ]
document_files = []
for ext in supported_extensions:
document_files.extend(list(testdata_path.glob(ext)))
@@ -164,6 +181,8 @@ async def process_documents_and_generate_summary():
mime_type = "text/html"
elif doc_file.suffix.lower() == '.csv':
mime_type = "text/csv"
+ elif doc_file.suffix.lower() == '.json':
+ mime_type = "application/json"
elif doc_file.suffix.lower() in ['.txt', '.md']:
mime_type = "text/plain"
@@ -199,7 +218,7 @@ async def process_documents_and_generate_summary():
# Run a single end-to-end test to avoid the loop issue
logger.info("🧪 Running single end-to-end test...")
- userPrompt = "Analyze these documents and create a comprehensive summary for all input documents, each input document in a separate chapter summarized in 10-20 sentences."
+ userPrompt = "Analyze the document containing mails for customer use cases. Can you create one file for each email in plain text format?"
# userPrompt = "Analyze these documents and create a fitting image for the content"
@@ -215,8 +234,8 @@ async def process_documents_and_generate_summary():
prompt=userPrompt,
documents=documents,
options=ai_options,
- outputFormat="docx",
- title="Formulaire"
+ outputFormat="txt",
+ title="Kunden und Use Cases"
)
logger.info(f"✅ End-to-end test completed successfully")
diff --git a/test_extractor_formats.py b/test_extractor_formats.py
new file mode 100644
index 00000000..201622ff
--- /dev/null
+++ b/test_extractor_formats.py
@@ -0,0 +1,117 @@
+#!/usr/bin/env python3
+"""
+Test script to demonstrate enhanced extractor format support.
+Shows all supported file extensions and MIME types for each extractor.
+"""
+
+import sys
+import os
+from pathlib import Path
+
+# Add the gateway module to the path
+sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'modules'))
+
+from modules.services.serviceExtraction.subRegistry import ExtractorRegistry
+
+def test_extractor_formats():
+ """Test and display all supported formats from extractors."""
+ print("๐ Testing Plug-and-Play Extractor System")
+ print("=" * 60)
+
+ # Create registry
+ registry = ExtractorRegistry()
+
+ # Get all supported formats
+ formats = registry.getAllSupportedFormats()
+
+ print("\n๐ Supported File Extensions by Extractor:")
+ print("-" * 50)
+ for extractor_name, extensions in formats["extensions"].items():
+ if extensions:
+ print(f" {extractor_name:20} → {', '.join(extensions)}")
+ else:
+ print(f" {extractor_name:20} → (all extensions - fallback)")
+
+ print("\n๐ Supported MIME Types by Extractor:")
+ print("-" * 50)
+ for extractor_name, mime_types in formats["mime_types"].items():
+ if mime_types:
+ print(f" {extractor_name:20} → {', '.join(mime_types)}")
+ else:
+ print(f" {extractor_name:20} → (all MIME types - fallback)")
+
+ # Test individual extractors
+ print("\n🧪 Testing Individual Extractors:")
+ print("-" * 50)
+
+ # Get all registered extractors
+ for key, extractor in registry._map.items():
+ if hasattr(extractor, 'getSupportedExtensions') and hasattr(extractor, 'getSupportedMimeTypes'):
+ extensions = extractor.getSupportedExtensions()
+ mime_types = extractor.getSupportedMimeTypes()
+ print(f"\n {extractor.__class__.__name__}:")
+ print(f" Extensions: {extensions}")
+ print(f" MIME Types: {mime_types}")
+
+ # Test detection with various file types
+ print("\n๐ฌ Testing File Detection:")
+ print("-" * 50)
+
+ test_files = [
+ # Document formats
+ ("document.pdf", "application/pdf"),
+ ("spreadsheet.xlsx", "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"),
+ ("presentation.pptx", "application/vnd.openxmlformats-officedocument.presentationml.presentation"),
+ ("document.docx", "application/vnd.openxmlformats-officedocument.wordprocessingml.document"),
+
+ # Text and code files
+ ("readme.txt", "text/plain"),
+ ("readme.md", "text/markdown"),
+ ("app.log", "text/plain"),
+ ("Main.java", "text/x-java-source"),
+ ("script.js", "text/javascript"),
+ ("component.tsx", "text/typescript"),
+ ("main.py", "text/x-python"),
+ ("config.yaml", "text/x-yaml"),
+ ("package.json", "application/json"),
+ ("data.csv", "text/csv"),
+ ("config.xml", "application/xml"),
+ ("webpage.html", "text/html"),
+ ("styles.css", "text/css"),
+ ("script.sh", "text/x-sh"),
+ ("Dockerfile", "text/plain"),
+ (".gitignore", "text/plain"),
+ ("app.config", "text/plain"),
+ ("database.sql", "text/x-sql"),
+ ("schema.ddl", "application/sql"),
+
+ # Images
+ ("image.png", "image/png"),
+ ("photo.jpg", "image/jpeg"),
+
+ # Unknown
+ ("unknown.xyz", "application/octet-stream")
+ ]
+
+ for filename, mime_type in test_files:
+ extractor = registry.resolve(mime_type, filename)
+ if extractor:
+ print(f" {filename:25} ({mime_type:50}) → {extractor.__class__.__name__}")
+ else:
+ print(f" {filename:25} ({mime_type:50}) → No extractor found")
+
+ print("\n✅ Plug-and-Play extractor system test completed!")
+ print("\nKey improvements:")
+ print(" โข ๐ TRUE PLUG-AND-PLAY: Just add extractor file, it auto-registers!")
+ print(" โข ๐ No more manual registration of file types")
+ print(" โข ๐ Auto-discovery scans extractors directory")
+ print(" โข ๐ Each extractor declares its own supported formats")
+ print(" โข ๐ Easy to add new file types - just create new extractor")
+ print(" โข ๐งน Clean, maintainable code with no redundancy")
+ print("\nTo add a new file type:")
+ print(" 1. Create extractorXyz.py in extractors/ directory")
+ print(" 2. Implement Extractor interface with getSupportedExtensions()")
+ print(" 3. That's it! No registry changes needed!")
+
+if __name__ == "__main__":
+ test_extractor_formats()