AI system tested for all file types

This commit is contained in:
ValueOn AG 2025-10-13 22:03:28 +02:00
parent 2e471ca3f7
commit 0c357dc8a9
15 changed files with 588 additions and 52 deletions

View file

@ -7,8 +7,28 @@ from ..subRegistry import Extractor
class BinaryExtractor(Extractor):
"""
Fallback extractor for unsupported file types.
This extractor handles any file type that doesn't match other extractors.
It encodes the file as base64 and marks it as binary data.
Supported formats:
- All file types (fallback)
- MIME types: application/octet-stream (default)
- File extensions: All (fallback)
"""
def detect(self, fileName: str, mimeType: str, headBytes: bytes) -> bool:
    """Accept every file: this extractor is the catch-all fallback.

    None of the arguments are inspected.
    """
    accepts_anything = True
    return accepts_anything
def getSupportedExtensions(self) -> list[str]:
    """Return the declared extensions: none, because the fallback accepts all."""
    # An empty list means "no specific extension is claimed".
    declared: list[str] = []
    return declared
def getSupportedMimeTypes(self) -> list[str]:
    """Return the declared MIME types: none, because the fallback accepts all."""
    # An empty list means "no specific MIME type is claimed".
    declared: list[str] = []
    return declared
def extract(self, fileBytes: bytes, context: Dict[str, Any]) -> List[ContentPart]:
mimeType = context.get("mimeType") or "application/octet-stream"

View file

@ -6,8 +6,25 @@ from ..subRegistry import Extractor
class CsvExtractor(Extractor):
"""
Extractor for CSV files.
Supported formats:
- MIME types: text/csv
- File extensions: .csv
- Special handling: Treats as table data
"""
def detect(self, fileName: str, mimeType: str, headBytes: bytes) -> bool:
    """Return True for the ``text/csv`` MIME type or a ``.csv`` file name.

    The file name comparison is case-insensitive and tolerates a missing
    name; ``headBytes`` is not inspected.
    """
    if mimeType == "text/csv":
        return True
    lowered_name = (fileName or "").lower()
    return lowered_name.endswith(".csv")
def getSupportedExtensions(self) -> list[str]:
    """Return the file extensions (with leading dot) this extractor declares."""
    csv_suffixes = [".csv"]
    return csv_suffixes
def getSupportedMimeTypes(self) -> list[str]:
    """Return the MIME types this extractor declares."""
    csv_mime_types = ["text/csv"]
    return csv_mime_types
def extract(self, fileBytes: bytes, context: Dict[str, Any]) -> List[ContentPart]:
fileName = context.get("fileName")

View file

@ -7,6 +7,16 @@ from ..subRegistry import Extractor
class DocxExtractor(Extractor):
"""
Extractor for Microsoft Word documents.
Supported formats:
- MIME types: application/vnd.openxmlformats-officedocument.wordprocessingml.document
- File extensions: .docx
- Special handling: Extracts paragraphs and tables (converts tables to CSV)
- Dependencies: python-docx
"""
def __init__(self):
self._loaded = False
self._haveLibs = False
@ -24,6 +34,14 @@ class DocxExtractor(Extractor):
def detect(self, fileName: str, mimeType: str, headBytes: bytes) -> bool:
    """Return True for the DOCX MIME type or a ``.docx`` file name.

    The file name comparison is case-insensitive and tolerates a missing
    name; ``headBytes`` is not inspected.
    """
    docx_mime = "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
    if mimeType == docx_mime:
        return True
    lowered_name = (fileName or "").lower()
    return lowered_name.endswith(".docx")
def getSupportedExtensions(self) -> list[str]:
    """Return the file extensions (with leading dot) this extractor declares."""
    word_suffixes = [".docx"]
    return word_suffixes
def getSupportedMimeTypes(self) -> list[str]:
    """Return the MIME types this extractor declares."""
    docx_mime = "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
    return [docx_mime]
def extract(self, fileBytes: bytes, context: Dict[str, Any]) -> List[ContentPart]:
self._load()

View file

@ -7,8 +7,26 @@ from ..subRegistry import Extractor
class HtmlExtractor(Extractor):
"""
Extractor for HTML files.
Supported formats:
- MIME types: text/html
- File extensions: .html, .htm
- Special handling: Uses BeautifulSoup for parsing
- Dependencies: beautifulsoup4
"""
def detect(self, fileName: str, mimeType: str, headBytes: bytes) -> bool:
    """Return True for the ``text/html`` MIME type or a ``.html``/``.htm`` file name.

    The file name comparison is case-insensitive and tolerates a missing
    name; ``headBytes`` is not inspected.
    """
    if mimeType == "text/html":
        return True
    html_suffixes = (".html", ".htm")
    lowered_name = (fileName or "").lower()
    return lowered_name.endswith(html_suffixes)
def getSupportedExtensions(self) -> list[str]:
    """Return the file extensions (with leading dot) this extractor declares."""
    html_suffixes = [".html", ".htm"]
    return html_suffixes
def getSupportedMimeTypes(self) -> list[str]:
    """Return the MIME types this extractor declares."""
    html_mime_types = ["text/html"]
    return html_mime_types
def extract(self, fileBytes: bytes, context: Dict[str, Any]) -> List[ContentPart]:
mimeType = context.get("mimeType") or "text/html"

View file

@ -10,8 +10,26 @@ logger = logging.getLogger(__name__)
class ImageExtractor(Extractor):
"""
Extractor for image files.
Supported formats:
- MIME types: image/jpeg, image/png, image/gif, image/webp, image/bmp, image/tiff
- File extensions: .jpg, .jpeg, .png, .gif, .webp, .bmp, .tiff
- Special handling: GIF files are converted to PNG during extraction
"""
def detect(self, fileName: str, mimeType: str, headBytes: bytes) -> bool:
    """Return True for any ``image/*`` MIME type or a known image file name.

    A leftover early ``return`` previously made the extension check
    unreachable, so images delivered with a generic MIME type were missed.
    The file name comparison is case-insensitive; missing values are
    tolerated; ``headBytes`` is not inspected.
    """
    if (mimeType or "").startswith("image/"):
        return True
    image_suffixes = (".jpg", ".jpeg", ".png", ".gif", ".webp", ".bmp", ".tiff")
    return (fileName or "").lower().endswith(image_suffixes)
def getSupportedExtensions(self) -> list[str]:
    """Return the file extensions (with leading dot) this extractor declares."""
    jpeg_suffixes = [".jpg", ".jpeg"]
    other_suffixes = [".png", ".gif", ".webp", ".bmp", ".tiff"]
    return jpeg_suffixes + other_suffixes
def getSupportedMimeTypes(self) -> list[str]:
    """Return the MIME types this extractor declares."""
    subtypes = ("jpeg", "png", "gif", "webp", "bmp", "tiff")
    return [f"image/{subtype}" for subtype in subtypes]
def extract(self, fileBytes: bytes, context: Dict[str, Any]) -> List[ContentPart]:
mimeType = context.get("mimeType") or "image/unknown"

View file

@ -7,8 +7,25 @@ from ..subRegistry import Extractor
class JsonExtractor(Extractor):
"""
Extractor for JSON files.
Supported formats:
- MIME types: application/json
- File extensions: .json
- Special handling: Validates JSON format, falls back to text if invalid
"""
def detect(self, fileName: str, mimeType: str, headBytes: bytes) -> bool:
    """Return True for the ``application/json`` MIME type or a ``.json`` file name.

    The file name comparison is case-insensitive and tolerates a missing
    name; ``headBytes`` is not inspected.
    """
    if mimeType == "application/json":
        return True
    lowered_name = (fileName or "").lower()
    return lowered_name.endswith(".json")
def getSupportedExtensions(self) -> list[str]:
    """Return the file extensions (with leading dot) this extractor declares."""
    json_suffixes = [".json"]
    return json_suffixes
def getSupportedMimeTypes(self) -> list[str]:
    """Return the MIME types this extractor declares."""
    json_mime_types = ["application/json"]
    return json_mime_types
def extract(self, fileBytes: bytes, context: Dict[str, Any]) -> List[ContentPart]:
mimeType = context.get("mimeType") or "application/json"

View file

@ -8,6 +8,16 @@ from ..subRegistry import Extractor
class PdfExtractor(Extractor):
"""
Extractor for PDF files.
Supported formats:
- MIME types: application/pdf
- File extensions: .pdf
- Special handling: Extracts text per page and embedded images
- Dependencies: PyPDF2, PyMuPDF (fitz)
"""
def __init__(self):
self._loaded = False
self._haveLibs = False
@ -26,6 +36,14 @@ class PdfExtractor(Extractor):
def detect(self, fileName: str, mimeType: str, headBytes: bytes) -> bool:
    """Return True for the ``application/pdf`` MIME type or a ``.pdf`` file name.

    The file name comparison is case-insensitive and tolerates a missing
    name; ``headBytes`` is not inspected.
    """
    if mimeType == "application/pdf":
        return True
    lowered_name = (fileName or "").lower()
    return lowered_name.endswith(".pdf")
def getSupportedExtensions(self) -> list[str]:
    """Return the file extensions (with leading dot) this extractor declares."""
    pdf_suffixes = [".pdf"]
    return pdf_suffixes
def getSupportedMimeTypes(self) -> list[str]:
    """Return the MIME types this extractor declares."""
    pdf_mime_types = ["application/pdf"]
    return pdf_mime_types
def extract(self, fileBytes: bytes, context: Dict[str, Any]) -> List[ContentPart]:
self._load()

View file

@ -8,7 +8,15 @@ logger = logging.getLogger(__name__)
class PptxExtractor(Extractor):
"""Extractor for PowerPoint (.pptx) files using python-pptx library."""
"""
Extractor for PowerPoint files.
Supported formats:
- MIME types: application/vnd.openxmlformats-officedocument.presentationml.presentation, application/vnd.ms-powerpoint
- File extensions: .pptx, .ppt
- Special handling: Extracts slide content, tables, and images
- Dependencies: python-pptx
"""
def __init__(self):
self._loaded = False
@ -31,6 +39,17 @@ class PptxExtractor(Extractor):
"application/vnd.ms-powerpoint"
]) or (fileName or "").lower().endswith((".pptx", ".ppt"))
def getSupportedExtensions(self) -> list[str]:
    """Return the file extensions (with leading dot) this extractor declares."""
    powerpoint_suffixes = [".pptx", ".ppt"]
    return powerpoint_suffixes
def getSupportedMimeTypes(self) -> list[str]:
    """Return the MIME types this extractor declares."""
    pptx_mime = "application/vnd.openxmlformats-officedocument.presentationml.presentation"
    ppt_mime = "application/vnd.ms-powerpoint"
    return [pptx_mime, ppt_mime]
def extract(self, fileBytes: bytes, context: Dict[str, Any]) -> List[ContentPart]:
"""
Extract content from PowerPoint files.

View file

@ -0,0 +1,56 @@
from typing import Any, Dict, List
from modules.datamodels.datamodelExtraction import ContentPart
from ..subUtils import makeId
from ..subRegistry import Extractor
class SqlExtractor(Extractor):
    """
    Extractor for SQL files.

    Supported formats:
    - MIME types: text/x-sql, application/sql
    - File extensions: .sql, .ddl, .dml, .dcl, .tcl
    - Special handling: Treats as structured text with SQL syntax

    The whole file is returned as a single ``ContentPart`` whose metadata
    records the byte size, the line count and which common statement
    keywords occur in the text.
    """

    # Declared once so detect() and the getSupported*() accessors cannot drift apart.
    _MIME_TYPES = ("text/x-sql", "application/sql")
    # NOTE(review): ".tcl" is also the usual extension of Tcl scripts - confirm it belongs here.
    _EXTENSIONS = (".sql", ".ddl", ".dml", ".dcl", ".tcl")
    # (metadata key, keyword) pairs, in the order the flags appear in the metadata.
    _KEYWORD_FLAGS = (
        ("has_select", "SELECT"),
        ("has_insert", "INSERT"),
        ("has_update", "UPDATE"),
        ("has_delete", "DELETE"),
        ("has_create", "CREATE"),
        ("has_drop", "DROP"),
    )

    def detect(self, fileName: str, mimeType: str, headBytes: bytes) -> bool:
        """Return True for a declared SQL MIME type or file extension.

        The file name comparison is case-insensitive and tolerates a missing
        name; ``headBytes`` is not inspected.
        """
        if mimeType in self._MIME_TYPES:
            return True
        return (fileName or "").lower().endswith(self._EXTENSIONS)

    def getSupportedExtensions(self) -> list[str]:
        """Return list of supported file extensions."""
        return list(self._EXTENSIONS)

    def getSupportedMimeTypes(self) -> list[str]:
        """Return list of supported MIME types."""
        return list(self._MIME_TYPES)

    def extract(self, fileBytes: bytes, context: Dict[str, Any]) -> List[ContentPart]:
        """Decode the SQL text and return it as one part with keyword metadata.

        Args:
            fileBytes: Raw file content; decoded as UTF-8, with undecodable
                bytes replaced rather than raising.
            context: Extraction context; only ``"mimeType"`` is read
                (defaults to ``text/x-sql``).

        Returns:
            A single-element list holding the full text as a ``ContentPart``.
        """
        import re  # local import keeps the file's import block untouched

        mimeType = context.get("mimeType") or "text/x-sql"
        data = fileBytes.decode("utf-8", errors="replace")

        # Compare whole identifier-like tokens instead of substrings, so that
        # names such as "updated_at" or "dropdown" do not set has_update /
        # has_drop. Keywords inside comments or string literals still count.
        words = set(re.findall(r"[A-Z_][A-Z0-9_]*", data.upper()))

        metadata = {
            "size": len(fileBytes),
            "file_type": "sql",
            "line_count": len(data.splitlines()),
        }
        for flag, keyword in self._KEYWORD_FLAGS:
            metadata[flag] = keyword in words

        return [ContentPart(
            id=makeId(),
            parentId=None,
            label="main",
            typeGroup="structure",
            mimeType=mimeType,
            data=data,
            metadata=metadata
        )]

View file

@ -6,8 +6,85 @@ from ..subRegistry import Extractor
class TextExtractor(Extractor):
"""
Extractor for plain text files and code files.
Supported formats:
- MIME types: text/plain, text/markdown, text/x-python, text/x-java-source, text/javascript, etc.
- File extensions: .txt, .md, .log, .java, .js, .jsx, .ts, .tsx, .py, .config, .ini, .cfg, .conf, .properties, .yaml, .yml, .toml, .sh, .bat, .ps1, .sql, .css, .scss, .sass, .less, .xml, .json, .csv, .tsv, .rtf, .tex, .rst, .adoc, .org, .pod, .man, .1, .2, .3, .4, .5, .6, .7, .8, .9, .n, .l, .m, .r, .t, .x, .y, .z
"""
def detect(self, fileName: str, mimeType: str, headBytes: bytes) -> bool:
    """Return True for text-like MIME types or known text/code file names.

    A leftover early ``return`` restricted detection to ``text/plain`` and
    ``text/markdown`` and made everything after it unreachable. Detection
    now accepts any ``text/*`` MIME type, any MIME type declared by
    ``getSupportedMimeTypes()`` (which includes a few ``application/*``
    entries), or a file name ending in one of ``getSupportedExtensions()``.
    Both lists are reused so detection cannot drift from the declared
    formats. The file name comparison is case-insensitive; ``headBytes``
    is not inspected.
    """
    if mimeType:
        if mimeType.startswith("text/") or mimeType in self.getSupportedMimeTypes():
            return True
    if not fileName:
        return False
    return fileName.lower().endswith(tuple(self.getSupportedExtensions()))
def getSupportedExtensions(self) -> list[str]:
    """Return list of supported file extensions (each with a leading dot).

    Every extension is listed exactly once; ``.m`` and ``.r`` previously
    appeared under both the programming and the documentation group.
    """
    return [
        # Basic text files
        ".txt", ".md", ".log", ".rtf", ".tex", ".rst", ".adoc", ".org", ".pod",
        # Programming languages
        ".java", ".js", ".jsx", ".ts", ".tsx", ".py", ".rb", ".go", ".rs", ".cpp", ".c", ".h", ".hpp", ".cc", ".cxx",
        ".cs", ".php", ".swift", ".kt", ".scala", ".clj", ".hs", ".ml", ".fs", ".vb", ".dart", ".r", ".m", ".pl", ".sh",
        # Web technologies
        ".html", ".htm", ".css", ".scss", ".sass", ".less", ".vue", ".svelte",
        # Configuration files
        ".config", ".ini", ".cfg", ".conf", ".properties", ".yaml", ".yml", ".toml", ".json", ".xml",
        # Scripts and automation
        ".bat", ".ps1", ".psm1", ".psd1", ".vbs", ".wsf", ".cmd", ".com",
        # Data files
        ".csv", ".tsv", ".tab", ".dat", ".data",
        # Documentation (".m" and ".r" are already listed under programming languages)
        ".man", ".1", ".2", ".3", ".4", ".5", ".6", ".7", ".8", ".9", ".n", ".l", ".t", ".x", ".y", ".z",
        # Other text formats
        ".diff", ".patch", ".gitignore", ".dockerignore", ".editorconfig", ".gitattributes",
        ".env", ".env.local", ".env.development", ".env.production", ".env.test",
        ".lock", ".lockb", ".lockfile", ".pkg-lock", ".yarn-lock",
    ]
def getSupportedMimeTypes(self) -> list[str]:
    """Return the MIME types this extractor declares, grouped by kind."""
    plain = ["text/plain", "text/markdown"]
    source_code = [
        "text/x-python", "text/x-java-source",
        "text/javascript", "text/x-javascript", "text/typescript", "text/x-typescript",
        "text/x-c", "text/x-c++", "text/x-csharp", "text/x-php", "text/x-ruby",
        "text/x-go", "text/x-rust", "text/x-scala", "text/x-swift", "text/x-kotlin",
    ]
    scripts_and_config = [
        "text/x-sql", "text/x-sh", "text/x-shellscript", "text/x-yaml", "text/x-toml",
        "text/x-ini", "text/x-config", "text/x-properties", "text/x-log",
    ]
    web = ["text/html", "text/css", "text/x-scss", "text/x-sass", "text/x-less"]
    data_and_markup = [
        "text/xml", "text/csv", "text/tab-separated-values", "text/rtf",
        "text/x-tex", "text/x-rst", "text/x-asciidoc", "text/x-org",
    ]
    application_variants = [
        "application/x-yaml", "application/x-toml", "application/x-ini",
        "application/x-config", "application/x-properties", "application/x-log",
    ]
    return plain + source_code + scripts_and_config + web + data_and_markup + application_variants
def extract(self, fileBytes: bytes, context: Dict[str, Any]) -> List[ContentPart]:
fileName = context.get("fileName")

View file

@ -8,6 +8,16 @@ from ..subRegistry import Extractor
class XlsxExtractor(Extractor):
"""
Extractor for Microsoft Excel spreadsheets.
Supported formats:
- MIME types: application/vnd.openxmlformats-officedocument.spreadsheetml.sheet
- File extensions: .xlsx, .xlsm
- Special handling: Extracts all sheets as CSV data
- Dependencies: openpyxl
"""
def __init__(self):
self._loaded = False
self._haveLibs = False
@ -26,6 +36,14 @@ class XlsxExtractor(Extractor):
def detect(self, fileName: str, mimeType: str, headBytes: bytes) -> bool:
    """Return True for the XLSX MIME type or a ``.xlsx``/``.xlsm`` file name.

    The file name comparison is case-insensitive and tolerates a missing
    name; ``headBytes`` is not inspected.
    """
    if mimeType == "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet":
        return True
    lowered_name = (fileName or "").lower()
    return lowered_name.endswith((".xlsx", ".xlsm"))
def getSupportedExtensions(self) -> list[str]:
    """Return the file extensions (with leading dot) this extractor declares."""
    excel_suffixes = [".xlsx", ".xlsm"]
    return excel_suffixes
def getSupportedMimeTypes(self) -> list[str]:
    """Return the MIME types this extractor declares."""
    xlsx_mime = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
    return [xlsx_mime]
def extract(self, fileBytes: bytes, context: Dict[str, Any]) -> List[ContentPart]:
self._load()

View file

@ -7,8 +7,25 @@ from ..subRegistry import Extractor
class XmlExtractor(Extractor):
"""
Extractor for XML files.
Supported formats:
- MIME types: application/xml
- File extensions: .xml, .rss, .atom
- Special handling: Uses ElementTree for parsing
"""
def detect(self, fileName: str, mimeType: str, headBytes: bytes) -> bool:
    """Return True for ``application/xml`` or a ``.xml``/``.rss``/``.atom`` file name.

    The file name comparison is case-insensitive and tolerates a missing
    name; ``headBytes`` is not inspected.
    """
    if mimeType == "application/xml":
        return True
    xml_suffixes = (".xml", ".rss", ".atom")
    lowered_name = (fileName or "").lower()
    return lowered_name.endswith(xml_suffixes)
def getSupportedExtensions(self) -> list[str]:
    """Return the file extensions (with leading dot) this extractor declares."""
    xml_suffixes = [".xml", ".rss", ".atom"]
    return xml_suffixes
def getSupportedMimeTypes(self) -> list[str]:
    """Return the MIME types this extractor declares."""
    xml_mime_types = ["application/xml"]
    return xml_mime_types
def extract(self, fileBytes: bytes, context: Dict[str, Any]) -> List[ContentPart]:
mimeType = context.get("mimeType") or "application/xml"

View file

@ -7,11 +7,31 @@ logger = logging.getLogger(__name__)
class Extractor:
    """
    Common base for document extractors.

    Subclasses are expected to override:
    - detect(): decide whether the extractor can handle a given file
    - extract(): turn the file bytes into content parts
    - getSupportedExtensions(): declare the handled file extensions
    - getSupportedMimeTypes(): declare the handled MIME types

    The defaults here handle nothing and declare nothing.
    """

    def detect(self, fileName: str, mimeType: str, headBytes: bytes) -> bool:
        """Report whether this extractor handles the file; the base never does."""
        handled = False
        return handled

    def extract(self, fileBytes: bytes, context: Dict[str, Any]) -> list[ContentPart]:
        """Extract content from the file bytes; must be provided by subclasses.

        Raises:
            NotImplementedError: always, in the base class.
        """
        raise NotImplementedError()

    def getSupportedExtensions(self) -> list[str]:
        """Return the supported file extensions (including dots); none by default."""
        return list()

    def getSupportedMimeTypes(self) -> list[str]:
        """Return the supported MIME types; none by default."""
        return list()
class Chunker:
@ -23,55 +43,85 @@ class ExtractorRegistry:
def __init__(self):
    """Create an empty registry and populate it through auto-discovery."""
    # No fallback until discovery installs one.
    self._fallback: Optional[Extractor] = None
    # Registry keys (MIME types and extensions) mapped to extractor instances.
    self._map: Dict[str, Extractor] = {}
    # Fill the registry with the built-in extractors.
    self._auto_discover_extractors()
def _auto_discover_extractors(self):
    """Auto-discover and register all extractors from the extractors directory.

    Every ``extractor*.py`` module in the ``extractors`` directory next to
    this file is imported; each public ``Extractor`` subclass defined in it
    is instantiated and registered for the MIME types and extensions it
    declares. ``BinaryExtractor`` is then installed as the fallback.

    Several extractors may declare the same key. Instances are therefore
    registered from the broadest (most declared formats) to the narrowest,
    so the more specialised extractor wins a contested key regardless of
    file system order. This ordering is a heuristic.

    Failures are logged, never raised: a module that cannot be imported or
    instantiated is skipped with a warning so the remaining extractors are
    still registered.
    """
    import importlib
    from pathlib import Path

    try:
        extractors_dir = Path(__file__).parent / "extractors"
        if not extractors_dir.exists():
            logger.error(f"Extractors directory not found: {extractors_dir}")
            return

        # (declared format count, class name, instance) per discovered extractor
        discovered = []
        # sorted(): glob() yields entries in file system order, which is not stable.
        for file_path in sorted(extractors_dir.glob("extractor*.py")):
            module_name = file_path.stem
            try:
                # Import relative to this package rather than through a
                # hard-coded dotted path, matching the relative imports
                # already used in this file.
                module = importlib.import_module(f".extractors.{module_name}", package=__package__)
                for attr_name in dir(module):
                    attr = getattr(module, attr_name)
                    if not (isinstance(attr, type) and issubclass(attr, Extractor)):
                        continue
                    if attr is Extractor or attr_name.startswith('_'):
                        continue
                    # Ignore classes that were only imported into this module,
                    # otherwise they would be registered once per importer.
                    if attr.__module__ != module.__name__:
                        continue
                    extractor_instance = attr()
                    breadth = (len(extractor_instance.getSupportedMimeTypes())
                               + len(extractor_instance.getSupportedExtensions()))
                    discovered.append((breadth, attr_name, extractor_instance))
            except Exception as e:
                logger.warning(f"Failed to import {module_name}: {str(e)}")
                continue

        # Broadest first: register() overwrites, so narrower extractors
        # registered later take over the keys they share with generic ones.
        discovered.sort(key=lambda entry: (-entry[0], entry[1]))
        for _, _, extractor_instance in discovered:
            self._auto_register_extractor(extractor_instance)
        extractor_modules = [class_name for _, class_name, _ in discovered]

        # Set fallback extractor
        try:
            from .extractors.extractorBinary import BinaryExtractor
            self.setFallback(BinaryExtractor())
        except Exception as e:
            logger.warning(f"Failed to set fallback extractor: {str(e)}")

        logger.info(f"ExtractorRegistry: Auto-discovered and registered {len(extractor_modules)} extractor classes: {', '.join(extractor_modules)}")
        logger.info(f"ExtractorRegistry: Total registered formats: {len(self._map)}")
    except Exception as e:
        # logger.exception attaches the traceback to the log record instead
        # of printing it to stderr.
        logger.exception(f"ExtractorRegistry: Failed to auto-discover extractors: {str(e)}")
def _auto_register_extractor(self, extractor: Extractor):
    """Auto-register an extractor based on its declared supported formats.

    Each declared MIME type is registered as-is; each declared extension is
    registered with its leading dot(s) removed. Any exception raised while
    querying or registering is logged and swallowed so one faulty extractor
    does not abort registration of the others.

    Args:
        extractor: Instance to register under every format it declares.
    """
    class_name = extractor.__class__.__name__
    try:
        # Register MIME types
        for mime_type in extractor.getSupportedMimeTypes():
            self.register(mime_type, extractor)
            # A separator keeps the key and the class name from running together.
            logger.debug(f"Registered MIME type: {mime_type} → {class_name}")

        # Register file extensions
        for ext in extractor.getSupportedExtensions():
            # Remove leading dot for registry key
            ext_key = ext.lstrip('.')
            self.register(ext_key, extractor)
            logger.debug(f"Registered extension: .{ext_key} → {class_name}")
    except Exception as e:
        logger.error(f"Failed to auto-register {class_name}: {str(e)}")
def register(self, key: str, extractor: Extractor):
    """Map ``key`` to ``extractor``, replacing any existing entry for that key."""
    self._map.update({key: extractor})
@ -88,6 +138,43 @@ class ExtractorRegistry:
if ext in self._map:
return self._map[ext]
return self._fallback
def getAllSupportedFormats(self) -> Dict[str, Dict[str, list[str]]]:
    """
    Get all supported formats from all registered extractors.

    Entries are keyed by the extractor's class name, as the return
    description below states. They were previously keyed by registry key,
    which repeated the same list once per registered MIME type and
    extension. Extractors that declare nothing are omitted; the fallback
    extractor, if set, is always reported under ``"fallback"``, even with
    empty lists.

    Returns:
        Dictionary with format information:
        {
            "extensions": {
                "extractor_name": [".ext1", ".ext2", ...]
            },
            "mime_types": {
                "extractor_name": ["mime/type1", "mime/type2", ...]
            }
        }
    """
    formats = {"extensions": {}, "mime_types": {}}

    # One extractor is usually registered under many keys; report it once.
    reported = set()
    for extractor in self._map.values():
        extractor_name = extractor.__class__.__name__
        if extractor_name in reported:
            continue
        reported.add(extractor_name)

        if hasattr(extractor, 'getSupportedExtensions'):
            extensions = extractor.getSupportedExtensions()
            if extensions:
                formats["extensions"][extractor_name] = extensions
        if hasattr(extractor, 'getSupportedMimeTypes'):
            mime_types = extractor.getSupportedMimeTypes()
            if mime_types:
                formats["mime_types"][extractor_name] = mime_types

    # Add fallback extractor info
    if self._fallback and hasattr(self._fallback, 'getSupportedExtensions'):
        formats["extensions"]["fallback"] = self._fallback.getSupportedExtensions()
    if self._fallback and hasattr(self._fallback, 'getSupportedMimeTypes'):
        formats["mime_types"]["fallback"] = self._fallback.getSupportedMimeTypes()

    return formats
class ChunkerRegistry:

View file

@ -51,7 +51,24 @@ async def process_documents_and_generate_summary():
return False
# Find all supported document files
supported_extensions = ["*.pdf", "*.jpg", "*.jpeg", "*.png", "*.gif", "*.docx", "*.xlsx", "*.pptx", "*.ppt", "*.txt", "*.md", "*.html", "*.csv"]
supported_extensions = [
# Document formats
"*.pdf", "*.docx", "*.xlsx", "*.pptx", "*.ppt",
# Image formats
"*.jpg", "*.jpeg", "*.png", "*.gif", "*.webp", "*.bmp", "*.tiff",
# Text and code files
"*.txt", "*.md", "*.log", "*.rtf", "*.tex", "*.rst", "*.adoc", "*.org", "*.pod",
"*.java", "*.js", "*.jsx", "*.ts", "*.tsx", "*.py", "*.rb", "*.go", "*.rs", "*.cpp", "*.c", "*.h", "*.hpp", "*.cc", "*.cxx",
"*.cs", "*.php", "*.swift", "*.kt", "*.scala", "*.clj", "*.hs", "*.ml", "*.fs", "*.vb", "*.dart", "*.r", "*.m", "*.pl", "*.sh",
"*.html", "*.htm", "*.css", "*.scss", "*.sass", "*.less", "*.vue", "*.svelte",
"*.config", "*.ini", "*.cfg", "*.conf", "*.properties", "*.yaml", "*.yml", "*.toml", "*.json", "*.xml",
"*.bat", "*.ps1", "*.psm1", "*.psd1", "*.vbs", "*.wsf", "*.cmd", "*.com",
"*.csv", "*.tsv", "*.tab", "*.dat", "*.data",
"*.man", "*.1", "*.2", "*.3", "*.4", "*.5", "*.6", "*.7", "*.8", "*.9", "*.n", "*.l", "*.m", "*.r", "*.t", "*.x", "*.y", "*.z",
"*.diff", "*.patch", "*.gitignore", "*.dockerignore", "*.editorconfig", "*.gitattributes",
"*.env", "*.env.local", "*.env.development", "*.env.production", "*.env.test",
"*.lock", "*.lockb", "*.lockfile", "*.pkg-lock", "*.yarn-lock"
]
document_files = []
for ext in supported_extensions:
document_files.extend(list(testdata_path.glob(ext)))
@ -164,6 +181,8 @@ async def process_documents_and_generate_summary():
mime_type = "text/html"
elif doc_file.suffix.lower() == '.csv':
mime_type = "text/csv"
elif doc_file.suffix.lower() == '.json':
mime_type = "application/json"
elif doc_file.suffix.lower() in ['.txt', '.md']:
mime_type = "text/plain"
@ -199,7 +218,7 @@ async def process_documents_and_generate_summary():
# Run a single end-to-end test to avoid the loop issue
logger.info("🧪 Running single end-to-end test...")
userPrompt = "Analyze these documents and create a comprehensive summary for all input documents, each input document in a separate chapter summarized in 10-20 sentences."
userPrompt = "Analyze the document containing mails for customer use cases. Can you create one file for each email in plain text format?"
# userPrompt = "Analyze these documents and create a fitting image for the content"
@ -215,8 +234,8 @@ async def process_documents_and_generate_summary():
prompt=userPrompt,
documents=documents,
options=ai_options,
outputFormat="docx",
title="Formulaire"
outputFormat="txt",
title="Kunden und Use Cases"
)
logger.info(f"✅ End-to-end test completed successfully")

117
test_extractor_formats.py Normal file
View file

@ -0,0 +1,117 @@
#!/usr/bin/env python3
"""
Test script to demonstrate enhanced extractor format support.
Shows all supported file extensions and MIME types for each extractor.
"""
import sys
import os
from pathlib import Path
# Add the gateway module to the path
sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'modules'))
from modules.services.serviceExtraction.subRegistry import ExtractorRegistry
def test_extractor_formats():
    """Test and display all supported formats from extractors.

    Builds an ``ExtractorRegistry``, prints the extensions and MIME types it
    reports, lists every registered extractor class once, and finally
    resolves a set of sample file names to show which extractor each maps
    to. This is a demonstration: it prints results and asserts nothing.
    """
    print("🔍 Testing Plug-and-Play Extractor System")
    print("=" * 60)

    # Create registry
    registry = ExtractorRegistry()

    # Get all supported formats
    formats = registry.getAllSupportedFormats()

    print("\n📋 Supported File Extensions by Extractor:")
    print("-" * 50)
    for extractor_name, extensions in formats["extensions"].items():
        if extensions:
            print(f" {extractor_name:20} → {', '.join(extensions)}")
        else:
            print(f" {extractor_name:20} → (all extensions - fallback)")

    print("\n📋 Supported MIME Types by Extractor:")
    print("-" * 50)
    for extractor_name, mime_types in formats["mime_types"].items():
        if mime_types:
            print(f" {extractor_name:20} → {', '.join(mime_types)}")
        else:
            print(f" {extractor_name:20} → (all MIME types - fallback)")

    # Test individual extractors
    print("\n🧪 Testing Individual Extractors:")
    print("-" * 50)

    # The registry maps many keys to the same extractor; show each class once.
    shown = set()
    for extractor in registry._map.values():
        class_name = extractor.__class__.__name__
        if class_name in shown:
            continue
        if hasattr(extractor, 'getSupportedExtensions') and hasattr(extractor, 'getSupportedMimeTypes'):
            shown.add(class_name)
            extensions = extractor.getSupportedExtensions()
            mime_types = extractor.getSupportedMimeTypes()
            print(f"\n {class_name}:")
            print(f" Extensions: {extensions}")
            print(f" MIME Types: {mime_types}")

    # Test detection with various file types
    print("\n🔬 Testing File Detection:")
    print("-" * 50)
    test_files = [
        # Document formats
        ("document.pdf", "application/pdf"),
        ("spreadsheet.xlsx", "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"),
        ("presentation.pptx", "application/vnd.openxmlformats-officedocument.presentationml.presentation"),
        ("document.docx", "application/vnd.openxmlformats-officedocument.wordprocessingml.document"),
        # Text and code files
        ("readme.txt", "text/plain"),
        ("readme.md", "text/markdown"),
        ("app.log", "text/plain"),
        ("Main.java", "text/x-java-source"),
        ("script.js", "text/javascript"),
        ("component.tsx", "text/typescript"),
        ("main.py", "text/x-python"),
        ("config.yaml", "text/x-yaml"),
        ("package.json", "application/json"),
        ("data.csv", "text/csv"),
        ("config.xml", "application/xml"),
        ("webpage.html", "text/html"),
        ("styles.css", "text/css"),
        ("script.sh", "text/x-sh"),
        ("Dockerfile", "text/plain"),
        (".gitignore", "text/plain"),
        ("app.config", "text/plain"),
        ("database.sql", "text/x-sql"),
        ("schema.ddl", "application/sql"),
        # Images
        ("image.png", "image/png"),
        ("photo.jpg", "image/jpeg"),
        # Unknown
        ("unknown.xyz", "application/octet-stream")
    ]
    for filename, mime_type in test_files:
        extractor = registry.resolve(mime_type, filename)
        if extractor:
            print(f" {filename:25} ({mime_type:50}) → {extractor.__class__.__name__}")
        else:
            print(f" {filename:25} ({mime_type:50}) → No extractor found")

    print("\n✅ Plug-and-Play extractor system test completed!")
    print("\nKey improvements:")
    print(" • 🔌 TRUE PLUG-AND-PLAY: Just add extractor file, it auto-registers!")
    print(" • 📋 No more manual registration of file types")
    print(" • 🔍 Auto-discovery scans extractors directory")
    print(" • 📝 Each extractor declares its own supported formats")
    print(" • 🚀 Easy to add new file types - just create new extractor")
    print(" • 🧹 Clean, maintainable code with no redundancy")
    print("\nTo add a new file type:")
    print(" 1. Create extractorXyz.py in extractors/ directory")
    print(" 2. Implement Extractor interface with getSupportedExtensions()")
    print(" 3. That's it! No registry changes needed!")
# Run the demonstration only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
test_extractor_formats()