#!/usr/bin/env python3
"""
Test script to demonstrate enhanced extractor format support.

Shows all supported file extensions and MIME types for each extractor,
then prints which extractor the registry resolves for a list of sample
(filename, MIME type) pairs.
"""

import sys
import os
from pathlib import Path

# Add the gateway module to the path
# NOTE(review): this puts the ``modules`` directory itself on sys.path, while
# the import below is rooted at the ``modules`` package (i.e. it needs the
# parent directory on the path) - confirm both are intended.
sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'modules'))

from modules.services.serviceExtraction.subRegistry import ExtractorRegistry


# Sample (filename, MIME type) pairs passed to ``registry.resolve``.
DETECTION_SAMPLES = [
    # Document formats
    ("document.pdf", "application/pdf"),
    ("spreadsheet.xlsx", "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"),
    ("presentation.pptx", "application/vnd.openxmlformats-officedocument.presentationml.presentation"),
    ("document.docx", "application/vnd.openxmlformats-officedocument.wordprocessingml.document"),
    # Text and code files
    ("readme.txt", "text/plain"),
    ("readme.md", "text/markdown"),
    ("app.log", "text/plain"),
    ("Main.java", "text/x-java-source"),
    ("script.js", "text/javascript"),
    ("component.tsx", "text/typescript"),
    ("main.py", "text/x-python"),
    ("config.yaml", "text/x-yaml"),
    ("package.json", "application/json"),
    ("data.csv", "text/csv"),
    ("config.xml", "application/xml"),
    ("webpage.html", "text/html"),
    ("styles.css", "text/css"),
    ("script.sh", "text/x-sh"),
    ("Dockerfile", "text/plain"),
    (".gitignore", "text/plain"),
    ("app.config", "text/plain"),
    ("database.sql", "text/x-sql"),
    ("schema.ddl", "application/sql"),
    # Images
    ("image.png", "image/png"),
    ("photo.jpg", "image/jpeg"),
    # Unknown
    ("unknown.xyz", "application/octet-stream"),
]


def _print_format_table(heading, formats_by_extractor, fallback_label):
    """Print one ``extractor -> formats`` table under *heading*.

    Args:
        heading: Title line printed above the table.
        formats_by_extractor: Mapping of extractor name to an iterable of
            format strings (extensions or MIME types).
        fallback_label: Text shown when an extractor's format collection is
            empty (falsy).
    """
    print(heading)
    print("-" * 50)
    for extractor_name, supported in formats_by_extractor.items():
        listing = ", ".join(supported) if supported else fallback_label
        print(f" {extractor_name:20} → {listing}")


def _print_individual_extractors(registry):
    """Print the formats each registered extractor reports about itself."""
    print("\n🧪 Testing Individual Extractors:")
    print("-" * 50)

    # NOTE(review): ``_map`` is a private attribute of the registry; switch to
    # a public accessor if the registry offers one.
    for _key, extractor in registry._map.items():
        # Skip anything that does not expose both format-declaration methods.
        if not (hasattr(extractor, 'getSupportedExtensions')
                and hasattr(extractor, 'getSupportedMimeTypes')):
            continue
        extensions = extractor.getSupportedExtensions()
        mime_types = extractor.getSupportedMimeTypes()
        print(f"\n {extractor.__class__.__name__}:")
        print(f" Extensions: {extensions}")
        print(f" MIME Types: {mime_types}")


def _print_detection_results(registry):
    """Resolve every entry of ``DETECTION_SAMPLES`` and print the outcome."""
    print("\n🔬 Testing File Detection:")
    print("-" * 50)

    for filename, mime_type in DETECTION_SAMPLES:
        extractor = registry.resolve(mime_type, filename)
        resolved = extractor.__class__.__name__ if extractor else "No extractor found"
        print(f" {filename:25} ({mime_type:50}) → {resolved}")


def _print_summary():
    """Print the closing summary and the how-to for adding a new file type."""
    # NOTE(review): the source text of this file had been mis-encoded; the
    # magnifying-glass and memo glyphs lost a byte and were reconstructed from
    # context - confirm they match the intended icons.
    print("\n✅ Plug-and-Play extractor system test completed!")
    print("\nKey improvements:")
    print(" • 🔌 TRUE PLUG-AND-PLAY: Just add extractor file, it auto-registers!")
    print(" • 📋 No more manual registration of file types")
    print(" • 🔍 Auto-discovery scans extractors directory")
    print(" • 📝 Each extractor declares its own supported formats")
    print(" • 🚀 Easy to add new file types - just create new extractor")
    print(" • 🧹 Clean, maintainable code with no redundancy")

    print("\nTo add a new file type:")
    print(" 1. Create extractorXyz.py in extractors/ directory")
    print(" 2. Implement Extractor interface with getSupportedExtensions()")
    print(" 3. That's it! No registry changes needed!")


def test_extractor_formats():
    """Test and display all supported formats from extractors.

    Builds an ``ExtractorRegistry``, prints the extension and MIME-type
    tables returned by ``getAllSupportedFormats()``, prints what each
    registered extractor reports, and finally prints the extractor resolved
    for each sample file. The function only prints; it makes no assertions
    and returns ``None``.
    """
    print("🔍 Testing Plug-and-Play Extractor System")
    print("=" * 60)

    # Create registry
    registry = ExtractorRegistry()

    # Get all supported formats
    formats = registry.getAllSupportedFormats()

    _print_format_table(
        "\n📋 Supported File Extensions by Extractor:",
        formats["extensions"],
        "(all extensions - fallback)",
    )
    _print_format_table(
        "\n📋 Supported MIME Types by Extractor:",
        formats["mime_types"],
        "(all MIME types - fallback)",
    )

    _print_individual_extractors(registry)
    _print_detection_results(registry)
    _print_summary()


if __name__ == "__main__":
    test_extractor_formats()