AI system tested for all file types
This commit is contained in:
parent
2e471ca3f7
commit
0c357dc8a9
15 changed files with 588 additions and 52 deletions
|
|
@ -7,8 +7,28 @@ from ..subRegistry import Extractor
|
||||||
|
|
||||||
|
|
||||||
class BinaryExtractor(Extractor):
|
class BinaryExtractor(Extractor):
|
||||||
|
"""
|
||||||
|
Fallback extractor for unsupported file types.
|
||||||
|
|
||||||
|
This extractor handles any file type that doesn't match other extractors.
|
||||||
|
It encodes the file as base64 and marks it as binary data.
|
||||||
|
|
||||||
|
Supported formats:
|
||||||
|
- All file types (fallback)
|
||||||
|
- MIME types: application/octet-stream (default)
|
||||||
|
- File extensions: All (fallback)
|
||||||
|
"""
|
||||||
|
|
||||||
def detect(self, fileName: str, mimeType: str, headBytes: bytes) -> bool:
    """Claim every file unconditionally.

    None of the three arguments is inspected; the answer is always True.
    """
    claimsFile = True
    return claimsFile
|
||||||
|
|
||||||
|
def getSupportedExtensions(self) -> list[str]:
    """Return the declared extension list, which is empty for this extractor."""
    # No extension is declared; this extractor accepts all of them as fallback.
    declaredSuffixes: list[str] = list()
    return declaredSuffixes
|
||||||
|
|
||||||
|
def getSupportedMimeTypes(self) -> list[str]:
    """Return the declared MIME type list, which is empty for this extractor."""
    # No MIME type is declared; this extractor accepts all of them as fallback.
    declaredMimes: list[str] = list()
    return declaredMimes
|
||||||
|
|
||||||
def extract(self, fileBytes: bytes, context: Dict[str, Any]) -> List[ContentPart]:
|
def extract(self, fileBytes: bytes, context: Dict[str, Any]) -> List[ContentPart]:
|
||||||
mimeType = context.get("mimeType") or "application/octet-stream"
|
mimeType = context.get("mimeType") or "application/octet-stream"
|
||||||
|
|
|
||||||
|
|
@ -6,8 +6,25 @@ from ..subRegistry import Extractor
|
||||||
|
|
||||||
|
|
||||||
class CsvExtractor(Extractor):
|
class CsvExtractor(Extractor):
|
||||||
|
"""
|
||||||
|
Extractor for CSV files.
|
||||||
|
|
||||||
|
Supported formats:
|
||||||
|
- MIME types: text/csv
|
||||||
|
- File extensions: .csv
|
||||||
|
- Special handling: Treats as table data
|
||||||
|
"""
|
||||||
|
|
||||||
def detect(self, fileName: str, mimeType: str, headBytes: bytes) -> bool:
    """Return True for the ``text/csv`` MIME type or a ``.csv`` file name.

    The file name is compared case-insensitively; a missing name counts as
    empty. ``headBytes`` is not inspected.
    """
    if mimeType == "text/csv":
        return True
    loweredName = (fileName or "").lower()
    return loweredName.endswith(".csv")
|
||||||
|
|
||||||
|
def getSupportedExtensions(self) -> list[str]:
    """List the file extensions (leading dot included) declared for CSV."""
    csvSuffixes = [".csv"]
    return csvSuffixes
|
||||||
|
|
||||||
|
def getSupportedMimeTypes(self) -> list[str]:
    """List the MIME types declared for CSV."""
    csvMimes = ["text/csv"]
    return csvMimes
|
||||||
|
|
||||||
def extract(self, fileBytes: bytes, context: Dict[str, Any]) -> List[ContentPart]:
|
def extract(self, fileBytes: bytes, context: Dict[str, Any]) -> List[ContentPart]:
|
||||||
fileName = context.get("fileName")
|
fileName = context.get("fileName")
|
||||||
|
|
|
||||||
|
|
@ -7,6 +7,16 @@ from ..subRegistry import Extractor
|
||||||
|
|
||||||
|
|
||||||
class DocxExtractor(Extractor):
|
class DocxExtractor(Extractor):
|
||||||
|
"""
|
||||||
|
Extractor for Microsoft Word documents.
|
||||||
|
|
||||||
|
Supported formats:
|
||||||
|
- MIME types: application/vnd.openxmlformats-officedocument.wordprocessingml.document
|
||||||
|
- File extensions: .docx
|
||||||
|
- Special handling: Extracts paragraphs and tables (converts tables to CSV)
|
||||||
|
- Dependencies: python-docx
|
||||||
|
"""
|
||||||
|
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
self._loaded = False
|
self._loaded = False
|
||||||
self._haveLibs = False
|
self._haveLibs = False
|
||||||
|
|
@ -24,6 +34,14 @@ class DocxExtractor(Extractor):
|
||||||
|
|
||||||
def detect(self, fileName: str, mimeType: str, headBytes: bytes) -> bool:
    """Return True for the Word (OOXML) MIME type or a ``.docx`` file name.

    The file name is compared case-insensitively; a missing name counts as
    empty. ``headBytes`` is not inspected.
    """
    docxMime = "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
    if mimeType == docxMime:
        return True
    loweredName = (fileName or "").lower()
    return loweredName.endswith(".docx")
|
||||||
|
|
||||||
|
def getSupportedExtensions(self) -> list[str]:
    """List the file extensions (leading dot included) declared for Word files."""
    docxSuffixes = [".docx"]
    return docxSuffixes
|
||||||
|
|
||||||
|
def getSupportedMimeTypes(self) -> list[str]:
    """List the MIME types declared for Word files."""
    docxMime = "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
    return [docxMime]
|
||||||
|
|
||||||
def extract(self, fileBytes: bytes, context: Dict[str, Any]) -> List[ContentPart]:
|
def extract(self, fileBytes: bytes, context: Dict[str, Any]) -> List[ContentPart]:
|
||||||
self._load()
|
self._load()
|
||||||
|
|
|
||||||
|
|
@ -7,8 +7,26 @@ from ..subRegistry import Extractor
|
||||||
|
|
||||||
|
|
||||||
class HtmlExtractor(Extractor):
|
class HtmlExtractor(Extractor):
|
||||||
|
"""
|
||||||
|
Extractor for HTML files.
|
||||||
|
|
||||||
|
Supported formats:
|
||||||
|
- MIME types: text/html
|
||||||
|
- File extensions: .html, .htm
|
||||||
|
- Special handling: Uses BeautifulSoup for parsing
|
||||||
|
- Dependencies: beautifulsoup4
|
||||||
|
"""
|
||||||
|
|
||||||
def detect(self, fileName: str, mimeType: str, headBytes: bytes) -> bool:
    """Return True for the ``text/html`` MIME type or a ``.html``/``.htm`` name.

    The file name is compared case-insensitively; a missing name counts as
    empty. ``headBytes`` is not inspected.
    """
    if mimeType == "text/html":
        return True
    htmlSuffixes = (".html", ".htm")
    loweredName = (fileName or "").lower()
    return loweredName.endswith(htmlSuffixes)
|
||||||
|
|
||||||
|
def getSupportedExtensions(self) -> list[str]:
    """List the file extensions (leading dot included) declared for HTML."""
    htmlSuffixes = [".html", ".htm"]
    return htmlSuffixes
|
||||||
|
|
||||||
|
def getSupportedMimeTypes(self) -> list[str]:
    """List the MIME types declared for HTML."""
    htmlMimes = ["text/html"]
    return htmlMimes
|
||||||
|
|
||||||
def extract(self, fileBytes: bytes, context: Dict[str, Any]) -> List[ContentPart]:
|
def extract(self, fileBytes: bytes, context: Dict[str, Any]) -> List[ContentPart]:
|
||||||
mimeType = context.get("mimeType") or "text/html"
|
mimeType = context.get("mimeType") or "text/html"
|
||||||
|
|
|
||||||
|
|
@ -10,8 +10,26 @@ logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
class ImageExtractor(Extractor):
|
class ImageExtractor(Extractor):
|
||||||
|
"""
|
||||||
|
Extractor for image files.
|
||||||
|
|
||||||
|
Supported formats:
|
||||||
|
- MIME types: image/jpeg, image/png, image/gif, image/webp, image/bmp, image/tiff
|
||||||
|
- File extensions: .jpg, .jpeg, .png, .gif, .webp, .bmp, .tiff
|
||||||
|
- Special handling: GIF files are converted to PNG during extraction
|
||||||
|
"""
|
||||||
|
|
||||||
def detect(self, fileName: str, mimeType: str, headBytes: bytes) -> bool:
    """Return True for any ``image/*`` MIME type or a known image file name.

    A missing MIME type or file name counts as empty; the file name is
    compared case-insensitively. ``headBytes`` is not inspected.
    """
    if (mimeType or "").startswith("image/"):
        return True
    imageSuffixes = (".jpg", ".jpeg", ".png", ".gif", ".webp", ".bmp", ".tiff")
    loweredName = (fileName or "").lower()
    return loweredName.endswith(imageSuffixes)
|
||||||
|
|
||||||
|
def getSupportedExtensions(self) -> list[str]:
    """List the file extensions (leading dot included) declared for images."""
    imageSuffixes = ".jpg .jpeg .png .gif .webp .bmp .tiff".split()
    return imageSuffixes
|
||||||
|
|
||||||
|
def getSupportedMimeTypes(self) -> list[str]:
    """List the MIME types declared for images."""
    subtypes = ("jpeg", "png", "gif", "webp", "bmp", "tiff")
    return ["image/" + subtype for subtype in subtypes]
|
||||||
|
|
||||||
def extract(self, fileBytes: bytes, context: Dict[str, Any]) -> List[ContentPart]:
|
def extract(self, fileBytes: bytes, context: Dict[str, Any]) -> List[ContentPart]:
|
||||||
mimeType = context.get("mimeType") or "image/unknown"
|
mimeType = context.get("mimeType") or "image/unknown"
|
||||||
|
|
|
||||||
|
|
@ -7,8 +7,25 @@ from ..subRegistry import Extractor
|
||||||
|
|
||||||
|
|
||||||
class JsonExtractor(Extractor):
|
class JsonExtractor(Extractor):
|
||||||
|
"""
|
||||||
|
Extractor for JSON files.
|
||||||
|
|
||||||
|
Supported formats:
|
||||||
|
- MIME types: application/json
|
||||||
|
- File extensions: .json
|
||||||
|
- Special handling: Validates JSON format, falls back to text if invalid
|
||||||
|
"""
|
||||||
|
|
||||||
def detect(self, fileName: str, mimeType: str, headBytes: bytes) -> bool:
    """Return True for the ``application/json`` MIME type or a ``.json`` name.

    The file name is compared case-insensitively; a missing name counts as
    empty. ``headBytes`` is not inspected.
    """
    if mimeType == "application/json":
        return True
    loweredName = (fileName or "").lower()
    return loweredName.endswith(".json")
|
||||||
|
|
||||||
|
def getSupportedExtensions(self) -> list[str]:
    """List the file extensions (leading dot included) declared for JSON."""
    jsonSuffixes = [".json"]
    return jsonSuffixes
|
||||||
|
|
||||||
|
def getSupportedMimeTypes(self) -> list[str]:
    """List the MIME types declared for JSON."""
    jsonMimes = ["application/json"]
    return jsonMimes
|
||||||
|
|
||||||
def extract(self, fileBytes: bytes, context: Dict[str, Any]) -> List[ContentPart]:
|
def extract(self, fileBytes: bytes, context: Dict[str, Any]) -> List[ContentPart]:
|
||||||
mimeType = context.get("mimeType") or "application/json"
|
mimeType = context.get("mimeType") or "application/json"
|
||||||
|
|
|
||||||
|
|
@ -8,6 +8,16 @@ from ..subRegistry import Extractor
|
||||||
|
|
||||||
|
|
||||||
class PdfExtractor(Extractor):
|
class PdfExtractor(Extractor):
|
||||||
|
"""
|
||||||
|
Extractor for PDF files.
|
||||||
|
|
||||||
|
Supported formats:
|
||||||
|
- MIME types: application/pdf
|
||||||
|
- File extensions: .pdf
|
||||||
|
- Special handling: Extracts text per page and embedded images
|
||||||
|
- Dependencies: PyPDF2, PyMuPDF (fitz)
|
||||||
|
"""
|
||||||
|
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
self._loaded = False
|
self._loaded = False
|
||||||
self._haveLibs = False
|
self._haveLibs = False
|
||||||
|
|
@ -26,6 +36,14 @@ class PdfExtractor(Extractor):
|
||||||
|
|
||||||
def detect(self, fileName: str, mimeType: str, headBytes: bytes) -> bool:
    """Return True for the ``application/pdf`` MIME type or a ``.pdf`` name.

    The file name is compared case-insensitively; a missing name counts as
    empty. ``headBytes`` is not inspected.
    """
    if mimeType == "application/pdf":
        return True
    loweredName = (fileName or "").lower()
    return loweredName.endswith(".pdf")
|
||||||
|
|
||||||
|
def getSupportedExtensions(self) -> list[str]:
    """List the file extensions (leading dot included) declared for PDF."""
    pdfSuffixes = [".pdf"]
    return pdfSuffixes
|
||||||
|
|
||||||
|
def getSupportedMimeTypes(self) -> list[str]:
    """List the MIME types declared for PDF."""
    pdfMimes = ["application/pdf"]
    return pdfMimes
|
||||||
|
|
||||||
def extract(self, fileBytes: bytes, context: Dict[str, Any]) -> List[ContentPart]:
|
def extract(self, fileBytes: bytes, context: Dict[str, Any]) -> List[ContentPart]:
|
||||||
self._load()
|
self._load()
|
||||||
|
|
|
||||||
|
|
@ -8,7 +8,15 @@ logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
class PptxExtractor(Extractor):
|
class PptxExtractor(Extractor):
|
||||||
"""Extractor for PowerPoint (.pptx) files using python-pptx library."""
|
"""
|
||||||
|
Extractor for PowerPoint files.
|
||||||
|
|
||||||
|
Supported formats:
|
||||||
|
- MIME types: application/vnd.openxmlformats-officedocument.presentationml.presentation, application/vnd.ms-powerpoint
|
||||||
|
- File extensions: .pptx, .ppt
|
||||||
|
- Special handling: Extracts slide content, tables, and images
|
||||||
|
- Dependencies: python-pptx
|
||||||
|
"""
|
||||||
|
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
self._loaded = False
|
self._loaded = False
|
||||||
|
|
@ -31,6 +39,17 @@ class PptxExtractor(Extractor):
|
||||||
"application/vnd.ms-powerpoint"
|
"application/vnd.ms-powerpoint"
|
||||||
]) or (fileName or "").lower().endswith((".pptx", ".ppt"))
|
]) or (fileName or "").lower().endswith((".pptx", ".ppt"))
|
||||||
|
|
||||||
|
def getSupportedExtensions(self) -> list[str]:
    """List the file extensions (leading dot included) declared for PowerPoint."""
    slideSuffixes = [".pptx", ".ppt"]
    return slideSuffixes
|
||||||
|
|
||||||
|
def getSupportedMimeTypes(self) -> list[str]:
    """List the MIME types declared for PowerPoint files."""
    pptxMime = "application/vnd.openxmlformats-officedocument.presentationml.presentation"
    pptMime = "application/vnd.ms-powerpoint"
    return [pptxMime, pptMime]
|
||||||
|
|
||||||
def extract(self, fileBytes: bytes, context: Dict[str, Any]) -> List[ContentPart]:
|
def extract(self, fileBytes: bytes, context: Dict[str, Any]) -> List[ContentPart]:
|
||||||
"""
|
"""
|
||||||
Extract content from PowerPoint files.
|
Extract content from PowerPoint files.
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,56 @@
|
||||||
|
from typing import Any, Dict, List
|
||||||
|
|
||||||
|
from modules.datamodels.datamodelExtraction import ContentPart
|
||||||
|
from ..subUtils import makeId
|
||||||
|
from ..subRegistry import Extractor
|
||||||
|
|
||||||
|
|
||||||
|
class SqlExtractor(Extractor):
    """
    Extractor for SQL files.

    Supported formats:
    - MIME types: text/x-sql, application/sql
    - File extensions: .sql, .ddl, .dml, .dcl, .tcl
    - Special handling: Treats as structured text with SQL syntax
    """

    # Single source of truth shared by detect() and the getSupported*() methods,
    # so the declared formats and the detection logic cannot drift apart.
    _MIME_TYPES = ("text/x-sql", "application/sql")
    # NOTE(review): ".tcl" is also the usual suffix of Tcl scripts - confirm
    # that claiming it for SQL is intended.
    _EXTENSIONS = (".sql", ".ddl", ".dml", ".dcl", ".tcl")
    # Each keyword produces one "has_<keyword>" flag in the extracted metadata.
    _KEYWORDS = ("SELECT", "INSERT", "UPDATE", "DELETE", "CREATE", "DROP")

    def detect(self, fileName: str, mimeType: str, headBytes: bytes) -> bool:
        """Return True for a declared SQL MIME type or file extension.

        The file name is compared case-insensitively; a missing name counts as
        empty. ``headBytes`` is not inspected.
        """
        if mimeType in self._MIME_TYPES:
            return True
        return (fileName or "").lower().endswith(self._EXTENSIONS)

    def getSupportedExtensions(self) -> list[str]:
        """Return list of supported file extensions."""
        return list(self._EXTENSIONS)

    def getSupportedMimeTypes(self) -> list[str]:
        """Return list of supported MIME types."""
        return list(self._MIME_TYPES)

    def extract(self, fileBytes: bytes, context: Dict[str, Any]) -> List[ContentPart]:
        """Decode the SQL text and return it as one ``structure`` content part.

        Args:
            fileBytes: Raw file content; decoded as UTF-8 with undecodable
                bytes replaced.
            context: Extraction context; only ``"mimeType"`` is read
                (default ``"text/x-sql"``).

        Returns:
            A single-element list whose part carries the decoded text and
            metadata: byte size, line count and one ``has_<keyword>`` flag per
            probed SQL keyword.
        """
        mimeType = context.get("mimeType") or "text/x-sql"
        data = fileBytes.decode("utf-8", errors="replace")

        # Upper-case the text once instead of once per keyword probe.
        upperData = data.upper()

        # Add SQL-specific metadata
        metadata = {
            "size": len(fileBytes),
            "file_type": "sql",
            "line_count": len(data.splitlines()),
        }
        # Plain substring test: a keyword inside an identifier or a comment
        # also sets the flag.
        for keyword in self._KEYWORDS:
            metadata[f"has_{keyword.lower()}"] = keyword in upperData

        return [ContentPart(
            id=makeId(),
            parentId=None,
            label="main",
            typeGroup="structure",
            mimeType=mimeType,
            data=data,
            metadata=metadata
        )]
|
||||||
|
|
@ -6,8 +6,85 @@ from ..subRegistry import Extractor
|
||||||
|
|
||||||
|
|
||||||
class TextExtractor(Extractor):
|
class TextExtractor(Extractor):
|
||||||
|
"""
|
||||||
|
Extractor for plain text files and code files.
|
||||||
|
|
||||||
|
Supported formats:
|
||||||
|
- MIME types: text/plain, text/markdown, text/x-python, text/x-java-source, text/javascript, etc.
|
||||||
|
- File extensions: .txt, .md, .log, .java, .js, .jsx, .ts, .tsx, .py, .config, .ini, .cfg, .conf, .properties, .yaml, .yml, .toml, .sh, .bat, .ps1, .sql, .css, .scss, .sass, .less, .xml, .json, .csv, .tsv, .rtf, .tex, .rst, .adoc, .org, .pod, .man, .1, .2, .3, .4, .5, .6, .7, .8, .9, .n, .l, .m, .r, .t, .x, .y, .z
|
||||||
|
"""
|
||||||
|
|
||||||
def detect(self, fileName: str, mimeType: str, headBytes: bytes) -> bool:
|
def detect(self, fileName: str, mimeType: str, headBytes: bytes) -> bool:
|
||||||
return mimeType in ("text/plain", "text/markdown")
|
# Check MIME types
|
||||||
|
if mimeType and mimeType.startswith("text/"):
|
||||||
|
return True
|
||||||
|
|
||||||
|
# Check file extensions
|
||||||
|
if fileName:
|
||||||
|
ext = fileName.lower()
|
||||||
|
return ext.endswith((
|
||||||
|
# Basic text files
|
||||||
|
".txt", ".md", ".log", ".rtf", ".tex", ".rst", ".adoc", ".org", ".pod",
|
||||||
|
# Programming languages
|
||||||
|
".java", ".js", ".jsx", ".ts", ".tsx", ".py", ".rb", ".go", ".rs", ".cpp", ".c", ".h", ".hpp", ".cc", ".cxx",
|
||||||
|
".cs", ".php", ".swift", ".kt", ".scala", ".clj", ".hs", ".ml", ".fs", ".vb", ".dart", ".r", ".m", ".pl", ".sh",
|
||||||
|
# Web technologies
|
||||||
|
".html", ".htm", ".css", ".scss", ".sass", ".less", ".vue", ".svelte",
|
||||||
|
# Configuration files
|
||||||
|
".config", ".ini", ".cfg", ".conf", ".properties", ".yaml", ".yml", ".toml", ".json", ".xml",
|
||||||
|
# Scripts and automation
|
||||||
|
".bat", ".ps1", ".psm1", ".psd1", ".vbs", ".wsf", ".cmd", ".com",
|
||||||
|
# Data files
|
||||||
|
".csv", ".tsv", ".tab", ".dat", ".data",
|
||||||
|
# Documentation
|
||||||
|
".man", ".1", ".2", ".3", ".4", ".5", ".6", ".7", ".8", ".9", ".n", ".l", ".m", ".r", ".t", ".x", ".y", ".z",
|
||||||
|
# Other text formats
|
||||||
|
".diff", ".patch", ".gitignore", ".dockerignore", ".editorconfig", ".gitattributes",
|
||||||
|
".env", ".env.local", ".env.development", ".env.production", ".env.test",
|
||||||
|
".lock", ".lockb", ".lockfile", ".pkg-lock", ".yarn-lock"
|
||||||
|
))
|
||||||
|
|
||||||
|
return False
|
||||||
|
|
||||||
|
def getSupportedExtensions(self) -> list[str]:
    """Return list of supported file extensions.

    Every extension includes its leading dot and appears exactly once.
    """
    return [
        # Basic text files
        ".txt", ".md", ".log", ".rtf", ".tex", ".rst", ".adoc", ".org", ".pod",
        # Programming languages
        ".java", ".js", ".jsx", ".ts", ".tsx", ".py", ".rb", ".go", ".rs", ".cpp", ".c", ".h", ".hpp", ".cc", ".cxx",
        ".cs", ".php", ".swift", ".kt", ".scala", ".clj", ".hs", ".ml", ".fs", ".vb", ".dart", ".r", ".m", ".pl", ".sh",
        # Web technologies
        ".html", ".htm", ".css", ".scss", ".sass", ".less", ".vue", ".svelte",
        # Configuration files
        ".config", ".ini", ".cfg", ".conf", ".properties", ".yaml", ".yml", ".toml", ".json", ".xml",
        # Scripts and automation
        ".bat", ".ps1", ".psm1", ".psd1", ".vbs", ".wsf", ".cmd", ".com",
        # Data files
        ".csv", ".tsv", ".tab", ".dat", ".data",
        # Documentation (".m" and ".r" are already listed under programming
        # languages and are not repeated here)
        ".man", ".1", ".2", ".3", ".4", ".5", ".6", ".7", ".8", ".9", ".n", ".l", ".t", ".x", ".y", ".z",
        # Other text formats
        ".diff", ".patch", ".gitignore", ".dockerignore", ".editorconfig", ".gitattributes",
        ".env", ".env.local", ".env.development", ".env.production", ".env.test",
        ".lock", ".lockb", ".lockfile", ".pkg-lock", ".yarn-lock",
    ]
|
||||||
|
|
||||||
|
def getSupportedMimeTypes(self) -> list[str]:
    """List the MIME types declared for plain text and source code files."""
    sourceCodeMimes = (
        "text/plain", "text/markdown", "text/x-python", "text/x-java-source",
        "text/javascript", "text/x-javascript", "text/typescript", "text/x-typescript",
        "text/x-c", "text/x-c++", "text/x-csharp", "text/x-php", "text/x-ruby",
        "text/x-go", "text/x-rust", "text/x-scala", "text/x-swift", "text/x-kotlin",
    )
    scriptAndConfigMimes = (
        "text/x-sql", "text/x-sh", "text/x-shellscript", "text/x-yaml", "text/x-toml",
        "text/x-ini", "text/x-config", "text/x-properties", "text/x-log",
    )
    markupAndDataMimes = (
        "text/html", "text/css", "text/x-scss", "text/x-sass", "text/x-less",
        "text/xml", "text/csv", "text/tab-separated-values", "text/rtf",
        "text/x-tex", "text/x-rst", "text/x-asciidoc", "text/x-org",
    )
    applicationMimes = (
        "application/x-yaml", "application/x-toml", "application/x-ini",
        "application/x-config", "application/x-properties", "application/x-log",
    )
    return [*sourceCodeMimes, *scriptAndConfigMimes, *markupAndDataMimes, *applicationMimes]
|
||||||
|
|
||||||
def extract(self, fileBytes: bytes, context: Dict[str, Any]) -> List[ContentPart]:
|
def extract(self, fileBytes: bytes, context: Dict[str, Any]) -> List[ContentPart]:
|
||||||
fileName = context.get("fileName")
|
fileName = context.get("fileName")
|
||||||
|
|
|
||||||
|
|
@ -8,6 +8,16 @@ from ..subRegistry import Extractor
|
||||||
|
|
||||||
|
|
||||||
class XlsxExtractor(Extractor):
|
class XlsxExtractor(Extractor):
|
||||||
|
"""
|
||||||
|
Extractor for Microsoft Excel spreadsheets.
|
||||||
|
|
||||||
|
Supported formats:
|
||||||
|
- MIME types: application/vnd.openxmlformats-officedocument.spreadsheetml.sheet
|
||||||
|
- File extensions: .xlsx, .xlsm
|
||||||
|
- Special handling: Extracts all sheets as CSV data
|
||||||
|
- Dependencies: openpyxl
|
||||||
|
"""
|
||||||
|
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
self._loaded = False
|
self._loaded = False
|
||||||
self._haveLibs = False
|
self._haveLibs = False
|
||||||
|
|
@ -26,6 +36,14 @@ class XlsxExtractor(Extractor):
|
||||||
def detect(self, fileName: str, mimeType: str, headBytes: bytes) -> bool:
    """Return True for the Excel (OOXML) MIME type or a ``.xlsx``/``.xlsm`` name.

    The file name is compared case-insensitively; a missing name counts as
    empty. ``headBytes`` is not inspected.
    """
    if mimeType == "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet":
        return True
    loweredName = (fileName or "").lower()
    sheetSuffixes = (".xlsx", ".xlsm")
    return loweredName.endswith(sheetSuffixes)
|
||||||
|
|
||||||
|
def getSupportedExtensions(self) -> list[str]:
    """List the file extensions (leading dot included) declared for Excel files."""
    sheetSuffixes = [".xlsx", ".xlsm"]
    return sheetSuffixes
|
||||||
|
|
||||||
|
def getSupportedMimeTypes(self) -> list[str]:
    """List the MIME types declared for Excel files."""
    xlsxMime = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
    return [xlsxMime]
|
||||||
|
|
||||||
def extract(self, fileBytes: bytes, context: Dict[str, Any]) -> List[ContentPart]:
|
def extract(self, fileBytes: bytes, context: Dict[str, Any]) -> List[ContentPart]:
|
||||||
self._load()
|
self._load()
|
||||||
|
|
|
||||||
|
|
@ -7,8 +7,25 @@ from ..subRegistry import Extractor
|
||||||
|
|
||||||
|
|
||||||
class XmlExtractor(Extractor):
|
class XmlExtractor(Extractor):
|
||||||
|
"""
|
||||||
|
Extractor for XML files.
|
||||||
|
|
||||||
|
Supported formats:
|
||||||
|
- MIME types: application/xml
|
||||||
|
- File extensions: .xml, .rss, .atom
|
||||||
|
- Special handling: Uses ElementTree for parsing
|
||||||
|
"""
|
||||||
|
|
||||||
def detect(self, fileName: str, mimeType: str, headBytes: bytes) -> bool:
    """Return True for ``application/xml`` or a ``.xml``/``.rss``/``.atom`` name.

    The file name is compared case-insensitively; a missing name counts as
    empty. ``headBytes`` is not inspected.
    """
    if mimeType == "application/xml":
        return True
    xmlSuffixes = (".xml", ".rss", ".atom")
    loweredName = (fileName or "").lower()
    return loweredName.endswith(xmlSuffixes)
|
||||||
|
|
||||||
|
def getSupportedExtensions(self) -> list[str]:
    """List the file extensions (leading dot included) declared for XML."""
    xmlSuffixes = [".xml", ".rss", ".atom"]
    return xmlSuffixes
|
||||||
|
|
||||||
|
def getSupportedMimeTypes(self) -> list[str]:
    """List the MIME types declared for XML."""
    xmlMimes = ["application/xml"]
    return xmlMimes
|
||||||
|
|
||||||
def extract(self, fileBytes: bytes, context: Dict[str, Any]) -> List[ContentPart]:
|
def extract(self, fileBytes: bytes, context: Dict[str, Any]) -> List[ContentPart]:
|
||||||
mimeType = context.get("mimeType") or "application/xml"
|
mimeType = context.get("mimeType") or "application/xml"
|
||||||
|
|
|
||||||
|
|
@ -7,11 +7,31 @@ logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
class Extractor:
    """
    Common parent of every document extractor.

    Subclasses are expected to override:
    - detect(): decide whether the extractor can handle a given file
    - extract(): turn the file bytes into content parts
    - getSupportedExtensions(): list the file extensions it supports
    - getSupportedMimeTypes(): list the MIME types it supports
    """

    def detect(self, fileName: str, mimeType: str, headBytes: bytes) -> bool:
        """Report whether this extractor handles the file; the base never does."""
        canHandle = False
        return canHandle

    def extract(self, fileBytes: bytes, context: Dict[str, Any]) -> list[ContentPart]:
        """Extract content from the file bytes.

        Raises:
            NotImplementedError: always, on this base class.
        """
        raise NotImplementedError()

    def getSupportedExtensions(self) -> list[str]:
        """Return the supported file extensions (dots included); none here."""
        return list()

    def getSupportedMimeTypes(self) -> list[str]:
        """Return the supported MIME types; none here."""
        return list()
|
||||||
|
|
||||||
|
|
||||||
class Chunker:
|
class Chunker:
|
||||||
|
|
@ -23,55 +43,85 @@ class ExtractorRegistry:
|
||||||
def __init__(self):
    """Create an empty registry, then populate it through auto-discovery."""
    self._fallback: Optional[Extractor] = None
    self._map: Dict[str, Extractor] = {}
    # Discovery registers the extractors and assigns the fallback.
    self._auto_discover_extractors()
|
||||||
|
|
||||||
|
def _auto_discover_extractors(self):
    """Auto-discover and register all extractors from the extractors directory.

    Every ``extractor*.py`` module in the ``extractors`` directory next to
    this file is imported, each public ``Extractor`` subclass found in it is
    instantiated, and the instances are registered under the MIME types and
    extensions they declare. ``BinaryExtractor`` is then set as fallback.

    Nothing is raised: a module that fails to import (or whose extractor
    fails to instantiate) is logged and skipped, and any other error is
    logged together with its traceback.
    """
    try:
        import importlib
        from pathlib import Path

        # Get the extractors directory
        extractors_dir = Path(__file__).parent / "extractors"
        if not extractors_dir.exists():
            logger.error(f"Extractors directory not found: {extractors_dir}")
            return

        # Resolve the sub-package relative to this module so the registry
        # keeps working if the package is moved; fall back to the historic
        # absolute path when no package context is available.
        if __package__:
            extractors_package = f"{__package__}.extractors"
        else:
            extractors_package = "modules.services.serviceExtraction.extractors"

        extractor_modules = []
        discovered = []  # (declared format count, extractor instance)

        # glob() yields files in arbitrary order; sort for reproducibility.
        for file_path in sorted(extractors_dir.glob("extractor*.py")):
            module_name = file_path.stem
            try:
                # Import the module
                module = importlib.import_module(f".{module_name}", package=extractors_package)

                # Find all extractor classes in the module
                for attr_name in dir(module):
                    attr = getattr(module, attr_name)
                    if (isinstance(attr, type) and
                            issubclass(attr, Extractor) and
                            attr != Extractor and
                            not attr_name.startswith('_')):
                        extractor_instance = attr()
                        breadth = (len(extractor_instance.getSupportedMimeTypes()) +
                                   len(extractor_instance.getSupportedExtensions()))
                        discovered.append((breadth, extractor_instance))
                        extractor_modules.append(attr_name)

            except Exception as e:
                logger.warning(f"Failed to import {module_name}: {str(e)}")
                continue

        # register() overwrites existing keys, so the last registration wins.
        # Register the broadest extractors first: when a generic extractor and
        # a dedicated one declare the same key (e.g. ".csv" or "text/html"),
        # the dedicated one ends up owning it regardless of file order.
        # list.sort is stable, so ties keep their discovery order.
        discovered.sort(key=lambda entry: entry[0], reverse=True)
        for _, extractor_instance in discovered:
            self._auto_register_extractor(extractor_instance)

        # Set fallback extractor
        try:
            from .extractors.extractorBinary import BinaryExtractor
            self.setFallback(BinaryExtractor())
        except Exception as e:
            logger.warning(f"Failed to set fallback extractor: {str(e)}")

        logger.info(f"ExtractorRegistry: Auto-discovered and registered {len(extractor_modules)} extractor classes: {', '.join(extractor_modules)}")
        logger.info(f"ExtractorRegistry: Total registered formats: {len(self._map)}")

    except Exception as e:
        # exc_info routes the traceback through logging instead of printing
        # it straight to stderr.
        logger.error(f"ExtractorRegistry: Failed to auto-discover extractors: {str(e)}", exc_info=True)
|
||||||
|
|
||||||
|
def _auto_register_extractor(self, extractor: Extractor):
    """Register ``extractor`` under every format it declares.

    Each declared MIME type is used as a registry key as-is; each declared
    extension is registered with its leading dot(s) stripped. Any exception
    raised along the way is logged and swallowed.
    """
    ownerName = extractor.__class__.__name__
    try:
        # MIME types first
        for declaredMime in extractor.getSupportedMimeTypes():
            self.register(declaredMime, extractor)
            logger.debug(f"Registered MIME type: {declaredMime} → {ownerName}")

        # Then file extensions; registry keys carry no leading dot
        for dottedExt in extractor.getSupportedExtensions():
            bareExt = dottedExt.lstrip('.')
            self.register(bareExt, extractor)
            logger.debug(f"Registered extension: .{bareExt} → {ownerName}")

    except Exception as e:
        logger.error(f"Failed to auto-register {ownerName}: {str(e)}")
|
||||||
|
|
||||||
def register(self, key: str, extractor: Extractor):
|
def register(self, key: str, extractor: Extractor):
|
||||||
self._map[key] = extractor
|
self._map[key] = extractor
|
||||||
|
|
@ -88,6 +138,43 @@ class ExtractorRegistry:
|
||||||
if ext in self._map:
|
if ext in self._map:
|
||||||
return self._map[ext]
|
return self._map[ext]
|
||||||
return self._fallback
|
return self._fallback
|
||||||
|
|
||||||
|
def getAllSupportedFormats(self) -> Dict[str, Dict[str, list[str]]]:
    """
    Get all supported formats from all registered extractors.

    Results are keyed by extractor class name. An extractor that is
    registered under several keys is reported once. Extractors whose
    declared list is empty are left out of the corresponding section.
    The fallback extractor, when set, is always reported under the key
    "fallback", even if its lists are empty.

    Returns:
        Dictionary with format information:
        {
            "extensions": {
                "extractor_name": [".ext1", ".ext2", ...]
            },
            "mime_types": {
                "extractor_name": ["mime/type1", "mime/type2", ...]
            }
        }
    """
    formats = {"extensions": {}, "mime_types": {}}

    # Get formats from registered extractors. The same extractor appears in
    # the map once per registered key, so track which names were handled.
    seen_names = set()
    for extractor in self._map.values():
        extractor_name = extractor.__class__.__name__
        if extractor_name in seen_names:
            continue
        seen_names.add(extractor_name)

        if hasattr(extractor, 'getSupportedExtensions'):
            extensions = extractor.getSupportedExtensions()
            if extensions:
                formats["extensions"][extractor_name] = extensions

        if hasattr(extractor, 'getSupportedMimeTypes'):
            mime_types = extractor.getSupportedMimeTypes()
            if mime_types:
                formats["mime_types"][extractor_name] = mime_types

    # Add fallback extractor info
    if self._fallback and hasattr(self._fallback, 'getSupportedExtensions'):
        formats["extensions"]["fallback"] = self._fallback.getSupportedExtensions()
    if self._fallback and hasattr(self._fallback, 'getSupportedMimeTypes'):
        formats["mime_types"]["fallback"] = self._fallback.getSupportedMimeTypes()

    return formats
|
||||||
|
|
||||||
|
|
||||||
class ChunkerRegistry:
|
class ChunkerRegistry:
|
||||||
|
|
|
||||||
|
|
@ -51,7 +51,24 @@ async def process_documents_and_generate_summary():
|
||||||
return False
|
return False
|
||||||
|
|
||||||
# Find all supported document files
|
# Find all supported document files
|
||||||
supported_extensions = ["*.pdf", "*.jpg", "*.jpeg", "*.png", "*.gif", "*.docx", "*.xlsx", "*.pptx", "*.ppt", "*.txt", "*.md", "*.html", "*.csv"]
|
# Glob patterns for the files to pick up. Every pattern must appear only
# once: each pattern is globbed separately and the results are concatenated,
# so a repeated pattern would add the same files twice.
supported_extensions = [
    # Document formats
    "*.pdf", "*.docx", "*.xlsx", "*.pptx", "*.ppt",
    # Image formats
    "*.jpg", "*.jpeg", "*.png", "*.gif", "*.webp", "*.bmp", "*.tiff",
    # Text and code files
    "*.txt", "*.md", "*.log", "*.rtf", "*.tex", "*.rst", "*.adoc", "*.org", "*.pod",
    "*.java", "*.js", "*.jsx", "*.ts", "*.tsx", "*.py", "*.rb", "*.go", "*.rs", "*.cpp", "*.c", "*.h", "*.hpp", "*.cc", "*.cxx",
    "*.cs", "*.php", "*.swift", "*.kt", "*.scala", "*.clj", "*.hs", "*.ml", "*.fs", "*.vb", "*.dart", "*.r", "*.m", "*.pl", "*.sh",
    "*.html", "*.htm", "*.css", "*.scss", "*.sass", "*.less", "*.vue", "*.svelte",
    "*.config", "*.ini", "*.cfg", "*.conf", "*.properties", "*.yaml", "*.yml", "*.toml", "*.json", "*.xml",
    "*.bat", "*.ps1", "*.psm1", "*.psd1", "*.vbs", "*.wsf", "*.cmd", "*.com",
    "*.csv", "*.tsv", "*.tab", "*.dat", "*.data",
    # "*.m" and "*.r" are intentionally not repeated here; they are already
    # listed two rows above.
    "*.man", "*.1", "*.2", "*.3", "*.4", "*.5", "*.6", "*.7", "*.8", "*.9", "*.n", "*.l", "*.t", "*.x", "*.y", "*.z",
    "*.diff", "*.patch", "*.gitignore", "*.dockerignore", "*.editorconfig", "*.gitattributes",
    "*.env", "*.env.local", "*.env.development", "*.env.production", "*.env.test",
    "*.lock", "*.lockb", "*.lockfile", "*.pkg-lock", "*.yarn-lock",
]
|
||||||
document_files = []
|
document_files = []
|
||||||
for ext in supported_extensions:
|
for ext in supported_extensions:
|
||||||
document_files.extend(list(testdata_path.glob(ext)))
|
document_files.extend(list(testdata_path.glob(ext)))
|
||||||
|
|
@ -164,6 +181,8 @@ async def process_documents_and_generate_summary():
|
||||||
mime_type = "text/html"
|
mime_type = "text/html"
|
||||||
elif doc_file.suffix.lower() == '.csv':
|
elif doc_file.suffix.lower() == '.csv':
|
||||||
mime_type = "text/csv"
|
mime_type = "text/csv"
|
||||||
|
elif doc_file.suffix.lower() == '.json':
|
||||||
|
mime_type = "application/json"
|
||||||
elif doc_file.suffix.lower() in ['.txt', '.md']:
|
elif doc_file.suffix.lower() in ['.txt', '.md']:
|
||||||
mime_type = "text/plain"
|
mime_type = "text/plain"
|
||||||
|
|
||||||
|
|
@ -199,7 +218,7 @@ async def process_documents_and_generate_summary():
|
||||||
# Run a single end-to-end test to avoid the loop issue
|
# Run a single end-to-end test to avoid the loop issue
|
||||||
logger.info("🧪 Running single end-to-end test...")
|
logger.info("🧪 Running single end-to-end test...")
|
||||||
|
|
||||||
userPrompt = "Analyze these documents and create a comprehensive summary for all input documents, each input document in a separate chapter summarized in 10-20 sentences."
|
userPrompt = "Analyze the document containing mails for customer use cases. Can you create one file for each email in plain text format?"
|
||||||
|
|
||||||
# userPrompt = "Analyze these documents and create a fitting image for the content"
|
# userPrompt = "Analyze these documents and create a fitting image for the content"
|
||||||
|
|
||||||
|
|
@ -215,8 +234,8 @@ async def process_documents_and_generate_summary():
|
||||||
prompt=userPrompt,
|
prompt=userPrompt,
|
||||||
documents=documents,
|
documents=documents,
|
||||||
options=ai_options,
|
options=ai_options,
|
||||||
outputFormat="docx",
|
outputFormat="txt",
|
||||||
title="Formulaire"
|
title="Kunden und Use Cases"
|
||||||
)
|
)
|
||||||
|
|
||||||
logger.info(f"✅ End-to-end test completed successfully")
|
logger.info(f"✅ End-to-end test completed successfully")
|
||||||
|
|
|
||||||
117
test_extractor_formats.py
Normal file
117
test_extractor_formats.py
Normal file
|
|
@ -0,0 +1,117 @@
|
||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
Test script to demonstrate enhanced extractor format support.
|
||||||
|
Shows all supported file extensions and MIME types for each extractor.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import sys
|
||||||
|
import os
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
# Add the gateway module to the path
|
||||||
|
sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'modules'))
|
||||||
|
|
||||||
|
from modules.services.serviceExtraction.subRegistry import ExtractorRegistry
|
||||||
|
|
||||||
|
def test_extractor_formats():
    """Test and display all supported formats from extractors.

    Builds an ``ExtractorRegistry`` and prints, in order:

    1. the extensions and MIME types returned by
       ``registry.getAllSupportedFormats()``;
    2. the declared formats of each registered extractor (each extractor
       instance is printed once, even when registered under many keys);
    3. the extractor class that ``registry.resolve`` selects for a fixed
       list of sample file names and MIME types.

    The function only prints; it makes no assertions and returns None.
    """
    print("🔍 Testing Plug-and-Play Extractor System")
    print("=" * 60)

    # Create registry
    registry = ExtractorRegistry()

    # Get all supported formats
    formats = registry.getAllSupportedFormats()

    print("\n📋 Supported File Extensions by Extractor:")
    print("-" * 50)
    for extractor_name, extensions in formats["extensions"].items():
        if extensions:
            print(f" {extractor_name:20} → {', '.join(extensions)}")
        else:
            print(f" {extractor_name:20} → (all extensions - fallback)")

    print("\n📋 Supported MIME Types by Extractor:")
    print("-" * 50)
    for extractor_name, mime_types in formats["mime_types"].items():
        if mime_types:
            print(f" {extractor_name:20} → {', '.join(mime_types)}")
        else:
            print(f" {extractor_name:20} → (all MIME types - fallback)")

    # Test individual extractors
    print("\n🧪 Testing Individual Extractors:")
    print("-" * 50)

    # Get all registered extractors. The registry holds one entry per
    # registered key, so skip instances that were already printed.
    printed_ids = set()
    for extractor in registry._map.values():
        if id(extractor) in printed_ids:
            continue
        printed_ids.add(id(extractor))
        if hasattr(extractor, 'getSupportedExtensions') and hasattr(extractor, 'getSupportedMimeTypes'):
            extensions = extractor.getSupportedExtensions()
            mime_types = extractor.getSupportedMimeTypes()
            print(f"\n {extractor.__class__.__name__}:")
            print(f" Extensions: {extensions}")
            print(f" MIME Types: {mime_types}")

    # Test detection with various file types
    print("\n🔬 Testing File Detection:")
    print("-" * 50)

    test_files = [
        # Document formats
        ("document.pdf", "application/pdf"),
        ("spreadsheet.xlsx", "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"),
        ("presentation.pptx", "application/vnd.openxmlformats-officedocument.presentationml.presentation"),
        ("document.docx", "application/vnd.openxmlformats-officedocument.wordprocessingml.document"),

        # Text and code files
        ("readme.txt", "text/plain"),
        ("readme.md", "text/markdown"),
        ("app.log", "text/plain"),
        ("Main.java", "text/x-java-source"),
        ("script.js", "text/javascript"),
        ("component.tsx", "text/typescript"),
        ("main.py", "text/x-python"),
        ("config.yaml", "text/x-yaml"),
        ("package.json", "application/json"),
        ("data.csv", "text/csv"),
        ("config.xml", "application/xml"),
        ("webpage.html", "text/html"),
        ("styles.css", "text/css"),
        ("script.sh", "text/x-sh"),
        ("Dockerfile", "text/plain"),
        (".gitignore", "text/plain"),
        ("app.config", "text/plain"),
        ("database.sql", "text/x-sql"),
        ("schema.ddl", "application/sql"),

        # Images
        ("image.png", "image/png"),
        ("photo.jpg", "image/jpeg"),

        # Unknown
        ("unknown.xyz", "application/octet-stream")
    ]

    for filename, mime_type in test_files:
        extractor = registry.resolve(mime_type, filename)
        if extractor:
            print(f" {filename:25} ({mime_type:50}) → {extractor.__class__.__name__}")
        else:
            print(f" {filename:25} ({mime_type:50}) → No extractor found")

    print("\n✅ Plug-and-Play extractor system test completed!")
    print("\nKey improvements:")
    print(" • 🔌 TRUE PLUG-AND-PLAY: Just add extractor file, it auto-registers!")
    print(" • 📋 No more manual registration of file types")
    print(" • 🔍 Auto-discovery scans extractors directory")
    print(" • 📝 Each extractor declares its own supported formats")
    print(" • 🚀 Easy to add new file types - just create new extractor")
    print(" • 🧹 Clean, maintainable code with no redundancy")
    print("\nTo add a new file type:")
    print(" 1. Create extractorXyz.py in extractors/ directory")
    print(" 2. Implement Extractor interface with getSupportedExtensions()")
    print(" 3. That's it! No registry changes needed!")


if __name__ == "__main__":
    test_extractor_formats()
|
||||||
Loading…
Reference in a new issue