AI system tested for all file types
This commit is contained in:
parent
2e471ca3f7
commit
0c357dc8a9
15 changed files with 588 additions and 52 deletions
|
|
@ -7,9 +7,29 @@ from ..subRegistry import Extractor
|
|||
|
||||
|
||||
class BinaryExtractor(Extractor):
|
||||
"""
|
||||
Fallback extractor for unsupported file types.
|
||||
|
||||
This extractor handles any file type that doesn't match other extractors.
|
||||
It encodes the file as base64 and marks it as binary data.
|
||||
|
||||
Supported formats:
|
||||
- All file types (fallback)
|
||||
- MIME types: application/octet-stream (default)
|
||||
- File extensions: All (fallback)
|
||||
"""
|
||||
|
||||
def detect(self, fileName: str, mimeType: str, headBytes: bytes) -> bool:
|
||||
return True
|
||||
|
||||
def getSupportedExtensions(self) -> list[str]:
|
||||
"""Return list of supported file extensions (all)."""
|
||||
return [] # Accepts all extensions as fallback
|
||||
|
||||
def getSupportedMimeTypes(self) -> list[str]:
|
||||
"""Return list of supported MIME types (all)."""
|
||||
return [] # Accepts all MIME types as fallback
|
||||
|
||||
def extract(self, fileBytes: bytes, context: Dict[str, Any]) -> List[ContentPart]:
|
||||
mimeType = context.get("mimeType") or "application/octet-stream"
|
||||
return [ContentPart(
|
||||
|
|
|
|||
|
|
@ -6,9 +6,26 @@ from ..subRegistry import Extractor
|
|||
|
||||
|
||||
class CsvExtractor(Extractor):
|
||||
"""
|
||||
Extractor for CSV files.
|
||||
|
||||
Supported formats:
|
||||
- MIME types: text/csv
|
||||
- File extensions: .csv
|
||||
- Special handling: Treats as table data
|
||||
"""
|
||||
|
||||
def detect(self, fileName: str, mimeType: str, headBytes: bytes) -> bool:
|
||||
return mimeType == "text/csv" or (fileName or "").lower().endswith(".csv")
|
||||
|
||||
def getSupportedExtensions(self) -> list[str]:
|
||||
"""Return list of supported file extensions."""
|
||||
return [".csv"]
|
||||
|
||||
def getSupportedMimeTypes(self) -> list[str]:
|
||||
"""Return list of supported MIME types."""
|
||||
return ["text/csv"]
|
||||
|
||||
def extract(self, fileBytes: bytes, context: Dict[str, Any]) -> List[ContentPart]:
|
||||
fileName = context.get("fileName")
|
||||
mimeType = context.get("mimeType") or "text/csv"
|
||||
|
|
|
|||
|
|
@ -7,6 +7,16 @@ from ..subRegistry import Extractor
|
|||
|
||||
|
||||
class DocxExtractor(Extractor):
|
||||
"""
|
||||
Extractor for Microsoft Word documents.
|
||||
|
||||
Supported formats:
|
||||
- MIME types: application/vnd.openxmlformats-officedocument.wordprocessingml.document
|
||||
- File extensions: .docx
|
||||
- Special handling: Extracts paragraphs and tables (converts tables to CSV)
|
||||
- Dependencies: python-docx
|
||||
"""
|
||||
|
||||
def __init__(self):
|
||||
self._loaded = False
|
||||
self._haveLibs = False
|
||||
|
|
@ -25,6 +35,14 @@ class DocxExtractor(Extractor):
|
|||
def detect(self, fileName: str, mimeType: str, headBytes: bytes) -> bool:
|
||||
return mimeType == "application/vnd.openxmlformats-officedocument.wordprocessingml.document" or (fileName or "").lower().endswith(".docx")
|
||||
|
||||
def getSupportedExtensions(self) -> list[str]:
|
||||
"""Return list of supported file extensions."""
|
||||
return [".docx"]
|
||||
|
||||
def getSupportedMimeTypes(self) -> list[str]:
|
||||
"""Return list of supported MIME types."""
|
||||
return ["application/vnd.openxmlformats-officedocument.wordprocessingml.document"]
|
||||
|
||||
def extract(self, fileBytes: bytes, context: Dict[str, Any]) -> List[ContentPart]:
|
||||
self._load()
|
||||
parts: List[ContentPart] = []
|
||||
|
|
|
|||
|
|
@ -7,9 +7,27 @@ from ..subRegistry import Extractor
|
|||
|
||||
|
||||
class HtmlExtractor(Extractor):
|
||||
"""
|
||||
Extractor for HTML files.
|
||||
|
||||
Supported formats:
|
||||
- MIME types: text/html
|
||||
- File extensions: .html, .htm
|
||||
- Special handling: Uses BeautifulSoup for parsing
|
||||
- Dependencies: beautifulsoup4
|
||||
"""
|
||||
|
||||
def detect(self, fileName: str, mimeType: str, headBytes: bytes) -> bool:
|
||||
return mimeType == "text/html" or (fileName or "").lower().endswith((".html", ".htm"))
|
||||
|
||||
def getSupportedExtensions(self) -> list[str]:
|
||||
"""Return list of supported file extensions."""
|
||||
return [".html", ".htm"]
|
||||
|
||||
def getSupportedMimeTypes(self) -> list[str]:
|
||||
"""Return list of supported MIME types."""
|
||||
return ["text/html"]
|
||||
|
||||
def extract(self, fileBytes: bytes, context: Dict[str, Any]) -> List[ContentPart]:
|
||||
mimeType = context.get("mimeType") or "text/html"
|
||||
text = fileBytes.decode("utf-8", errors="replace")
|
||||
|
|
|
|||
|
|
@ -10,8 +10,26 @@ logger = logging.getLogger(__name__)
|
|||
|
||||
|
||||
class ImageExtractor(Extractor):
|
||||
"""
|
||||
Extractor for image files.
|
||||
|
||||
Supported formats:
|
||||
- MIME types: image/jpeg, image/png, image/gif, image/webp, image/bmp, image/tiff
|
||||
- File extensions: .jpg, .jpeg, .png, .gif, .webp, .bmp, .tiff
|
||||
- Special handling: GIF files are converted to PNG during extraction
|
||||
"""
|
||||
|
||||
def detect(self, fileName: str, mimeType: str, headBytes: bytes) -> bool:
    """Return True when the file looks like an image.

    A file qualifies when its MIME type starts with ``image/`` or when its
    name ends with one of the known image extensions (compared
    case-insensitively). ``headBytes`` is not inspected.
    """
    if (mimeType or "").startswith("image/"):
        return True
    # Fall back to the extension so that images delivered with a missing or
    # generic MIME type are still recognised.
    return (fileName or "").lower().endswith(
        (".jpg", ".jpeg", ".png", ".gif", ".webp", ".bmp", ".tiff")
    )
|
||||
|
||||
def getSupportedExtensions(self) -> list[str]:
|
||||
"""Return list of supported file extensions."""
|
||||
return [".jpg", ".jpeg", ".png", ".gif", ".webp", ".bmp", ".tiff"]
|
||||
|
||||
def getSupportedMimeTypes(self) -> list[str]:
|
||||
"""Return list of supported MIME types."""
|
||||
return ["image/jpeg", "image/png", "image/gif", "image/webp", "image/bmp", "image/tiff"]
|
||||
|
||||
def extract(self, fileBytes: bytes, context: Dict[str, Any]) -> List[ContentPart]:
|
||||
mimeType = context.get("mimeType") or "image/unknown"
|
||||
|
|
|
|||
|
|
@ -7,9 +7,26 @@ from ..subRegistry import Extractor
|
|||
|
||||
|
||||
class JsonExtractor(Extractor):
|
||||
"""
|
||||
Extractor for JSON files.
|
||||
|
||||
Supported formats:
|
||||
- MIME types: application/json
|
||||
- File extensions: .json
|
||||
- Special handling: Validates JSON format, falls back to text if invalid
|
||||
"""
|
||||
|
||||
def detect(self, fileName: str, mimeType: str, headBytes: bytes) -> bool:
|
||||
return mimeType == "application/json" or (fileName or "").lower().endswith(".json")
|
||||
|
||||
def getSupportedExtensions(self) -> list[str]:
|
||||
"""Return list of supported file extensions."""
|
||||
return [".json"]
|
||||
|
||||
def getSupportedMimeTypes(self) -> list[str]:
|
||||
"""Return list of supported MIME types."""
|
||||
return ["application/json"]
|
||||
|
||||
def extract(self, fileBytes: bytes, context: Dict[str, Any]) -> List[ContentPart]:
|
||||
mimeType = context.get("mimeType") or "application/json"
|
||||
text = fileBytes.decode("utf-8", errors="replace")
|
||||
|
|
|
|||
|
|
@ -8,6 +8,16 @@ from ..subRegistry import Extractor
|
|||
|
||||
|
||||
class PdfExtractor(Extractor):
|
||||
"""
|
||||
Extractor for PDF files.
|
||||
|
||||
Supported formats:
|
||||
- MIME types: application/pdf
|
||||
- File extensions: .pdf
|
||||
- Special handling: Extracts text per page and embedded images
|
||||
- Dependencies: PyPDF2, PyMuPDF (fitz)
|
||||
"""
|
||||
|
||||
def __init__(self):
|
||||
self._loaded = False
|
||||
self._haveLibs = False
|
||||
|
|
@ -27,6 +37,14 @@ class PdfExtractor(Extractor):
|
|||
def detect(self, fileName: str, mimeType: str, headBytes: bytes) -> bool:
|
||||
return mimeType == "application/pdf" or (fileName or "").lower().endswith(".pdf")
|
||||
|
||||
def getSupportedExtensions(self) -> list[str]:
|
||||
"""Return list of supported file extensions."""
|
||||
return [".pdf"]
|
||||
|
||||
def getSupportedMimeTypes(self) -> list[str]:
|
||||
"""Return list of supported MIME types."""
|
||||
return ["application/pdf"]
|
||||
|
||||
def extract(self, fileBytes: bytes, context: Dict[str, Any]) -> List[ContentPart]:
|
||||
self._load()
|
||||
parts: List[ContentPart] = []
|
||||
|
|
|
|||
|
|
@ -8,7 +8,15 @@ logger = logging.getLogger(__name__)
|
|||
|
||||
|
||||
class PptxExtractor(Extractor):
|
||||
"""Extractor for PowerPoint (.pptx) files using python-pptx library."""
|
||||
"""
|
||||
Extractor for PowerPoint files.
|
||||
|
||||
Supported formats:
|
||||
- MIME types: application/vnd.openxmlformats-officedocument.presentationml.presentation, application/vnd.ms-powerpoint
|
||||
- File extensions: .pptx, .ppt
|
||||
- Special handling: Extracts slide content, tables, and images
|
||||
- Dependencies: python-pptx
|
||||
"""
|
||||
|
||||
def __init__(self):
|
||||
self._loaded = False
|
||||
|
|
@ -31,6 +39,17 @@ class PptxExtractor(Extractor):
|
|||
"application/vnd.ms-powerpoint"
|
||||
]) or (fileName or "").lower().endswith((".pptx", ".ppt"))
|
||||
|
||||
def getSupportedExtensions(self) -> list[str]:
|
||||
"""Return list of supported file extensions."""
|
||||
return [".pptx", ".ppt"]
|
||||
|
||||
def getSupportedMimeTypes(self) -> list[str]:
|
||||
"""Return list of supported MIME types."""
|
||||
return [
|
||||
"application/vnd.openxmlformats-officedocument.presentationml.presentation",
|
||||
"application/vnd.ms-powerpoint"
|
||||
]
|
||||
|
||||
def extract(self, fileBytes: bytes, context: Dict[str, Any]) -> List[ContentPart]:
|
||||
"""
|
||||
Extract content from PowerPoint files.
|
||||
|
|
|
|||
|
|
@ -0,0 +1,56 @@
|
|||
from typing import Any, Dict, List
|
||||
|
||||
from modules.datamodels.datamodelExtraction import ContentPart
|
||||
from ..subUtils import makeId
|
||||
from ..subRegistry import Extractor
|
||||
|
||||
|
||||
class SqlExtractor(Extractor):
    """
    Extractor for SQL files.

    Supported formats:
    - MIME types: text/x-sql, application/sql
    - File extensions: .sql, .ddl, .dml, .dcl, .tcl
    - Special handling: Treats as structured text with SQL syntax
    """

    # Single source of truth for the formats used by detect() and the getters.
    _MIME_TYPES = ("text/x-sql", "application/sql")
    _EXTENSIONS = (".sql", ".ddl", ".dml", ".dcl", ".tcl")
    # Statement keywords reported as ``has_<keyword>`` flags in the metadata.
    _KEYWORDS = ("SELECT", "INSERT", "UPDATE", "DELETE", "CREATE", "DROP")

    def detect(self, fileName: str, mimeType: str, headBytes: bytes) -> bool:
        """Return True for a SQL MIME type or a SQL-like file extension.

        The file name is compared case-insensitively; ``headBytes`` is not
        inspected.
        """
        if mimeType in self._MIME_TYPES:
            return True
        return (fileName or "").lower().endswith(self._EXTENSIONS)

    def getSupportedExtensions(self) -> list[str]:
        """Return list of supported file extensions."""
        return list(self._EXTENSIONS)

    def getSupportedMimeTypes(self) -> list[str]:
        """Return list of supported MIME types."""
        return list(self._MIME_TYPES)

    def extract(self, fileBytes: bytes, context: Dict[str, Any]) -> List[ContentPart]:
        """Decode the SQL text and return it as a single "structure" part.

        The bytes are decoded as UTF-8 with undecodable sequences replaced.
        The part's metadata carries the byte size, the line count and one
        ``has_<keyword>`` flag per statement keyword found in the text.

        Args:
            fileBytes: Raw content of the file.
            context: Extraction context; only ``"mimeType"`` is read, falling
                back to ``"text/x-sql"`` when absent or empty.

        Returns:
            A one-element list holding the ContentPart for the whole file.
        """
        import re

        mimeType = context.get("mimeType") or "text/x-sql"
        data = fileBytes.decode("utf-8", errors="replace")

        # Match whole words in one case-insensitive pass. A plain substring
        # test would also fire on identifiers such as UPDATED_AT or DROPDOWN.
        pattern = r"\b(" + "|".join(self._KEYWORDS) + r")\b"
        found = {word.upper() for word in re.findall(pattern, data, re.IGNORECASE)}

        # Add SQL-specific metadata
        metadata = {
            "size": len(fileBytes),
            "file_type": "sql",
            "line_count": len(data.splitlines()),
        }
        for keyword in self._KEYWORDS:
            metadata[f"has_{keyword.lower()}"] = keyword in found

        return [ContentPart(
            id=makeId(),
            parentId=None,
            label="main",
            typeGroup="structure",
            mimeType=mimeType,
            data=data,
            metadata=metadata,
        )]
|
||||
|
|
@ -6,8 +6,85 @@ from ..subRegistry import Extractor
|
|||
|
||||
|
||||
class TextExtractor(Extractor):
|
||||
"""
|
||||
Extractor for plain text files and code files.
|
||||
|
||||
Supported formats:
|
||||
- MIME types: text/plain, text/markdown, text/x-python, text/x-java-source, text/javascript, etc.
|
||||
- File extensions: .txt, .md, .log, .java, .js, .jsx, .ts, .tsx, .py, .config, .ini, .cfg, .conf, .properties, .yaml, .yml, .toml, .sh, .bat, .ps1, .sql, .css, .scss, .sass, .less, .xml, .json, .csv, .tsv, .rtf, .tex, .rst, .adoc, .org, .pod, .man, .1, .2, .3, .4, .5, .6, .7, .8, .9, .n, .l, .m, .r, .t, .x, .y, .z
|
||||
"""
|
||||
|
||||
def detect(self, fileName: str, mimeType: str, headBytes: bytes) -> bool:
    """Return True for text-like files.

    Any ``text/*`` MIME type is accepted. Otherwise the decision falls back
    to the file name, compared case-insensitively against the known text,
    code, configuration and data extensions. ``headBytes`` is not inspected.
    """
    # Check MIME types
    if mimeType and mimeType.startswith("text/"):
        return True

    # Check file extensions
    if not fileName:
        return False
    return fileName.lower().endswith((
        # Basic text files
        ".txt", ".md", ".log", ".rtf", ".tex", ".rst", ".adoc", ".org", ".pod",
        # Programming languages
        ".java", ".js", ".jsx", ".ts", ".tsx", ".py", ".rb", ".go", ".rs", ".cpp", ".c", ".h", ".hpp", ".cc", ".cxx",
        ".cs", ".php", ".swift", ".kt", ".scala", ".clj", ".hs", ".ml", ".fs", ".vb", ".dart", ".r", ".m", ".pl", ".sh",
        # Web technologies
        ".html", ".htm", ".css", ".scss", ".sass", ".less", ".vue", ".svelte",
        # Configuration files
        ".config", ".ini", ".cfg", ".conf", ".properties", ".yaml", ".yml", ".toml", ".json", ".xml",
        # Scripts and automation
        ".bat", ".ps1", ".psm1", ".psd1", ".vbs", ".wsf", ".cmd", ".com",
        # Data files
        ".csv", ".tsv", ".tab", ".dat", ".data",
        # Documentation (".m" and ".r" are already listed with the languages)
        ".man", ".1", ".2", ".3", ".4", ".5", ".6", ".7", ".8", ".9", ".n", ".l", ".t", ".x", ".y", ".z",
        # Other text formats
        ".diff", ".patch", ".gitignore", ".dockerignore", ".editorconfig", ".gitattributes",
        ".env", ".env.local", ".env.development", ".env.production", ".env.test",
        ".lock", ".lockb", ".lockfile", ".pkg-lock", ".yarn-lock",
    ))
|
||||
|
||||
def getSupportedExtensions(self) -> list[str]:
    """Return list of supported file extensions.

    Every extension carries its leading dot and appears exactly once.
    """
    return [
        # Basic text files
        ".txt", ".md", ".log", ".rtf", ".tex", ".rst", ".adoc", ".org", ".pod",
        # Programming languages
        ".java", ".js", ".jsx", ".ts", ".tsx", ".py", ".rb", ".go", ".rs", ".cpp", ".c", ".h", ".hpp", ".cc", ".cxx",
        ".cs", ".php", ".swift", ".kt", ".scala", ".clj", ".hs", ".ml", ".fs", ".vb", ".dart", ".r", ".m", ".pl", ".sh",
        # Web technologies
        ".html", ".htm", ".css", ".scss", ".sass", ".less", ".vue", ".svelte",
        # Configuration files
        ".config", ".ini", ".cfg", ".conf", ".properties", ".yaml", ".yml", ".toml", ".json", ".xml",
        # Scripts and automation
        ".bat", ".ps1", ".psm1", ".psd1", ".vbs", ".wsf", ".cmd", ".com",
        # Data files
        ".csv", ".tsv", ".tab", ".dat", ".data",
        # Documentation (".m" and ".r" are already listed with the languages)
        ".man", ".1", ".2", ".3", ".4", ".5", ".6", ".7", ".8", ".9", ".n", ".l", ".t", ".x", ".y", ".z",
        # Other text formats
        ".diff", ".patch", ".gitignore", ".dockerignore", ".editorconfig", ".gitattributes",
        ".env", ".env.local", ".env.development", ".env.production", ".env.test",
        ".lock", ".lockb", ".lockfile", ".pkg-lock", ".yarn-lock",
    ]
|
||||
|
||||
def getSupportedMimeTypes(self) -> list[str]:
|
||||
"""Return list of supported MIME types."""
|
||||
return [
|
||||
"text/plain", "text/markdown", "text/x-python", "text/x-java-source",
|
||||
"text/javascript", "text/x-javascript", "text/typescript", "text/x-typescript",
|
||||
"text/x-c", "text/x-c++", "text/x-csharp", "text/x-php", "text/x-ruby",
|
||||
"text/x-go", "text/x-rust", "text/x-scala", "text/x-swift", "text/x-kotlin",
|
||||
"text/x-sql", "text/x-sh", "text/x-shellscript", "text/x-yaml", "text/x-toml",
|
||||
"text/x-ini", "text/x-config", "text/x-properties", "text/x-log",
|
||||
"text/html", "text/css", "text/x-scss", "text/x-sass", "text/x-less",
|
||||
"text/xml", "text/csv", "text/tab-separated-values", "text/rtf",
|
||||
"text/x-tex", "text/x-rst", "text/x-asciidoc", "text/x-org",
|
||||
"application/x-yaml", "application/x-toml", "application/x-ini",
|
||||
"application/x-config", "application/x-properties", "application/x-log"
|
||||
]
|
||||
|
||||
def extract(self, fileBytes: bytes, context: Dict[str, Any]) -> List[ContentPart]:
|
||||
fileName = context.get("fileName")
|
||||
|
|
|
|||
|
|
@ -8,6 +8,16 @@ from ..subRegistry import Extractor
|
|||
|
||||
|
||||
class XlsxExtractor(Extractor):
|
||||
"""
|
||||
Extractor for Microsoft Excel spreadsheets.
|
||||
|
||||
Supported formats:
|
||||
- MIME types: application/vnd.openxmlformats-officedocument.spreadsheetml.sheet
|
||||
- File extensions: .xlsx, .xlsm
|
||||
- Special handling: Extracts all sheets as CSV data
|
||||
- Dependencies: openpyxl
|
||||
"""
|
||||
|
||||
def __init__(self):
|
||||
self._loaded = False
|
||||
self._haveLibs = False
|
||||
|
|
@ -27,6 +37,14 @@ class XlsxExtractor(Extractor):
|
|||
mt = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
|
||||
return mimeType == mt or (fileName or "").lower().endswith((".xlsx", ".xlsm"))
|
||||
|
||||
def getSupportedExtensions(self) -> list[str]:
|
||||
"""Return list of supported file extensions."""
|
||||
return [".xlsx", ".xlsm"]
|
||||
|
||||
def getSupportedMimeTypes(self) -> list[str]:
|
||||
"""Return list of supported MIME types."""
|
||||
return ["application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"]
|
||||
|
||||
def extract(self, fileBytes: bytes, context: Dict[str, Any]) -> List[ContentPart]:
|
||||
self._load()
|
||||
parts: List[ContentPart] = []
|
||||
|
|
|
|||
|
|
@ -7,9 +7,26 @@ from ..subRegistry import Extractor
|
|||
|
||||
|
||||
class XmlExtractor(Extractor):
|
||||
"""
|
||||
Extractor for XML files.
|
||||
|
||||
Supported formats:
|
||||
- MIME types: application/xml
|
||||
- File extensions: .xml, .rss, .atom
|
||||
- Special handling: Uses ElementTree for parsing
|
||||
"""
|
||||
|
||||
def detect(self, fileName: str, mimeType: str, headBytes: bytes) -> bool:
|
||||
return mimeType == "application/xml" or (fileName or "").lower().endswith((".xml", ".rss", ".atom"))
|
||||
|
||||
def getSupportedExtensions(self) -> list[str]:
|
||||
"""Return list of supported file extensions."""
|
||||
return [".xml", ".rss", ".atom"]
|
||||
|
||||
def getSupportedMimeTypes(self) -> list[str]:
|
||||
"""Return list of supported MIME types."""
|
||||
return ["application/xml"]
|
||||
|
||||
def extract(self, fileBytes: bytes, context: Dict[str, Any]) -> List[ContentPart]:
|
||||
mimeType = context.get("mimeType") or "application/xml"
|
||||
text = fileBytes.decode("utf-8", errors="replace")
|
||||
|
|
|
|||
|
|
@ -7,12 +7,32 @@ logger = logging.getLogger(__name__)
|
|||
|
||||
|
||||
class Extractor:
    """
    Common interface shared by all document extractors.

    A concrete extractor is expected to provide:
    - detect(): decide whether it can handle a given file
    - extract(): produce content parts from the file bytes
    - getSupportedExtensions(): the file extensions it handles
    - getSupportedMimeTypes(): the MIME types it handles
    """

    def detect(self, fileName: str, mimeType: str, headBytes: bytes) -> bool:
        """Tell whether this extractor can handle the file; the base never can."""
        return False

    def extract(self, fileBytes: bytes, context: Dict[str, Any]) -> list[ContentPart]:
        """Turn the file bytes into content parts; subclasses must override."""
        raise NotImplementedError

    def getSupportedExtensions(self) -> list[str]:
        """Supported file extensions, each with its leading dot (none here)."""
        return list()

    def getSupportedMimeTypes(self) -> list[str]:
        """Supported MIME types (none here)."""
        return list()
|
||||
|
||||
|
||||
class Chunker:
|
||||
def chunk(self, part: ContentPart, options: Dict[str, Any]) -> list[Dict[str, Any]]:
|
||||
|
|
@ -23,56 +43,86 @@ class ExtractorRegistry:
|
|||
def __init__(self):
    """Create an empty registry and fill it through auto-discovery."""
    # No fallback until discovery installs one.
    self._fallback: Optional[Extractor] = None
    # Registry keys (MIME types and extensions) mapped to extractor instances.
    self._map: Dict[str, Extractor] = dict()
    # Register built-ins
    self._auto_discover_extractors()
|
||||
|
||||
def _auto_discover_extractors(self):
    """Auto-discover and register all extractors from the extractors directory.

    Every ``extractor*.py`` module in the ``extractors`` directory next to
    this file is imported. Each public ``Extractor`` subclass defined in such
    a module is instantiated and registered under the formats it declares.
    Finally ``BinaryExtractor`` is installed as the fallback.

    A module that fails to import or instantiate is logged and skipped. Any
    other failure is logged with its traceback; this method does not raise.
    """
    try:
        import importlib
        from pathlib import Path

        # Get the extractors directory
        extractors_dir = Path(__file__).parent / "extractors"
        if not extractors_dir.exists():
            logger.error(f"Extractors directory not found: {extractors_dir}")
            return

        # Import relative to the package this module was actually loaded
        # from, so the Extractor subclass check below compares against the
        # same class object even when the package is imported under another
        # name. Keep the historical absolute name as a last resort.
        if __package__:
            extractors_package = f"{__package__}.extractors"
        else:
            extractors_package = "modules.services.serviceExtraction.extractors"

        # Import all extractor modules. The list is sorted because glob()
        # order depends on the filesystem and register() lets the last
        # registration win for a key.
        # NOTE(review): several extractors declare overlapping formats
        # (e.g. ".csv", ".json", "text/html"); confirm the intended winner.
        extractor_modules = []
        for file_path in sorted(extractors_dir.glob("extractor*.py")):
            module_name = file_path.stem
            try:
                # Import the module
                module = importlib.import_module(f".{module_name}", package=extractors_package)

                # Find all extractor classes in the module
                for attr_name in dir(module):
                    if attr_name.startswith('_'):
                        continue
                    attr = getattr(module, attr_name)
                    if not (isinstance(attr, type) and issubclass(attr, Extractor)):
                        continue
                    # Skip the base class and classes merely imported into
                    # this module; those are registered from their own module.
                    if attr is Extractor or attr.__module__ != module.__name__:
                        continue

                    # Create instance and auto-register
                    self._auto_register_extractor(attr())
                    extractor_modules.append(attr_name)

            except Exception as e:
                logger.warning(f"Failed to import {module_name}: {str(e)}")
                continue

        # Set fallback extractor
        try:
            from .extractors.extractorBinary import BinaryExtractor
            self.setFallback(BinaryExtractor())
        except Exception as e:
            logger.warning(f"Failed to set fallback extractor: {str(e)}")

        logger.info(f"ExtractorRegistry: Auto-discovered and registered {len(extractor_modules)} extractor classes: {', '.join(extractor_modules)}")
        logger.info(f"ExtractorRegistry: Total registered formats: {len(self._map)}")

    except Exception as e:
        # logger.exception records the traceback through the logging setup
        # instead of printing it to stderr.
        logger.exception(f"ExtractorRegistry: Failed to auto-discover extractors: {str(e)}")
|
||||
|
||||
def _auto_register_extractor(self, extractor: Extractor):
    """Auto-register an extractor based on its declared supported formats.

    Each MIME type from ``getSupportedMimeTypes()`` is registered as-is, and
    each extension from ``getSupportedExtensions()`` is registered without
    its leading dot. When a key is already owned by a different extractor
    class, the replacement is logged as a warning before the new extractor
    takes over. Errors are logged and not raised.

    Args:
        extractor: The extractor instance to register.
    """
    name = extractor.__class__.__name__

    def _put(key: str, message: str) -> None:
        # Surface clashes between extractors instead of replacing silently.
        previous = self._map.get(key)
        if previous is not None and previous.__class__ is not extractor.__class__:
            logger.warning(
                f"Registry key '{key}' moves from {previous.__class__.__name__} to {name}"
            )
        self.register(key, extractor)
        logger.debug(message)

    try:
        # Register MIME types
        for mime_type in extractor.getSupportedMimeTypes():
            _put(mime_type, f"Registered MIME type: {mime_type} → {name}")

        # Register file extensions
        for ext in extractor.getSupportedExtensions():
            # Remove leading dot for registry key
            ext_key = ext.lstrip('.')
            _put(ext_key, f"Registered extension: .{ext_key} → {name}")

    except Exception as e:
        logger.error(f"Failed to auto-register {name}: {str(e)}")
|
||||
|
||||
def register(self, key: str, extractor: Extractor):
    """Bind ``key`` to ``extractor``, replacing any extractor already stored under it."""
    self._map.update({key: extractor})
|
||||
|
||||
|
|
@ -89,6 +139,43 @@ class ExtractorRegistry:
|
|||
return self._map[ext]
|
||||
return self._fallback
|
||||
|
||||
def getAllSupportedFormats(self) -> Dict[str, Dict[str, list[str]]]:
    """
    Get all supported formats from all registered extractors.

    Results are keyed by extractor class name. An extractor registered
    under many keys is queried and reported once. Extractors that declare
    no extensions or no MIME types are left out of the respective mapping.
    The fallback extractor, when set, is reported under ``"fallback"`` even
    if its lists are empty.

    Returns:
        Dictionary with format information:
        {
            "extensions": {
                "extractor_name": [".ext1", ".ext2", ...]
            },
            "mime_types": {
                "extractor_name": ["mime/type1", "mime/type2", ...]
            }
        }
    """
    formats: Dict[str, Dict[str, list[str]]] = {"extensions": {}, "mime_types": {}}

    # Get formats from registered extractors. The same instance is stored
    # under every key it handles, so track identities to visit each once.
    seen = set()
    for extractor in self._map.values():
        if id(extractor) in seen:
            continue
        seen.add(id(extractor))
        name = extractor.__class__.__name__

        if hasattr(extractor, 'getSupportedExtensions'):
            extensions = extractor.getSupportedExtensions()
            if extensions:
                formats["extensions"][name] = extensions

        if hasattr(extractor, 'getSupportedMimeTypes'):
            mime_types = extractor.getSupportedMimeTypes()
            if mime_types:
                formats["mime_types"][name] = mime_types

    # Add fallback extractor info
    fallback = self._fallback
    if fallback and hasattr(fallback, 'getSupportedExtensions'):
        formats["extensions"]["fallback"] = fallback.getSupportedExtensions()
    if fallback and hasattr(fallback, 'getSupportedMimeTypes'):
        formats["mime_types"]["fallback"] = fallback.getSupportedMimeTypes()

    return formats
|
||||
|
||||
|
||||
class ChunkerRegistry:
|
||||
def __init__(self):
|
||||
|
|
|
|||
|
|
@ -51,7 +51,24 @@ async def process_documents_and_generate_summary():
|
|||
return False
|
||||
|
||||
# Find all supported document files
|
||||
supported_extensions = ["*.pdf", "*.jpg", "*.jpeg", "*.png", "*.gif", "*.docx", "*.xlsx", "*.pptx", "*.ppt", "*.txt", "*.md", "*.html", "*.csv"]
|
||||
supported_extensions = [
|
||||
# Document formats
|
||||
"*.pdf", "*.docx", "*.xlsx", "*.pptx", "*.ppt",
|
||||
# Image formats
|
||||
"*.jpg", "*.jpeg", "*.png", "*.gif", "*.webp", "*.bmp", "*.tiff",
|
||||
# Text and code files
|
||||
"*.txt", "*.md", "*.log", "*.rtf", "*.tex", "*.rst", "*.adoc", "*.org", "*.pod",
|
||||
"*.java", "*.js", "*.jsx", "*.ts", "*.tsx", "*.py", "*.rb", "*.go", "*.rs", "*.cpp", "*.c", "*.h", "*.hpp", "*.cc", "*.cxx",
|
||||
"*.cs", "*.php", "*.swift", "*.kt", "*.scala", "*.clj", "*.hs", "*.ml", "*.fs", "*.vb", "*.dart", "*.r", "*.m", "*.pl", "*.sh",
|
||||
"*.html", "*.htm", "*.css", "*.scss", "*.sass", "*.less", "*.vue", "*.svelte",
|
||||
"*.config", "*.ini", "*.cfg", "*.conf", "*.properties", "*.yaml", "*.yml", "*.toml", "*.json", "*.xml",
|
||||
"*.bat", "*.ps1", "*.psm1", "*.psd1", "*.vbs", "*.wsf", "*.cmd", "*.com",
|
||||
"*.csv", "*.tsv", "*.tab", "*.dat", "*.data",
|
||||
"*.man", "*.1", "*.2", "*.3", "*.4", "*.5", "*.6", "*.7", "*.8", "*.9", "*.n", "*.l", "*.m", "*.r", "*.t", "*.x", "*.y", "*.z",
|
||||
"*.diff", "*.patch", "*.gitignore", "*.dockerignore", "*.editorconfig", "*.gitattributes",
|
||||
"*.env", "*.env.local", "*.env.development", "*.env.production", "*.env.test",
|
||||
"*.lock", "*.lockb", "*.lockfile", "*.pkg-lock", "*.yarn-lock"
|
||||
]
|
||||
document_files = []
|
||||
for ext in supported_extensions:
|
||||
document_files.extend(list(testdata_path.glob(ext)))
|
||||
|
|
@ -164,6 +181,8 @@ async def process_documents_and_generate_summary():
|
|||
mime_type = "text/html"
|
||||
elif doc_file.suffix.lower() == '.csv':
|
||||
mime_type = "text/csv"
|
||||
elif doc_file.suffix.lower() == '.json':
|
||||
mime_type = "application/json"
|
||||
elif doc_file.suffix.lower() in ['.txt', '.md']:
|
||||
mime_type = "text/plain"
|
||||
|
||||
|
|
@ -199,7 +218,7 @@ async def process_documents_and_generate_summary():
|
|||
# Run a single end-to-end test to avoid the loop issue
|
||||
logger.info("🧪 Running single end-to-end test...")
|
||||
|
||||
userPrompt = "Analyze these documents and create a comprehensive summary for all input documents, each input document in a separate chapter summarized in 10-20 sentences."
|
||||
userPrompt = "Analyze the document containing mails for customer use cases. Can you create one file for each email in plain text format?"
|
||||
|
||||
# userPrompt = "Analyze these documents and create a fitting image for the content"
|
||||
|
||||
|
|
@ -215,8 +234,8 @@ async def process_documents_and_generate_summary():
|
|||
prompt=userPrompt,
|
||||
documents=documents,
|
||||
options=ai_options,
|
||||
outputFormat="docx",
|
||||
title="Formulaire"
|
||||
outputFormat="txt",
|
||||
title="Kunden und Use Cases"
|
||||
)
|
||||
|
||||
logger.info(f"✅ End-to-end test completed successfully")
|
||||
|
|
|
|||
117
test_extractor_formats.py
Normal file
117
test_extractor_formats.py
Normal file
|
|
@ -0,0 +1,117 @@
|
|||
#!/usr/bin/env python3
|
||||
"""
|
||||
Test script to demonstrate enhanced extractor format support.
|
||||
Shows all supported file extensions and MIME types for each extractor.
|
||||
"""
|
||||
|
||||
import sys
|
||||
import os
|
||||
from pathlib import Path
|
||||
|
||||
# Add the gateway module to the path
|
||||
sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'modules'))
|
||||
|
||||
from modules.services.serviceExtraction.subRegistry import ExtractorRegistry
|
||||
|
||||
def test_extractor_formats():
    """Test and display all supported formats from extractors.

    Builds an ExtractorRegistry, prints the extensions and MIME types it
    reports, prints each registered extractor once, and then prints which
    extractor the registry resolves for a list of sample file names and
    MIME types. Output goes to stdout only; nothing is asserted.
    """
    print("🔍 Testing Plug-and-Play Extractor System")
    print("=" * 60)

    # Create registry
    registry = ExtractorRegistry()

    # Get all supported formats
    formats = registry.getAllSupportedFormats()

    # Both tables share one layout; only heading, key and empty-note differ.
    tables = [
        ("\n📋 Supported File Extensions by Extractor:", "extensions", "(all extensions - fallback)"),
        ("\n📋 Supported MIME Types by Extractor:", "mime_types", "(all MIME types - fallback)"),
    ]
    for heading, section, empty_note in tables:
        print(heading)
        print("-" * 50)
        for extractor_name, values in formats[section].items():
            if values:
                print(f" {extractor_name:20} → {', '.join(values)}")
            else:
                print(f" {extractor_name:20} → {empty_note}")

    # Test individual extractors
    print("\n🧪 Testing Individual Extractors:")
    print("-" * 50)

    # Get all registered extractors. The registry stores the same instance
    # under every key it handles, so report each instance only once.
    reported = set()
    for extractor in registry._map.values():
        if id(extractor) in reported:
            continue
        reported.add(id(extractor))
        if hasattr(extractor, 'getSupportedExtensions') and hasattr(extractor, 'getSupportedMimeTypes'):
            extensions = extractor.getSupportedExtensions()
            mime_types = extractor.getSupportedMimeTypes()
            print(f"\n {extractor.__class__.__name__}:")
            print(f" Extensions: {extensions}")
            print(f" MIME Types: {mime_types}")

    # Test detection with various file types
    print("\n🔬 Testing File Detection:")
    print("-" * 50)

    test_files = [
        # Document formats
        ("document.pdf", "application/pdf"),
        ("spreadsheet.xlsx", "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"),
        ("presentation.pptx", "application/vnd.openxmlformats-officedocument.presentationml.presentation"),
        ("document.docx", "application/vnd.openxmlformats-officedocument.wordprocessingml.document"),

        # Text and code files
        ("readme.txt", "text/plain"),
        ("readme.md", "text/markdown"),
        ("app.log", "text/plain"),
        ("Main.java", "text/x-java-source"),
        ("script.js", "text/javascript"),
        ("component.tsx", "text/typescript"),
        ("main.py", "text/x-python"),
        ("config.yaml", "text/x-yaml"),
        ("package.json", "application/json"),
        ("data.csv", "text/csv"),
        ("config.xml", "application/xml"),
        ("webpage.html", "text/html"),
        ("styles.css", "text/css"),
        ("script.sh", "text/x-sh"),
        ("Dockerfile", "text/plain"),
        (".gitignore", "text/plain"),
        ("app.config", "text/plain"),
        ("database.sql", "text/x-sql"),
        ("schema.ddl", "application/sql"),

        # Images
        ("image.png", "image/png"),
        ("photo.jpg", "image/jpeg"),

        # Unknown
        ("unknown.xyz", "application/octet-stream"),
    ]

    for filename, mime_type in test_files:
        extractor = registry.resolve(mime_type, filename)
        if extractor:
            print(f" {filename:25} ({mime_type:50}) → {extractor.__class__.__name__}")
        else:
            print(f" {filename:25} ({mime_type:50}) → No extractor found")

    print("\n✅ Plug-and-Play extractor system test completed!")
    print("\nKey improvements:")
    print(" • 🔌 TRUE PLUG-AND-PLAY: Just add extractor file, it auto-registers!")
    print(" • 📋 No more manual registration of file types")
    print(" • 🔍 Auto-discovery scans extractors directory")
    print(" • 📝 Each extractor declares its own supported formats")
    print(" • 🚀 Easy to add new file types - just create new extractor")
    print(" • 🧹 Clean, maintainable code with no redundancy")
    print("\nTo add a new file type:")
    print(" 1. Create extractorXyz.py in extractors/ directory")
    print(" 2. Implement Extractor interface with getSupportedExtensions()")
    print(" 3. That's it! No registry changes needed!")
|
||||
|
||||
if __name__ == "__main__":
|
||||
test_extractor_formats()
|
||||
Loading…
Reference in a new issue