from typing import Any, Dict, List

from modules.datamodels.datamodelExtraction import ContentPart
from ..subUtils import makeId
from ..subRegistry import Extractor

# Single source of truth for the file-name suffixes treated as text.
# detect() and getSupportedExtensions() both read this tuple, so the two can
# no longer drift apart. Kept as a tuple because str.endswith() accepts one
# directly.
_SUPPORTED_EXTENSIONS = (
    # Basic text files
    ".txt", ".md", ".log", ".rtf", ".tex", ".rst", ".adoc", ".org", ".pod",
    # Programming languages
    ".java", ".js", ".jsx", ".ts", ".tsx", ".py", ".rb", ".go", ".rs",
    ".cpp", ".c", ".h", ".hpp", ".cc", ".cxx", ".cs", ".php", ".swift",
    ".kt", ".scala", ".clj", ".hs", ".ml", ".fs", ".vb", ".dart", ".r",
    ".m", ".pl", ".sh", ".sql",
    # Web technologies
    ".html", ".htm", ".css", ".scss", ".sass", ".less", ".vue", ".svelte",
    # Configuration files
    ".config", ".ini", ".cfg", ".conf", ".properties", ".yaml", ".yml",
    ".toml", ".json", ".xml",
    # Scripts and automation
    ".bat", ".ps1", ".psm1", ".psd1", ".vbs", ".wsf", ".cmd", ".com",
    # Data files
    ".csv", ".tsv", ".tab", ".dat", ".data",
    # Documentation (man-page sections); ".m" and ".r" are already listed
    # under programming languages and are not repeated here.
    ".man", ".1", ".2", ".3", ".4", ".5", ".6", ".7", ".8", ".9",
    ".n", ".l", ".t", ".x", ".y", ".z",
    # Other text formats
    ".diff", ".patch", ".gitignore", ".dockerignore", ".editorconfig",
    ".gitattributes", ".env", ".env.local", ".env.development",
    ".env.production", ".env.test", ".lock", ".lockb", ".lockfile",
    ".pkg-lock", ".yarn-lock",
)

# MIME types advertised by getSupportedMimeTypes(). detect() additionally
# accepts any other "text/*" type.
_SUPPORTED_MIME_TYPES = (
    "text/plain", "text/markdown", "text/x-python", "text/x-java-source",
    "text/javascript", "text/x-javascript", "text/typescript",
    "text/x-typescript", "text/x-c", "text/x-c++", "text/x-csharp",
    "text/x-php", "text/x-ruby", "text/x-go", "text/x-rust", "text/x-scala",
    "text/x-swift", "text/x-kotlin", "text/x-sql", "text/x-sh",
    "text/x-shellscript", "text/x-yaml", "text/x-toml", "text/x-ini",
    "text/x-config", "text/x-properties", "text/x-log", "text/html",
    "text/css", "text/x-scss", "text/x-sass", "text/x-less", "text/xml",
    "text/csv", "text/tab-separated-values", "text/rtf", "text/x-tex",
    "text/x-rst", "text/x-asciidoc", "text/x-org",
    "application/x-yaml", "application/x-toml", "application/x-ini",
    "application/x-config", "application/x-properties", "application/x-log",
)

# Set form of the tuple above for O(1) membership tests in detect().
_SUPPORTED_MIME_TYPE_SET = frozenset(_SUPPORTED_MIME_TYPES)


class TextExtractor(Extractor):
    """
    Extractor for plain text files and code files.

    Supported formats:
    - MIME types: every ``text/*`` type, plus the ``application/x-*`` types
      returned by ``getSupportedMimeTypes()``.
    - File extensions: the suffixes returned by ``getSupportedExtensions()``
      (.txt, .md, .log, source code, configuration, script, data and
      man-page suffixes, ...).
    """

    def detect(self, fileName: str, mimeType: str, headBytes: bytes) -> bool:
        """Return True when the file looks like text this extractor handles.

        Args:
            fileName: Name of the file; matched case-insensitively against
                the supported suffixes. May be empty or None.
            mimeType: Declared MIME type; may be empty or None.
            headBytes: Leading bytes of the file. Not inspected here.

        Returns:
            True if the MIME type is ``text/*`` or one of the advertised
            MIME types, or if the file name ends with a supported suffix.
        """
        if mimeType:
            # Strip parameters such as "; charset=utf-8" and compare in lower
            # case, so "Text/Plain; charset=utf-8" is still recognised.
            baseType = mimeType.split(";", 1)[0].strip().lower()
            if baseType.startswith("text/") or baseType in _SUPPORTED_MIME_TYPE_SET:
                return True

        if fileName:
            # The whole lower-cased name is suffix-matched, so dot-files such
            # as ".gitignore" or ".env.local" match as well.
            return fileName.lower().endswith(_SUPPORTED_EXTENSIONS)

        return False

    def getSupportedExtensions(self) -> list[str]:
        """Return list of supported file extensions (a fresh list per call)."""
        return list(_SUPPORTED_EXTENSIONS)

    def getSupportedMimeTypes(self) -> list[str]:
        """Return list of supported MIME types (a fresh list per call)."""
        return list(_SUPPORTED_MIME_TYPES)

    def extract(self, fileBytes: bytes, context: Dict[str, Any]) -> List[ContentPart]:
        """Decode the file as UTF-8 and wrap it in a single text ContentPart.

        Args:
            fileBytes: Raw file content.
            context: Extraction context; the "mimeType" key is read and
                falls back to "text/plain" when missing or empty.

        Returns:
            A one-element list holding the decoded text. ``metadata["size"]``
            is the length of ``fileBytes`` in bytes, not in characters.
        """
        mimeType = context.get("mimeType") or "text/plain"
        # errors="replace" substitutes U+FFFD for undecodable bytes instead of
        # raising, so non-UTF-8 input still yields a result.
        data = fileBytes.decode("utf-8", errors="replace")
        return [ContentPart(
            id=makeId(),
            parentId=None,
            label="main",
            typeGroup="text",
            mimeType=mimeType,
            data=data,
            metadata={"size": len(fileBytes)},
        )]