105 lines
5.3 KiB
Python
105 lines
5.3 KiB
Python
# Copyright (c) 2025 Patrick Motsch
|
|
# All rights reserved.
|
|
from typing import Any, Dict, List
|
|
|
|
from modules.datamodels.datamodelExtraction import ContentPart
|
|
from ..subUtils import makeId
|
|
from ..subRegistry import Extractor
|
|
|
|
|
|
# File-name suffixes treated as plain text / source code.
# Single source of truth shared by TextExtractor.detect() and
# TextExtractor.getSupportedExtensions() so the two can never drift apart.
# Kept as a tuple because str.endswith() accepts a tuple of suffixes directly.
_TEXT_EXTENSIONS = (
    # Basic text files
    ".txt", ".md", ".log", ".rtf", ".tex", ".rst", ".adoc", ".org", ".pod",
    # Programming languages
    ".java", ".js", ".jsx", ".ts", ".tsx", ".py", ".rb", ".go", ".rs",
    ".cpp", ".c", ".h", ".hpp", ".cc", ".cxx", ".cs", ".php", ".swift",
    ".kt", ".scala", ".clj", ".hs", ".ml", ".fs", ".vb", ".dart", ".r",
    ".m", ".pl", ".sh", ".sql",
    # Web technologies
    ".html", ".htm", ".css", ".scss", ".sass", ".less", ".vue", ".svelte",
    # Configuration files
    ".config", ".ini", ".cfg", ".conf", ".properties", ".yaml", ".yml",
    ".toml", ".json", ".xml",
    # Scripts and automation
    ".bat", ".ps1", ".psm1", ".psd1", ".vbs", ".wsf", ".cmd", ".com",
    # Data files
    ".csv", ".tsv", ".tab", ".dat", ".data",
    # Documentation (man pages and man sections)
    ".man", ".1", ".2", ".3", ".4", ".5", ".6", ".7", ".8", ".9",
    ".n", ".l", ".t", ".x", ".y", ".z",
    # Other text formats
    ".diff", ".patch", ".gitignore", ".dockerignore", ".editorconfig",
    ".gitattributes",
    ".env", ".env.local", ".env.development", ".env.production", ".env.test",
    ".lock", ".lockb", ".lockfile", ".pkg-lock", ".yarn-lock",
)

# MIME types explicitly advertised by TextExtractor.getSupportedMimeTypes().
# detect() additionally accepts every "text/*" type; the "application/*"
# entries below are only reachable through this explicit list.
_TEXT_MIME_TYPES = (
    "text/plain", "text/markdown", "text/x-python", "text/x-java-source",
    "text/javascript", "text/x-javascript", "text/typescript", "text/x-typescript",
    "text/x-c", "text/x-c++", "text/x-csharp", "text/x-php", "text/x-ruby",
    "text/x-go", "text/x-rust", "text/x-scala", "text/x-swift", "text/x-kotlin",
    "text/x-sql", "text/x-sh", "text/x-shellscript", "text/x-yaml", "text/x-toml",
    "text/x-ini", "text/x-config", "text/x-properties", "text/x-log",
    "text/html", "text/css", "text/x-scss", "text/x-sass", "text/x-less",
    "text/xml", "text/csv", "text/tab-separated-values", "text/rtf",
    "text/x-tex", "text/x-rst", "text/x-asciidoc", "text/x-org",
    "application/x-yaml", "application/x-toml", "application/x-ini",
    "application/x-config", "application/x-properties", "application/x-log",
)


class TextExtractor(Extractor):
    """
    Extractor for plain text files and code files.

    Supported formats:
    - MIME types: any ``text/*`` type plus the explicit entries returned by
      ``getSupportedMimeTypes()`` (e.g. ``application/x-yaml``).
    - File extensions: the suffixes returned by ``getSupportedExtensions()``
      (.txt, .md, .log, .py, .js, .sql, .yaml, .json, .csv, man sections, ...).
    """

    def detect(self, fileName: str, mimeType: str, headBytes: bytes) -> bool:
        """Return True when the file looks like plain text or source code.

        Args:
            fileName: Name of the file; only its (case-insensitive) suffix is
                inspected. May be empty or None.
            mimeType: Declared MIME type. May be empty or None.
            headBytes: Leading bytes of the file; not inspected by this
                extractor.

        Returns:
            True if the MIME type is ``text/*`` or one of the explicitly
            supported types, or if the file name ends with a supported
            extension; False otherwise.
        """
        if mimeType:
            # MIME types are case-insensitive and may carry parameters
            # ("text/plain; charset=utf-8"); compare only the bare type.
            bareType = mimeType.split(";", 1)[0].strip().lower()
            if bareType.startswith("text/") or bareType in _TEXT_MIME_TYPES:
                return True

        if fileName:
            return fileName.lower().endswith(_TEXT_EXTENSIONS)

        return False

    def getSupportedExtensions(self) -> list[str]:
        """Return list of supported file extensions."""
        # Fresh list on every call so callers may mutate the result freely.
        return list(_TEXT_EXTENSIONS)

    def getSupportedMimeTypes(self) -> list[str]:
        """Return list of supported MIME types."""
        # Fresh list on every call so callers may mutate the result freely.
        return list(_TEXT_MIME_TYPES)

    def extract(self, fileBytes: bytes, context: Dict[str, Any]) -> List[ContentPart]:
        """Decode the file as UTF-8 text and wrap it in a single ContentPart.

        Args:
            fileBytes: Raw file content.
            context: Extraction context; only the ``"mimeType"`` key is read
                (falls back to ``"text/plain"`` when missing or empty).

        Returns:
            A one-element list holding the "main" text part. Its metadata
            records the size of the input in bytes (not decoded characters).
        """
        mimeType = context.get("mimeType") or "text/plain"
        # errors="replace" keeps extraction best-effort: undecodable bytes
        # become U+FFFD instead of raising UnicodeDecodeError.
        data = fileBytes.decode("utf-8", errors="replace")
        return [ContentPart(
            id=makeId(),
            parentId=None,
            label="main",
            typeGroup="text",
            mimeType=mimeType,
            data=data,
            metadata={"size": len(fileBytes)}
        )]
|
|
|
|
|