gateway/modules/serviceCenter/services/serviceExtraction/extractors/extractorText.py
2026-03-06 14:03:18 +01:00

105 lines
5.3 KiB
Python

# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
from typing import Any, Dict, List
from modules.datamodels.datamodelExtraction import ContentPart
from ..subUtils import makeId
from ..subRegistry import Extractor
class TextExtractor(Extractor):
    """
    Extractor for plain text files and code files.

    A file is accepted when its MIME type is any ``text/*`` type or one of the
    types returned by ``getSupportedMimeTypes()``, or when its (lower-cased)
    file name ends with one of the suffixes returned by
    ``getSupportedExtensions()``.

    Supported formats:
    - MIME types: text/plain, text/markdown, text/x-python, text/x-java-source, text/javascript, etc.
    - File extensions: plain text (.txt, .md, .log, ...), source code
      (.py, .java, .js, .ts, .sql, ...), web (.html, .css, ...), configuration
      (.ini, .yaml, .toml, .json, .xml, ...), scripts (.bat, .ps1, ...), data
      (.csv, .tsv, ...), man-page sections (.1 - .9, .n, .l, ...) and assorted
      dot-files (.gitignore, .env, lock files, ...).
    """

    # Single source of truth for suffix matching. Kept as a tuple because
    # str.endswith() accepts a tuple of candidates directly. Each suffix is
    # listed exactly once.
    _SUPPORTED_EXTENSIONS: tuple[str, ...] = (
        # Basic text files
        ".txt", ".md", ".log", ".rtf", ".tex", ".rst", ".adoc", ".org", ".pod",
        # Programming languages
        ".java", ".js", ".jsx", ".ts", ".tsx", ".py", ".rb", ".go", ".rs", ".cpp", ".c", ".h", ".hpp", ".cc", ".cxx",
        ".cs", ".php", ".swift", ".kt", ".scala", ".clj", ".hs", ".ml", ".fs", ".vb", ".dart", ".r", ".m", ".pl", ".sh",
        ".sql",
        # Web technologies
        ".html", ".htm", ".css", ".scss", ".sass", ".less", ".vue", ".svelte",
        # Configuration files
        ".config", ".ini", ".cfg", ".conf", ".properties", ".yaml", ".yml", ".toml", ".json", ".xml",
        # Scripts and automation
        ".bat", ".ps1", ".psm1", ".psd1", ".vbs", ".wsf", ".cmd", ".com",
        # Data files
        ".csv", ".tsv", ".tab", ".dat", ".data",
        # Documentation (man-page sections; ".m" and ".r" are already listed above)
        ".man", ".1", ".2", ".3", ".4", ".5", ".6", ".7", ".8", ".9", ".n", ".l", ".t", ".x", ".y", ".z",
        # Other text formats
        ".diff", ".patch", ".gitignore", ".dockerignore", ".editorconfig", ".gitattributes",
        ".env", ".env.local", ".env.development", ".env.production", ".env.test",
        ".lock", ".lockb", ".lockfile", ".pkg-lock", ".yarn-lock",
    )

    # MIME types advertised by getSupportedMimeTypes(). detect() additionally
    # accepts every "text/*" type, so only the "application/*" entries widen
    # what detect() recognises by MIME type.
    _SUPPORTED_MIME_TYPES: tuple[str, ...] = (
        "text/plain", "text/markdown", "text/x-python", "text/x-java-source",
        "text/javascript", "text/x-javascript", "text/typescript", "text/x-typescript",
        "text/x-c", "text/x-c++", "text/x-csharp", "text/x-php", "text/x-ruby",
        "text/x-go", "text/x-rust", "text/x-scala", "text/x-swift", "text/x-kotlin",
        "text/x-sql", "text/x-sh", "text/x-shellscript", "text/x-yaml", "text/x-toml",
        "text/x-ini", "text/x-config", "text/x-properties", "text/x-log",
        "text/html", "text/css", "text/x-scss", "text/x-sass", "text/x-less",
        "text/xml", "text/csv", "text/tab-separated-values", "text/rtf",
        "text/x-tex", "text/x-rst", "text/x-asciidoc", "text/x-org",
        "application/x-yaml", "application/x-toml", "application/x-ini",
        "application/x-config", "application/x-properties", "application/x-log",
    )

    # Byte-order marks mapped to a codec that consumes the mark while decoding.
    _BOM_ENCODINGS: tuple[tuple[bytes, str], ...] = (
        (b"\xef\xbb\xbf", "utf-8-sig"),
        (b"\xff\xfe", "utf-16"),
        (b"\xfe\xff", "utf-16"),
    )

    def detect(self, fileName: str, mimeType: str, headBytes: bytes) -> bool:
        """
        Decide whether this extractor handles the given file.

        Args:
            fileName: Name of the file; matched case-insensitively by suffix.
            mimeType: Declared MIME type; may be empty. Parameters such as
                "; charset=utf-8" and letter case are ignored.
            headBytes: Leading bytes of the file; not inspected here.

        Returns:
            True if the MIME type or the file name suffix is supported.
        """
        # Check MIME types
        if mimeType:
            # Compare only the "type/subtype" part, case-insensitively.
            essence = mimeType.split(";", 1)[0].strip().lower()
            if essence.startswith("text/") or essence in self._SUPPORTED_MIME_TYPES:
                return True
        # Check file extensions
        if fileName:
            return fileName.lower().endswith(self._SUPPORTED_EXTENSIONS)
        return False

    def getSupportedExtensions(self) -> list[str]:
        """Return list of supported file extensions."""
        # Fresh list per call so callers cannot mutate the shared constant.
        return list(self._SUPPORTED_EXTENSIONS)

    def getSupportedMimeTypes(self) -> list[str]:
        """Return list of supported MIME types."""
        return list(self._SUPPORTED_MIME_TYPES)

    @classmethod
    def _decodeText(cls, fileBytes: bytes) -> str:
        """Decode raw bytes to text, honouring a leading UTF-8/UTF-16 BOM."""
        for bom, encoding in cls._BOM_ENCODINGS:
            if fileBytes.startswith(bom):
                return fileBytes.decode(encoding, errors="replace")
        # No BOM: assume UTF-8; undecodable bytes become U+FFFD instead of raising.
        return fileBytes.decode("utf-8", errors="replace")

    def extract(self, fileBytes: bytes, context: Dict[str, Any]) -> List[ContentPart]:
        """
        Decode the file into a single text content part.

        Args:
            fileBytes: Raw file content.
            context: Extraction context; only the "mimeType" key is read here,
                falling back to "text/plain" when it is missing or empty.

        Returns:
            A one-element list holding a ContentPart labelled "main" with the
            decoded text as data and the original byte length under
            metadata["size"].
        """
        mimeType = context.get("mimeType") or "text/plain"
        data = self._decodeText(fileBytes)
        return [ContentPart(
            id=makeId(),
            parentId=None,
            label="main",
            typeGroup="text",
            mimeType=mimeType,
            data=data,
            # Size is reported in bytes of the original input, not decoded characters.
            metadata={"size": len(fileBytes)}
        )]