gateway/modules/services/serviceExtraction/formats/text_extractor.py

26 lines
832 B
Python

from typing import Any, Dict, List
from modules.datamodels.datamodelExtraction import ContentPart
from ..utils import makeId
from ..subRegistry import Extractor
class TextExtractor(Extractor):
    """Extractor for plain-text and Markdown payloads.

    Emits a single ``ContentPart`` holding the decoded text of the file.
    """

    def detect(self, fileName: str, mimeType: str, headBytes: bytes) -> bool:
        """Report whether this extractor handles the given file.

        Only ``mimeType`` is consulted; ``fileName`` and ``headBytes`` are
        accepted but not inspected.

        Returns:
            True when ``mimeType`` is exactly ``"text/plain"`` or
            ``"text/markdown"``, False otherwise.
        """
        return mimeType in ("text/plain", "text/markdown")

    def extract(self, fileBytes: bytes, context: Dict[str, Any]) -> List[ContentPart]:
        """Decode ``fileBytes`` as UTF-8 text and wrap it in one ContentPart.

        Args:
            fileBytes: Raw bytes of the file.
            context: Extraction context. Only the ``"mimeType"`` key is read;
                a missing or falsy value falls back to ``"text/plain"``.

        Returns:
            A one-element list with a ``ContentPart`` labelled ``"main"`` in
            type group ``"text"``. Its ``metadata["size"]`` is the length of
            the raw input in bytes (including a BOM, if one was present).
        """
        mimeType = context.get("mimeType") or "text/plain"
        # "utf-8-sig" decodes exactly like "utf-8" but drops a leading
        # byte-order mark, so the text does not start with a stray U+FEFF.
        # Undecodable bytes still become U+FFFD instead of raising.
        data = fileBytes.decode("utf-8-sig", errors="replace")
        return [
            ContentPart(
                id=makeId(),
                parentId=None,
                label="main",
                typeGroup="text",
                mimeType=mimeType,
                data=data,
                metadata={"size": len(fileBytes)},
            )
        ]