gateway/modules/services/serviceExtraction/formats/xml_extractor.py
2025-10-03 01:41:50 +02:00

30 lines
974 B
Python

import re
import xml.etree.ElementTree as ET
from typing import Any, Dict, List

from modules.datamodels.datamodelExtraction import ContentPart

from ..subRegistry import Extractor
from ..subUtils import makeId
class XmlExtractor(Extractor):
    """Extractor for XML documents, including RSS and Atom feeds.

    The document is returned as a single ``ContentPart`` holding the decoded
    text; the XML tree is not split into separate parts.
    """

    # MIME types treated as XML. ``text/xml`` is the legacy alias of
    # ``application/xml``; the two feed types mirror the ``.rss`` / ``.atom``
    # suffixes accepted below.
    _XML_MIME_TYPES = frozenset({
        "application/xml",
        "text/xml",
        "application/rss+xml",
        "application/atom+xml",
    })
    _XML_SUFFIXES = (".xml", ".rss", ".atom")

    # Byte-order marks and the codec that consumes them. The UTF-32 entries
    # come first because the UTF-32 LE BOM starts with the UTF-16 LE BOM.
    _BOM_CODECS = (
        (b"\xef\xbb\xbf", "utf-8-sig"),
        (b"\xff\xfe\x00\x00", "utf-32"),
        (b"\x00\x00\xfe\xff", "utf-32"),
        (b"\xff\xfe", "utf-16"),
        (b"\xfe\xff", "utf-16"),
    )

    # Captures the encoding name from a leading ``<?xml ... encoding="..."?>``.
    _DECLARED_ENCODING = re.compile(
        rb'^<\?xml[^>]*?\bencoding\s*=\s*["\']([A-Za-z][A-Za-z0-9._-]*)["\']'
    )

    def detect(self, fileName: str, mimeType: str, headBytes: bytes) -> bool:
        """Return True when the file looks like XML.

        Args:
            fileName: Original file name; may be empty or None.
            mimeType: Reported MIME type; may be empty or None. Parameters
                (``; charset=...``) and letter case are ignored.
            headBytes: Leading bytes of the file (not inspected here).

        Returns:
            True if the MIME type is a known XML type or the file name ends
            with ``.xml``, ``.rss`` or ``.atom`` (case-insensitive).
        """
        normalizedMime = (mimeType or "").split(";", 1)[0].strip().lower()
        if normalizedMime in self._XML_MIME_TYPES:
            return True
        return (fileName or "").lower().endswith(self._XML_SUFFIXES)

    def extract(self, fileBytes: bytes, context: Dict[str, Any]) -> List[ContentPart]:
        """Decode the document and wrap it in a single ``ContentPart``.

        Args:
            fileBytes: Raw file content.
            context: Extraction context; ``context["mimeType"]`` is used when
                present and non-empty, otherwise ``"application/xml"``.

        Returns:
            A one-element list. The part's ``data`` is the decoded text and
            its ``metadata`` holds the byte ``size`` plus ``wellFormed``,
            which tells whether the text parsed as XML. Malformed input is
            still returned; it never makes extraction fail.
        """
        mimeType = context.get("mimeType") or "application/xml"
        text = self._decodeXml(fileBytes)
        return [ContentPart(
            id=makeId(),
            parentId=None,
            label="main",
            typeGroup="structure",
            mimeType=mimeType,
            data=text,
            metadata={
                "size": len(fileBytes),
                "wellFormed": self._isWellFormed(text),
            },
        )]

    @classmethod
    def _decodeXml(cls, fileBytes: bytes) -> str:
        """Decode XML bytes, honouring a BOM or a declared encoding."""
        # 1) A byte-order mark is authoritative; the codec strips it.
        for bom, codec in cls._BOM_CODECS:
            if fileBytes.startswith(bom):
                return fileBytes.decode(codec, errors="replace")

        # 2) Valid UTF-8 (the XML default) is taken as-is.
        try:
            return fileBytes.decode("utf-8")
        except UnicodeDecodeError:
            pass  # Not UTF-8: consult the XML declaration next.

        # 3) Try the encoding named in the XML declaration.
        declared = cls._DECLARED_ENCODING.match(fileBytes)
        if declared:
            codec = declared.group(1).decode("ascii")
            try:
                # The declaration was readable as ASCII, so only an
                # ASCII-compatible codec can be right; this rejects a
                # mislabelled "utf-16" and similar.
                if "<?xml".encode(codec) == b"<?xml":
                    return fileBytes.decode(codec)
            except (LookupError, UnicodeError):
                pass  # Unknown codec or undecodable bytes: use the fallback.

        # 4) Last resort: lossy UTF-8, so extraction always yields text.
        return fileBytes.decode("utf-8", errors="replace")

    @staticmethod
    def _isWellFormed(text: str) -> bool:
        """Return True if *text* parses as a well-formed XML document."""
        # NOTE(security): xml.etree.ElementTree is not hardened against
        # maliciously constructed documents (e.g. entity-expansion attacks).
        # If inputs are untrusted, consider a hardened parser such as
        # defusedxml or make sure the linked Expat has amplification limits.
        try:
            ET.fromstring(text)
        except (ET.ParseError, ValueError):
            return False
        return True