from typing import Any, Dict, List
import codecs
import logging
import re
import xml.etree.ElementTree as ET

from modules.datamodels.datamodelExtraction import ContentPart

from ..subUtils import makeId
from ..subRegistry import Extractor

logger = logging.getLogger(__name__)

# Single source of truth for what this extractor claims to handle; shared by
# detect() and the getSupported*() accessors so they cannot drift apart.
_SUPPORTED_EXTENSIONS = (".xml", ".rss", ".atom")
_SUPPORTED_MIME_TYPES = ("application/xml",)

# Matches the encoding pseudo-attribute of a leading XML declaration, e.g.
# <?xml version="1.0" encoding="ISO-8859-1"?>. Applied to raw bytes, so it only
# helps for ASCII-compatible encodings (UTF-16/32 are handled via their BOM).
_XML_DECL_ENCODING = re.compile(
    rb"""^\s*<\?xml[^>]*?\bencoding\s*=\s*["']([A-Za-z][A-Za-z0-9._\-]*)["']"""
)

# How many leading bytes are inspected for the XML declaration.
_XML_DECL_SCAN_BYTES = 256


class XmlExtractor(Extractor):
    """
    Extractor for XML files.

    Supported formats:
    - MIME types: application/xml
    - File extensions: .xml, .rss, .atom
    - Special handling: the document text is returned as a single content
      part; ElementTree is used only as a best-effort well-formedness check
      whose failure is logged and never aborts extraction.
    """

    def detect(self, fileName: str, mimeType: str, headBytes: bytes) -> bool:
        """
        Return True when the file looks like XML.

        A file matches when its MIME type is one of the supported MIME types
        or its name (case-insensitive; a missing name is treated as empty)
        ends with one of the supported extensions. ``headBytes`` is accepted
        for interface compatibility and is not inspected.
        """
        if mimeType in _SUPPORTED_MIME_TYPES:
            return True
        return (fileName or "").lower().endswith(_SUPPORTED_EXTENSIONS)

    def getSupportedExtensions(self) -> list[str]:
        """Return list of supported file extensions."""
        return list(_SUPPORTED_EXTENSIONS)

    def getSupportedMimeTypes(self) -> list[str]:
        """Return list of supported MIME types."""
        return list(_SUPPORTED_MIME_TYPES)

    @staticmethod
    def _decodeXml(fileBytes: bytes) -> str:
        """
        Decode raw XML bytes to text.

        Valid UTF-8 input is decoded as UTF-8. Only when that fails are a
        UTF-32/UTF-16 byte order mark and the encoding named in the XML
        declaration tried, in that order. If nothing decodes cleanly the
        bytes are decoded as UTF-8 with undecodable sequences replaced, so
        this never raises for ``bytes`` input.
        """
        try:
            return fileBytes.decode("utf-8")
        except UnicodeDecodeError:
            # Not UTF-8: fall through to BOM / declaration based detection.
            pass

        candidates: List[str] = []
        # UTF-32 must be checked first: its little-endian BOM starts with the
        # same two bytes as the UTF-16 little-endian BOM.
        if fileBytes.startswith((codecs.BOM_UTF32_LE, codecs.BOM_UTF32_BE)):
            candidates.append("utf-32")
        elif fileBytes.startswith((codecs.BOM_UTF16_LE, codecs.BOM_UTF16_BE)):
            candidates.append("utf-16")

        declared = _XML_DECL_ENCODING.match(fileBytes[:_XML_DECL_SCAN_BYTES])
        if declared:
            candidates.append(declared.group(1).decode("ascii"))

        for encoding in candidates:
            try:
                return fileBytes.decode(encoding)
            except (LookupError, UnicodeError):
                # Unknown codec name, or the bytes do not match the claimed
                # encoding; try the next candidate.
                continue

        return fileBytes.decode("utf-8", errors="replace")

    def extract(self, fileBytes: bytes, context: Dict[str, Any]) -> List[ContentPart]:
        """
        Return the XML document as a single "structure" content part.

        Args:
            fileBytes: Raw bytes of the XML file.
            context: Extraction context; ``context["mimeType"]`` is used as
                the part's MIME type, defaulting to "application/xml" when
                missing or empty.

        Returns:
            A one-element list holding a ContentPart whose ``data`` is the
            decoded document text and whose metadata records the byte size
            of the input.
        """
        mimeType = context.get("mimeType") or "application/xml"
        text = self._decodeXml(fileBytes)

        # Best-effort well-formedness check. The parsed tree is not used; a
        # failure is reported but must never prevent the text being returned,
        # hence the deliberately broad except.
        # NOTE(review): xml.etree.ElementTree is not hardened against
        # maliciously constructed documents (e.g. entity expansion). If
        # fileBytes can come from untrusted sources, consider a hardened
        # parser or dropping this check.
        try:
            ET.fromstring(text)
        except Exception as err:
            logger.warning(
                "XML content is not well-formed; returning raw text anyway: %s", err
            )

        return [
            ContentPart(
                id=makeId(),
                parentId=None,
                label="main",
                typeGroup="structure",
                mimeType=mimeType,
                data=text,
                metadata={"size": len(fileBytes)},
            )
        ]