import logging
from typing import Any, Dict, List

from bs4 import BeautifulSoup

from modules.datamodels.datamodelExtraction import ContentPart
from ..subUtils import makeId
from ..subRegistry import Extractor

logger = logging.getLogger(__name__)


class HtmlExtractor(Extractor):
    """Extractor that returns an HTML document as a single text ContentPart."""

    def detect(self, fileName: str, mimeType: str, headBytes: bytes) -> bool:
        """Return True when the input looks like HTML.

        A file matches when its MIME type is ``text/html`` (compared
        case-insensitively, ignoring parameters such as ``; charset=utf-8``)
        or when its name ends in ``.html`` / ``.htm``.

        Args:
            fileName: Name of the file; may be empty or None.
            mimeType: Declared MIME type; may be empty or None.
            headBytes: Leading bytes of the file. Not inspected here.
        """
        # Drop any parameters and normalise case so that values such as
        # "Text/HTML; charset=utf-8" are recognised as well.
        normalizedMime = (mimeType or "").split(";", 1)[0].strip().lower()
        if normalizedMime == "text/html":
            return True
        return (fileName or "").lower().endswith((".html", ".htm"))

    def extract(self, fileBytes: bytes, context: Dict[str, Any]) -> List[ContentPart]:
        """Decode the HTML bytes and wrap the markup in one ContentPart.

        The bytes are decoded as UTF-8; undecodable sequences are replaced
        rather than raising. The returned part carries the decoded markup
        unchanged, with ``metadata["size"]`` set to the byte length of the
        input.

        Args:
            fileBytes: Raw file content.
            context: Extraction context; ``context["mimeType"]`` is used as
                the part's MIME type, defaulting to ``"text/html"`` when
                missing or empty.

        Returns:
            A one-element list containing the "main" ContentPart.
        """
        mimeType = context.get("mimeType") or "text/html"
        text = fileBytes.decode("utf-8", errors="replace")

        # Best-effort parse probe: the parse tree is discarded and a failure
        # must never block extraction, but it is logged instead of being
        # silently swallowed so malformed inputs remain diagnosable.
        try:
            BeautifulSoup(text, "html.parser")
        except Exception as exc:
            logger.warning("HTML parse probe failed; returning raw markup: %s", exc)

        return [
            ContentPart(
                id=makeId(),
                parentId=None,
                label="main",
                typeGroup="structure",
                mimeType=mimeType,
                data=text,
                metadata={"size": len(fileBytes)},
            )
        ]