# gateway/modules/services/serviceExtraction/extractors/extractorHtml.py

from typing import Any, Dict, List
from bs4 import BeautifulSoup

from modules.datamodels.datamodelExtraction import ContentPart
from ..subUtils import makeId
from ..subRegistry import Extractor


class HtmlExtractor(Extractor):
    """
    Extractor for HTML files.

    Supported formats:
    - MIME types: text/html (compared case-insensitively; parameters such as
      "; charset=utf-8" are ignored)
    - File extensions: .html, .htm
    - Special handling: The document is decoded as UTF-8 and returned
      unmodified as a single "structure" part; the markup is not parsed here
    - Dependencies: beautifulsoup4 (imported at module level)
    """

    def detect(self, fileName: str, mimeType: str, headBytes: bytes) -> bool:
        """
        Decide whether this extractor handles the given file.

        Args:
            fileName: Name of the file; may be empty or None.
            mimeType: Reported MIME type; may be empty or None.
            headBytes: Leading bytes of the file (not inspected here).

        Returns:
            True if the MIME type is text/html or the file name ends with
            .html / .htm (case-insensitive), otherwise False.
        """
        # A reported content type may carry parameters
        # ("text/html; charset=utf-8") or differ in case, so compare only the
        # normalized "type/subtype" portion.
        essence = (mimeType or "").split(";", 1)[0].strip().lower()
        if essence == "text/html":
            return True
        return (fileName or "").lower().endswith((".html", ".htm"))

    def getSupportedExtensions(self) -> list[str]:
        """Return list of supported file extensions."""
        return [".html", ".htm"]

    def getSupportedMimeTypes(self) -> list[str]:
        """Return list of supported MIME types."""
        return ["text/html"]

    def extract(self, fileBytes: bytes, context: Dict[str, Any]) -> List[ContentPart]:
        """
        Wrap the HTML document in a single content part.

        Args:
            fileBytes: Raw bytes of the HTML file.
            context: Extraction context; the "mimeType" entry, when present
                and non-empty, is copied onto the returned part.

        Returns:
            A one-element list holding a "structure" part whose data is the
            decoded markup and whose metadata records the byte size of the
            input.
        """
        mimeType = context.get("mimeType") or "text/html"
        # Undecodable byte sequences are replaced with U+FFFD instead of
        # raising, so extraction never fails on a badly encoded file.
        text = fileBytes.decode("utf-8", errors="replace")
        # The markup is intentionally not parsed: a parse tree was never used
        # for the returned part, so building one only cost time and hid any
        # parser error behind a silent except.
        return [ContentPart(
            id=makeId(),
            parentId=None,
            label="main",
            typeGroup="structure",
            mimeType=mimeType,
            data=text,
            metadata={"size": len(fileBytes)}
        )]