gateway/modules/aichat/serviceExtraction/extractors/extractorHtml.py
2026-01-22 21:11:25 +01:00

50 lines
1.5 KiB
Python

# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
import logging
from typing import Any, Dict, List

from bs4 import BeautifulSoup

from modules.datamodels.datamodelExtraction import ContentPart
from ..subUtils import makeId
from ..subRegistry import Extractor
class HtmlExtractor(Extractor):
    """
    Extractor for HTML files.

    Supported formats:
    - MIME types: text/html
    - File extensions: .html, .htm
    - Special handling: Uses BeautifulSoup for parsing
    - Dependencies: beautifulsoup4
    """

    def detect(self, fileName: str, mimeType: str, headBytes: bytes) -> bool:
        """Return True when the MIME type or the file name indicates HTML.

        The MIME type is compared case-insensitively and without any
        parameters, so ``"text/html; charset=utf-8"`` is recognised as well
        as the bare ``"text/html"``. ``headBytes`` is not inspected.
        """
        # Drop media-type parameters (";charset=...") before comparing.
        baseType = (mimeType or "").split(";", 1)[0].strip().lower()
        if baseType == "text/html":
            return True
        return (fileName or "").lower().endswith((".html", ".htm"))

    def getSupportedExtensions(self) -> list[str]:
        """Return list of supported file extensions."""
        return [".html", ".htm"]

    def getSupportedMimeTypes(self) -> list[str]:
        """Return list of supported MIME types."""
        return ["text/html"]

    def extract(self, fileBytes: bytes, context: Dict[str, Any]) -> List[ContentPart]:
        """Decode the file and return it as a single ``ContentPart``.

        The bytes are decoded as UTF-8, with undecodable sequences replaced.
        The decoded markup is run through BeautifulSoup as a best-effort
        parse check only: the parsed tree is discarded and the raw decoded
        markup is what ends up in the returned part.

        Args:
            fileBytes: Raw content of the HTML file.
            context: Extraction context; ``context["mimeType"]`` is used as
                the part's MIME type, falling back to ``"text/html"`` when
                missing or empty.

        Returns:
            A one-element list holding the part labelled ``"main"``, whose
            metadata records the size of ``fileBytes`` in bytes.
        """
        mimeType = context.get("mimeType") or "text/html"
        text = fileBytes.decode("utf-8", errors="replace")
        try:
            BeautifulSoup(text, "html.parser")
        except Exception:
            # Best-effort: a parse failure must not abort extraction, but it
            # should be visible in the logs instead of vanishing silently.
            logging.getLogger(__name__).warning(
                "HTML parse check failed; returning raw markup",
                exc_info=True,
            )
        return [ContentPart(
            id=makeId(),
            parentId=None,
            label="main",
            typeGroup="structure",
            mimeType=mimeType,
            data=text,
            metadata={"size": len(fileBytes)},
        )]