48 lines
1.4 KiB
Python
48 lines
1.4 KiB
Python
from typing import Any, Dict, List
|
|
from bs4 import BeautifulSoup
|
|
|
|
from modules.datamodels.datamodelExtraction import ContentPart
|
|
from ..subUtils import makeId
|
|
from ..subRegistry import Extractor
|
|
|
|
|
|
class HtmlExtractor(Extractor):
    """Extractor for HTML files.

    Supported formats:
    - MIME types: text/html
    - File extensions: .html, .htm
    - Special handling: Uses BeautifulSoup for parsing
    - Dependencies: beautifulsoup4

    The decoded markup is returned unchanged as a single ``ContentPart``;
    the BeautifulSoup pass is only a best-effort parse check and its result
    is not used to build the returned data.
    """

    def detect(self, fileName: str, mimeType: str, headBytes: bytes) -> bool:
        """Return whether the file should be handled as HTML.

        Args:
            fileName: Name of the file; may be empty or ``None``.
            mimeType: Declared MIME type; may be empty or ``None``, may be in
                any letter case and may carry parameters such as
                ``; charset=utf-8``.
            headBytes: Leading bytes of the file (not inspected here).

        Returns:
            ``True`` if the MIME type is ``text/html`` or the file name ends
            with ``.html`` / ``.htm`` (case-insensitive), else ``False``.
        """
        # MIME types are case-insensitive and may be followed by parameters,
        # so compare only the normalized "type/subtype" part.
        essence = (mimeType or "").split(";", 1)[0].strip().lower()
        if essence == "text/html":
            return True
        return (fileName or "").lower().endswith((".html", ".htm"))

    def getSupportedExtensions(self) -> list[str]:
        """Return list of supported file extensions."""
        return [".html", ".htm"]

    def getSupportedMimeTypes(self) -> list[str]:
        """Return list of supported MIME types."""
        return ["text/html"]

    def extract(self, fileBytes: bytes, context: Dict[str, Any]) -> List[ContentPart]:
        """Decode the HTML bytes and wrap the markup in one ``ContentPart``.

        Args:
            fileBytes: Raw file content. It is decoded as UTF-8; undecodable
                bytes are substituted (``errors="replace"``) instead of raising.
            context: Extraction context. Only the ``"mimeType"`` key is read;
                a missing or empty value falls back to ``"text/html"``.

        Returns:
            A one-element list holding a ``ContentPart`` labelled ``"main"``
            whose ``data`` is the decoded markup and whose metadata ``size``
            is the byte length of ``fileBytes``.
        """
        # Function-scope import so this block does not depend on the
        # module-level import section.
        import logging

        mimeType = context.get("mimeType") or "text/html"
        text = fileBytes.decode("utf-8", errors="replace")

        try:
            # Parse check only: the resulting soup is intentionally discarded.
            BeautifulSoup(text, "html.parser")
        except Exception:
            # Stay best-effort (the raw markup is still returned), but leave
            # a trace instead of swallowing the failure silently.
            logging.getLogger(__name__).warning(
                "HTML parse check failed; returning raw markup", exc_info=True
            )

        return [
            ContentPart(
                id=makeId(),
                parentId=None,
                label="main",
                typeGroup="structure",
                mimeType=mimeType,
                data=text,
                metadata={"size": len(fileBytes)},
            )
        ]
|
|
|
|
|