gateway/modules/serviceCenter/services/serviceExtraction/extractors/extractorEmail.py
2026-03-15 23:38:21 +01:00

230 lines
8.4 KiB
Python

# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
"""Email extractor for EML and MSG files.
Parses email headers, body (text/html), and attachments.
Attachments are delegated to the ExtractorRegistry for type-specific processing.
Optional dependency: extract-msg (for .msg files).
"""
from typing import Any, Dict, List
import email
import email.policy
import email.utils
import io
import logging
import mimetypes
from modules.datamodels.datamodelExtraction import ContentPart
from ..subUtils import makeId
from ..subRegistry import Extractor
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
# MIME types accepted by EmailExtractor.detect() and reported by
# EmailExtractor.getSupportedMimeTypes().
_EMAIL_MIME_TYPES = [
"message/rfc822",
"application/vnd.ms-outlook",
]
# File-name suffixes (lowercase, including the dot) that are compared against
# the lowercased file name in EmailExtractor.detect().
_EMAIL_EXTENSIONS = [".eml", ".msg"]
class EmailExtractor(Extractor):
    """Extractor for email files (EML, MSG).

    Produces:
    - 1 text ContentPart with header metadata (From, To, Subject, Date)
    - 1 text ContentPart per body part (plain text / HTML)
    - Delegated ContentParts for each attachment via ExtractorRegistry
    """

    # MIME type announced for Outlook MSG files; used to route extract().
    _MSG_MIME_TYPE = "application/vnd.ms-outlook"

    # Body content type -> (ContentPart label, value stored under metadata["emailPart"]).
    _BODY_KINDS = {
        "text/plain": ("body_text", "body"),
        "text/html": ("body_html", "body_html"),
    }

    def detect(self, fileName: str, mimeType: str, headBytes: bytes) -> bool:
        """Return True when the MIME type or the file extension marks an email.

        ``headBytes`` is not inspected by this extractor.
        """
        if mimeType in _EMAIL_MIME_TYPES:
            return True
        lower = (fileName or "").lower()
        return any(lower.endswith(ext) for ext in _EMAIL_EXTENSIONS)

    def getSupportedExtensions(self) -> list[str]:
        """Return a fresh copy of the supported file extensions."""
        return list(_EMAIL_EXTENSIONS)

    def getSupportedMimeTypes(self) -> list[str]:
        """Return a fresh copy of the supported MIME types."""
        return list(_EMAIL_MIME_TYPES)

    def extract(self, fileBytes: bytes, context: Dict[str, Any]) -> List[ContentPart]:
        """Extract headers, bodies and attachments from an email file.

        Args:
            fileBytes: Raw content of the email file.
            context: Extraction context. ``fileName`` and ``mimeType`` are
                read when present; both are optional.

        Returns:
            The list of ContentParts produced by the MSG or EML parser.
        """
        # ``or`` (instead of a .get default) also covers an explicit None value.
        fileName = context.get("fileName") or "email"
        lower = fileName.lower()
        if lower.endswith(".msg"):
            return self._extractMsg(fileBytes, fileName)
        # detect() accepts the Outlook MIME type on its own, so honour it here
        # as well -- unless the file name explicitly says EML.
        if not lower.endswith(".eml") and context.get("mimeType") == self._MSG_MIME_TYPE:
            return self._extractMsg(fileBytes, fileName)
        return self._extractEml(fileBytes, fileName)

    @staticmethod
    def _readTextBody(part) -> str:
        """Return the decoded text of a text/* MIME part.

        Falls back to a lossy UTF-8 decode when the declared charset is
        unknown or the payload cannot be decoded, so one broken part does not
        abort the whole extraction.
        """
        try:
            return str(part.get_content())
        except (LookupError, UnicodeError) as e:
            logger.warning(
                f"EmailExtractor: cannot decode {part.get_content_type()} body, "
                f"falling back to UTF-8: {e}"
            )
            raw = part.get_payload(decode=True) or b""
            return raw.decode("utf-8", errors="replace")

    def _extractEml(self, fileBytes: bytes, fileName: str) -> List[ContentPart]:
        """Parse standard EML (RFC 822) using stdlib email.

        Returns a header part (the root), one part per non-empty text/plain or
        text/html body, and the delegated parts of every attachment. On a
        parse failure a single error part is returned instead.
        """
        rootId = makeId()
        try:
            msg = email.message_from_bytes(fileBytes, policy=email.policy.default)
        except Exception as e:
            logger.error(f"EmailExtractor: failed to parse EML: {e}")
            return [ContentPart(
                id=rootId, parentId=None, label=fileName,
                typeGroup="text", mimeType="text/plain",
                data=f"Failed to parse email: {e}", metadata={"error": True},
            )]

        parts: List[ContentPart] = [ContentPart(
            id=rootId, parentId=None, label="headers",
            typeGroup="text", mimeType="text/plain",
            data=_buildHeaderText(msg), metadata={"emailPart": "headers"},
        )]

        for part in msg.walk():
            # Multipart containers carry no payload of their own; walk()
            # yields their children separately.
            if part.is_multipart():
                continue

            # get_content_disposition() returns the lowercased disposition
            # without parameters, so neither letter case nor a file name that
            # merely contains "attachment" can confuse the check.
            if part.get_content_disposition() == "attachment":
                attachName = part.get_filename() or "attachment"
                attachData = part.get_payload(decode=True)
                if attachData:
                    parts.extend(_delegateAttachment(attachData, attachName, rootId))
                continue

            contentType = part.get_content_type()
            bodyKind = self._BODY_KINDS.get(contentType)
            if bodyKind is None:
                continue
            label, emailPart = bodyKind
            body = self._readTextBody(part)
            if body:
                parts.append(ContentPart(
                    id=makeId(), parentId=rootId, label=label,
                    typeGroup="text", mimeType=contentType,
                    data=body, metadata={"emailPart": emailPart},
                ))
        return parts

    def _extractMsg(self, fileBytes: bytes, fileName: str) -> List[ContentPart]:
        """Parse Outlook MSG files using extract-msg (optional).

        Returns a single warning part when extract-msg is not installed and a
        single error part when the file cannot be opened. Otherwise returns a
        header part (the root), the plain/HTML bodies and the delegated parts
        of every attachment.
        """
        rootId = makeId()
        try:
            import extract_msg
        except ImportError:
            logger.warning("extract-msg not installed -- MSG files will be treated as binary")
            return [ContentPart(
                id=rootId, parentId=None, label=fileName,
                typeGroup="text", mimeType="text/plain",
                data="MSG extraction requires the extract-msg package.",
                metadata={"warning": True},
            )]

        try:
            msgFile = extract_msg.Message(io.BytesIO(fileBytes))
        except Exception as e:
            logger.error(f"EmailExtractor: failed to parse MSG: {e}")
            return [ContentPart(
                id=rootId, parentId=None, label=fileName,
                typeGroup="text", mimeType="text/plain",
                data=f"Failed to parse MSG: {e}", metadata={"error": True},
            )]

        # try/finally guarantees the message is closed even when reading a
        # body or an attachment raises.
        try:
            parts: List[ContentPart] = []

            headerLines = []
            if msgFile.sender:
                headerLines.append(f"From: {msgFile.sender}")
            if msgFile.to:
                headerLines.append(f"To: {msgFile.to}")
            if getattr(msgFile, "cc", None):
                headerLines.append(f"Cc: {msgFile.cc}")
            if msgFile.subject:
                headerLines.append(f"Subject: {msgFile.subject}")
            if msgFile.date:
                headerLines.append(f"Date: {msgFile.date}")
            parts.append(ContentPart(
                id=rootId, parentId=None, label="headers",
                typeGroup="text", mimeType="text/plain",
                data="\n".join(headerLines), metadata={"emailPart": "headers"},
            ))

            body = msgFile.body
            if body:
                parts.append(ContentPart(
                    id=makeId(), parentId=rootId, label="body_text",
                    typeGroup="text", mimeType="text/plain",
                    data=body, metadata={"emailPart": "body"},
                ))

            htmlBody = getattr(msgFile, "htmlBody", None)
            if htmlBody:
                if isinstance(htmlBody, bytes):
                    htmlBody = htmlBody.decode("utf-8", errors="replace")
                parts.append(ContentPart(
                    id=makeId(), parentId=rootId, label="body_html",
                    typeGroup="text", mimeType="text/html",
                    data=htmlBody, metadata={"emailPart": "body_html"},
                ))

            for attachment in (msgFile.attachments or []):
                attachName = (
                    getattr(attachment, "longFilename", None)
                    or getattr(attachment, "shortFilename", None)
                    or "attachment"
                )
                attachData = getattr(attachment, "data", None)
                if not attachData:
                    continue
                # NOTE(review): extract-msg appears to expose embedded messages
                # as parsed objects rather than bytes -- confirm. Such data
                # cannot be base64-encoded or handed to byte-based extractors.
                if not isinstance(attachData, (bytes, bytearray)):
                    logger.warning(
                        f"EmailExtractor: skipping non-binary MSG attachment {attachName} "
                        f"({type(attachData).__name__})"
                    )
                    continue
                parts.extend(_delegateAttachment(bytes(attachData), attachName, rootId))

            return parts
        finally:
            # Closing stays best-effort, but a failure is no longer silent.
            try:
                msgFile.close()
            except Exception as e:
                logger.debug(f"EmailExtractor: error while closing MSG file: {e}")
def _buildHeaderText(msg) -> str:
    """Build a readable text summary of key email headers.

    Args:
        msg: Object exposing ``get(headerName)``, such as a parsed
            ``email.message.EmailMessage``.

    Returns:
        One ``Name: value`` line per non-empty header, in the fixed order
        From, To, Cc, Subject, Date, Message-ID, joined with newlines. An
        empty string when none of these headers has a value.
    """
    lines = []
    for header in ("From", "To", "Cc", "Subject", "Date", "Message-ID"):
        # A header lookup may parse the raw value and can raise on malformed
        # input; one unreadable header must not discard the remaining ones.
        try:
            value = msg.get(header)
        except Exception as e:
            # Same logger object as the module-level ``logger``.
            logging.getLogger(__name__).warning(
                f"EmailExtractor: cannot read header {header}: {e}"
            )
            continue
        if value:
            lines.append(f"{header}: {value}")
    return "\n".join(lines)
def _delegateAttachment(attachData: bytes, attachName: str, parentId: str) -> List[ContentPart]:
    """Hand an email attachment to the extractor registered for its type.

    The MIME type is guessed from the attachment name and defaults to
    ``application/octet-stream``. When the registry resolves an extractor
    other than EmailExtractor, its parts are returned, re-parented to
    ``parentId`` and tagged with ``metadata["emailAttachment"]``. In every
    other case (no extractor, an EmailExtractor, or an extractor that raised)
    a single binary part holding the base64-encoded payload is returned.
    """
    # Imported inside the function, as in the original code.
    from ..subRegistry import ExtractorRegistry

    detectedMime = mimetypes.guess_type(attachName)[0] or "application/octet-stream"
    extractor = ExtractorRegistry().resolve(detectedMime, attachName)

    canDelegate = bool(extractor) and not isinstance(extractor, EmailExtractor)
    if canDelegate:
        childContext = {"fileName": attachName, "mimeType": detectedMime}
        try:
            childParts = extractor.extract(attachData, childContext)
            # NOTE(review): every returned part is re-parented, including parts
            # that already pointed at a parent inside childParts -- confirm
            # that this flattening is intended.
            for child in childParts:
                child.parentId = parentId
                if not child.metadata:
                    child.metadata = {}
                child.metadata["emailAttachment"] = attachName
            return childParts
        except Exception as e:
            logger.warning(f"Extractor failed for email attachment {attachName}: {e}")

    # Fallback: ship the raw payload as base64 text.
    import base64
    if attachData:
        encodedData = base64.b64encode(attachData).decode("utf-8")
    else:
        encodedData = ""
    fallbackMeta = {"size": len(attachData), "emailAttachment": attachName}
    return [
        ContentPart(
            id=makeId(),
            parentId=parentId,
            label=attachName,
            typeGroup="binary",
            mimeType=detectedMime,
            data=encodedData,
            metadata=fallbackMeta,
        )
    ]