230 lines
8.4 KiB
Python
230 lines
8.4 KiB
Python
# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
"""Email extractor for EML and MSG files.

Parses email headers, body (text/html), and attachments.
Attachments are delegated to the ExtractorRegistry for type-specific processing.

Optional dependency: extract-msg (for .msg files).
"""
|
|
|
|
from typing import Any, Dict, List
|
|
import email
|
|
import email.policy
|
|
import email.utils
|
|
import io
|
|
import logging
|
|
import mimetypes
|
|
|
|
from modules.datamodels.datamodelExtraction import ContentPart
|
|
from ..subUtils import makeId
|
|
from ..subRegistry import Extractor
|
|
|
|
# Module-level logger for this extractor.
logger = logging.getLogger(__name__)

# MIME types recognised as email containers (RFC 822 message, Outlook MSG).
_EMAIL_MIME_TYPES = ["message/rfc822", "application/vnd.ms-outlook"]

# File-name suffixes (with leading dot) recognised as email files.
_EMAIL_EXTENSIONS = [".eml", ".msg"]
|
|
|
|
|
|
class EmailExtractor(Extractor):
    """Extractor for email files (EML, MSG).

    Produces:
    - 1 text ContentPart with header metadata (From, To, Subject, Date)
    - 1 text ContentPart per body part (plain text / HTML)
    - Delegated ContentParts for each attachment via ExtractorRegistry
    """

    # Signature that opens an OLE2 compound file, the container used by Outlook .msg.
    _OLE_MAGIC = b"\xd0\xcf\x11\xe0\xa1\xb1\x1a\xe1"

    # MIME type under which Outlook MSG files are reported.
    _MSG_MIME_TYPE = "application/vnd.ms-outlook"

    # Body content type -> (ContentPart label, value of the "emailPart" metadata key).
    _BODY_KINDS = {
        "text/plain": ("body_text", "body"),
        "text/html": ("body_html", "body_html"),
    }

    def detect(self, fileName: str, mimeType: str, headBytes: bytes) -> bool:
        """Return True if the MIME type or the file-name suffix marks an email.

        ``headBytes`` is accepted for interface compatibility but not inspected.
        """
        if mimeType in _EMAIL_MIME_TYPES:
            return True
        lower = (fileName or "").lower()
        # str.endswith accepts a tuple of suffixes: one call instead of a loop.
        return lower.endswith(tuple(_EMAIL_EXTENSIONS))

    def getSupportedExtensions(self) -> list[str]:
        """Return a fresh copy of the supported file-name suffixes."""
        return list(_EMAIL_EXTENSIONS)

    def getSupportedMimeTypes(self) -> list[str]:
        """Return a fresh copy of the supported MIME types."""
        return list(_EMAIL_MIME_TYPES)

    def extract(self, fileBytes: bytes, context: Dict[str, Any]) -> List[ContentPart]:
        """Extract headers, bodies and attachments from an email file.

        Args:
            fileBytes: Raw content of the .eml or .msg file.
            context: Reads ``"fileName"`` (falls back to ``"email"`` when
                missing or empty) and, if present, ``"mimeType"``.

        Returns:
            A list of ContentParts. On a parse failure the list holds a
            single text part describing the error.
        """
        fileName = context.get("fileName") or "email"
        mimeType = str(context.get("mimeType") or "").lower()

        # detect() accepts Outlook messages by MIME type as well as by suffix,
        # so routing must not depend on the suffix alone. The OLE2 signature
        # additionally catches MSG payloads that carry neither hint.
        looksLikeMsg = (
            fileName.lower().endswith(".msg")
            or mimeType == self._MSG_MIME_TYPE
            or bytes((fileBytes or b"")[:8]) == self._OLE_MAGIC
        )
        if looksLikeMsg:
            return self._extractMsg(fileBytes, fileName)
        return self._extractEml(fileBytes, fileName)

    @staticmethod
    def _rootTextPart(rootId: str, label: str, data: str, metadata: Dict[str, Any]) -> ContentPart:
        """Build the parent-less plain-text part (header summary or error/warning notice)."""
        return ContentPart(
            id=rootId, parentId=None, label=label,
            typeGroup="text", mimeType="text/plain",
            data=data, metadata=metadata,
        )

    @staticmethod
    def _bodyPart(rootId: str, contentType: str, text: str) -> ContentPart:
        """Build a body part for ``contentType``, which must be a key of ``_BODY_KINDS``."""
        label, emailPart = EmailExtractor._BODY_KINDS[contentType]
        return ContentPart(
            id=makeId(), parentId=rootId, label=label,
            typeGroup="text", mimeType=contentType,
            data=text, metadata={"emailPart": emailPart},
        )

    @staticmethod
    def _readBody(part) -> str:
        """Return the decoded text of a MIME part, degrading to a lossy decode on failure."""
        try:
            return str(part.get_content())
        except (LookupError, UnicodeError, ValueError) as e:
            # e.g. an unknown charset label; one bad part must not abort the
            # whole extraction, so decode the raw payload leniently instead.
            logger.warning(
                "EmailExtractor: could not decode %s part, using lossy fallback: %s",
                part.get_content_type(), e,
            )
        raw = part.get_payload(decode=True)
        return raw.decode("utf-8", errors="replace") if raw else ""

    def _extractEml(self, fileBytes: bytes, fileName: str) -> List[ContentPart]:
        """Parse standard EML (RFC 822) using stdlib email."""
        rootId = makeId()

        try:
            msg = email.message_from_bytes(fileBytes, policy=email.policy.default)
        except Exception as e:
            logger.error("EmailExtractor: failed to parse EML: %s", e)
            return [self._rootTextPart(
                rootId, fileName, f"Failed to parse email: {e}", {"error": True},
            )]

        parts: List[ContentPart] = [self._rootTextPart(
            rootId, "headers", _buildHeaderText(msg), {"emailPart": "headers"},
        )]

        for part in msg.walk():
            # Multipart containers carry no content of their own.
            if part.is_multipart():
                continue

            # get_content_disposition() yields the lower-cased disposition
            # type only, so "ATTACHMENT" matches and a file name that merely
            # contains the word "attachment" does not.
            if part.get_content_disposition() == "attachment":
                attachData = part.get_payload(decode=True)
                if attachData:
                    attachName = part.get_filename() or "attachment"
                    parts.extend(_delegateAttachment(attachData, attachName, rootId))
                continue

            contentType = part.get_content_type()
            if contentType not in self._BODY_KINDS:
                continue

            body = self._readBody(part)
            if body:
                parts.append(self._bodyPart(rootId, contentType, body))

        return parts

    def _extractMsg(self, fileBytes: bytes, fileName: str) -> List[ContentPart]:
        """Parse Outlook MSG files using extract-msg (optional)."""
        rootId = makeId()

        try:
            import extract_msg
        except ImportError:
            logger.warning("extract-msg not installed -- MSG files will be treated as binary")
            return [self._rootTextPart(
                rootId, fileName,
                "MSG extraction requires the extract-msg package.",
                {"warning": True},
            )]

        try:
            msgFile = extract_msg.Message(io.BytesIO(fileBytes))
        except Exception as e:
            logger.error("EmailExtractor: failed to parse MSG: %s", e)
            return [self._rootTextPart(
                rootId, fileName, f"Failed to parse MSG: {e}", {"error": True},
            )]

        try:
            return self._collectMsgParts(msgFile, rootId)
        finally:
            # Release the handle even when reading a field raises. Closing
            # stays best-effort: a failure here must not mask the result.
            try:
                msgFile.close()
            except Exception as e:
                logger.debug("EmailExtractor: ignoring error while closing MSG: %s", e)

    def _collectMsgParts(self, msgFile, rootId: str) -> List[ContentPart]:
        """Turn an opened extract-msg message into header, body and attachment parts."""
        headerFields = (
            ("From", msgFile.sender),
            ("To", msgFile.to),
            ("Cc", getattr(msgFile, "cc", None)),
            ("Subject", msgFile.subject),
            ("Date", msgFile.date),
        )
        headerLines = [f"{name}: {value}" for name, value in headerFields if value]

        parts: List[ContentPart] = [self._rootTextPart(
            rootId, "headers", "\n".join(headerLines), {"emailPart": "headers"},
        )]

        body = msgFile.body
        if body:
            parts.append(self._bodyPart(rootId, "text/plain", body))

        htmlBody = getattr(msgFile, "htmlBody", None)
        if htmlBody:
            if isinstance(htmlBody, bytes):
                htmlBody = htmlBody.decode("utf-8", errors="replace")
            parts.append(self._bodyPart(rootId, "text/html", htmlBody))

        for attachment in (msgFile.attachments or []):
            attachName = (
                getattr(attachment, "longFilename", None)
                or getattr(attachment, "shortFilename", None)
                or "attachment"
            )
            attachData = getattr(attachment, "data", None)
            if not attachData:
                continue
            if not isinstance(attachData, (bytes, bytearray)):
                # NOTE(review): extract-msg appears to return a message object
                # rather than bytes for embedded messages - confirm. The
                # delegate path needs raw bytes, so skip instead of crashing.
                logger.warning(
                    "EmailExtractor: skipping MSG attachment %s with non-bytes payload (%s)",
                    attachName, type(attachData).__name__,
                )
                continue
            parts.extend(_delegateAttachment(bytes(attachData), attachName, rootId))

        return parts
|
|
|
|
|
|
def _buildHeaderText(msg) -> str:
    """Render the key headers of ``msg`` as newline-separated ``Name: value`` lines.

    Headers are looked up via ``msg.get`` in a fixed order; those that are
    missing or empty are left out. Returns "" when none is present.
    """
    wantedHeaders = ("From", "To", "Cc", "Subject", "Date", "Message-ID")
    lookedUp = ((name, msg.get(name)) for name in wantedHeaders)
    return "\n".join(f"{name}: {found}" for name, found in lookedUp if found)
|
|
|
|
|
|
def _delegateAttachment(attachData: bytes, attachName: str, parentId: str) -> List[ContentPart]:
    """Delegate an attachment to the appropriate type-specific extractor.

    The MIME type is guessed from ``attachName`` (falling back to
    ``application/octet-stream``) and used to resolve an extractor from a
    new ExtractorRegistry. Extractors that are EmailExtractor instances are
    not used, so an attached email is not expanded here.

    Args:
        attachData: Raw attachment payload.
        attachName: Attachment file name; also stored under the
            ``"emailAttachment"`` metadata key of every returned part.
        parentId: Id assigned as ``parentId`` of every returned part.

    Returns:
        The parts produced by the resolved extractor, re-parented to
        ``parentId``. If no extractor is usable, or it raises, a single
        ``binary`` part holding the base64-encoded payload.
    """
    import base64

    # Imported here rather than at module level, as in the original layout.
    from ..subRegistry import ExtractorRegistry

    guessedMime, _ = mimetypes.guess_type(attachName)
    detectedMime = guessedMime or "application/octet-stream"

    extractor = ExtractorRegistry().resolve(detectedMime, attachName)

    if extractor and not isinstance(extractor, EmailExtractor):
        try:
            childParts = extractor.extract(attachData, {"fileName": attachName, "mimeType": detectedMime})
            for part in childParts:
                part.parentId = parentId
                if not part.metadata:
                    part.metadata = {}
                part.metadata["emailAttachment"] = attachName
            return childParts
        except Exception as e:
            # Best effort: a failing extractor degrades to the binary fallback below.
            logger.warning("Extractor failed for email attachment %s: %s", attachName, e)

    # The encode step already tolerates a missing payload; len() must as well,
    # otherwise a None payload raises TypeError while building the metadata.
    payload = attachData or b""
    encodedData = base64.b64encode(payload).decode("utf-8") if payload else ""
    return [ContentPart(
        id=makeId(), parentId=parentId, label=attachName,
        typeGroup="binary", mimeType=detectedMime,
        data=encodedData,
        metadata={"size": len(payload), "emailAttachment": attachName},
    )]
|