# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
"""Email extractor for EML and MSG files.

Parses email headers, body (text/html), and attachments.
Attachments are delegated to the ExtractorRegistry for type-specific processing.

Optional dependency: extract-msg (for .msg files).
"""

from typing import Any, Dict, List
import email
import email.policy
import email.utils
import io
import logging
import mimetypes

from modules.datamodels.datamodelExtraction import ContentPart
from ..subUtils import makeId
from ..subRegistry import Extractor

logger = logging.getLogger(__name__)

_EMAIL_MIME_TYPES = [
    "message/rfc822",
    "application/vnd.ms-outlook",
]
_EMAIL_EXTENSIONS = [".eml", ".msg"]

# Content type of a body part -> (ContentPart label, "emailPart" metadata value).
_BODY_KINDS = {
    "text/plain": ("body_text", "body"),
    "text/html": ("body_html", "body_html"),
}


class EmailExtractor(Extractor):
    """Extractor for email files (EML, MSG).

    Produces:
    - 1 text ContentPart with header metadata (From, To, Subject, Date)
    - 1 text ContentPart per body part (plain text / HTML)
    - Delegated ContentParts for each attachment via ExtractorRegistry
    """

    def detect(self, fileName: str, mimeType: str, headBytes: bytes) -> bool:
        """Return True when the MIME type or the file extension marks an email.

        ``headBytes`` is accepted for interface compatibility but not inspected.
        """
        if mimeType in _EMAIL_MIME_TYPES:
            return True
        lower = (fileName or "").lower()
        return lower.endswith(tuple(_EMAIL_EXTENSIONS))

    def getSupportedExtensions(self) -> list[str]:
        """Return a copy of the supported file extensions."""
        return list(_EMAIL_EXTENSIONS)

    def getSupportedMimeTypes(self) -> list[str]:
        """Return a copy of the supported MIME types."""
        return list(_EMAIL_MIME_TYPES)

    def extract(self, fileBytes: bytes, context: Dict[str, Any]) -> List[ContentPart]:
        """Extract headers, bodies and attachments from an email file.

        Files whose name ends in ``.msg`` go through the Outlook parser;
        everything else is parsed as EML. A missing, ``None`` or empty
        ``fileName`` in ``context`` falls back to ``"email"``.
        """
        fileName = context.get("fileName") or "email"
        if fileName.lower().endswith(".msg"):
            return self._extractMsg(fileBytes, fileName)
        return self._extractEml(fileBytes, fileName)

    def _extractEml(self, fileBytes: bytes, fileName: str) -> List[ContentPart]:
        """Parse standard EML (RFC 822) using stdlib email.

        Returns the header part first (it is the root), followed by body and
        attachment parts parented to it. If parsing fails, a single text part
        describing the error is returned instead.
        """
        rootId = makeId()

        try:
            msg = email.message_from_bytes(fileBytes, policy=email.policy.default)
        except Exception as e:
            logger.error("EmailExtractor: failed to parse EML: %s", e)
            return [_noticePart(rootId, fileName, f"Failed to parse email: {e}", "error")]

        parts: List[ContentPart] = [ContentPart(
            id=rootId,
            parentId=None,
            label="headers",
            typeGroup="text",
            mimeType="text/plain",
            data=_buildHeaderText(msg),
            metadata={"emailPart": "headers"},
        )]

        for part in msg.walk():
            if part.is_multipart():
                continue

            # get_content_disposition() returns the lower-cased disposition
            # without its parameters, so "Attachment" is recognised and a
            # filename that merely contains "attachment" is not misclassified.
            if part.get_content_disposition() == "attachment":
                attachName = part.get_filename() or "attachment"
                attachData = part.get_payload(decode=True)
                if attachData:
                    parts.extend(_delegateAttachment(attachData, attachName, rootId))
                continue

            contentType = part.get_content_type()
            bodyKind = _BODY_KINDS.get(contentType)
            if bodyKind is None:
                continue

            body = _readTextPart(part)
            if body:
                label, emailPart = bodyKind
                parts.append(ContentPart(
                    id=makeId(),
                    parentId=rootId,
                    label=label,
                    typeGroup="text",
                    mimeType=contentType,
                    data=body,
                    metadata={"emailPart": emailPart},
                ))

        return parts

    def _extractMsg(self, fileBytes: bytes, fileName: str) -> List[ContentPart]:
        """Parse Outlook MSG files using extract-msg (optional).

        Returns a single notice part when extract-msg is not installed or the
        file cannot be opened. The opened message is always closed, even when
        reading one of its fields raises.
        """
        rootId = makeId()

        try:
            import extract_msg
        except ImportError:
            logger.warning("extract-msg not installed -- MSG files will be treated as binary")
            return [_noticePart(
                rootId, fileName, "MSG extraction requires the extract-msg package.", "warning",
            )]

        try:
            msgFile = extract_msg.Message(io.BytesIO(fileBytes))
        except Exception as e:
            logger.error("EmailExtractor: failed to parse MSG: %s", e)
            return [_noticePart(rootId, fileName, f"Failed to parse MSG: {e}", "error")]

        parts: List[ContentPart] = []
        try:
            headerLines = []
            if msgFile.sender:
                headerLines.append(f"From: {msgFile.sender}")
            if msgFile.to:
                headerLines.append(f"To: {msgFile.to}")
            if getattr(msgFile, "cc", None):
                headerLines.append(f"Cc: {msgFile.cc}")
            if msgFile.subject:
                headerLines.append(f"Subject: {msgFile.subject}")
            if msgFile.date:
                headerLines.append(f"Date: {msgFile.date}")

            parts.append(ContentPart(
                id=rootId,
                parentId=None,
                label="headers",
                typeGroup="text",
                mimeType="text/plain",
                data="\n".join(headerLines),
                metadata={"emailPart": "headers"},
            ))

            body = msgFile.body
            if body:
                parts.append(ContentPart(
                    id=makeId(),
                    parentId=rootId,
                    label="body_text",
                    typeGroup="text",
                    mimeType="text/plain",
                    data=body,
                    metadata={"emailPart": "body"},
                ))

            htmlBody = getattr(msgFile, "htmlBody", None)
            if htmlBody:
                if isinstance(htmlBody, bytes):
                    htmlBody = htmlBody.decode("utf-8", errors="replace")
                parts.append(ContentPart(
                    id=makeId(),
                    parentId=rootId,
                    label="body_html",
                    typeGroup="text",
                    mimeType="text/html",
                    data=htmlBody,
                    metadata={"emailPart": "body_html"},
                ))

            for attachment in (msgFile.attachments or []):
                attachName = (
                    getattr(attachment, "longFilename", None)
                    or getattr(attachment, "shortFilename", None)
                    or "attachment"
                )
                attachData = getattr(attachment, "data", None)
                if not attachData:
                    continue
                # Attachment data is not guaranteed to be raw bytes (e.g. an
                # embedded message object); the delegate path base64-encodes
                # and measures the payload, so only bytes can be passed on.
                if not isinstance(attachData, (bytes, bytearray)):
                    logger.warning(
                        "EmailExtractor: skipping MSG attachment %s with non-bytes data (%s)",
                        attachName,
                        type(attachData).__name__,
                    )
                    continue
                parts.extend(_delegateAttachment(bytes(attachData), attachName, rootId))
        finally:
            # Closing is best-effort, but it must happen on every path.
            try:
                msgFile.close()
            except Exception as e:
                logger.debug("EmailExtractor: error while closing MSG file: %s", e)

        return parts


def _noticePart(partId: str, label: str, message: str, flag: str) -> ContentPart:
    """Build a root-level text part carrying a notice, flagged in its metadata.

    ``flag`` becomes a metadata key set to True (e.g. "error" or "warning").
    """
    return ContentPart(
        id=partId,
        parentId=None,
        label=label,
        typeGroup="text",
        mimeType="text/plain",
        data=message,
        metadata={flag: True},
    )


def _readTextPart(part) -> str:
    """Return the decoded text of a body part, tolerating undecodable content.

    When the declared charset or transfer encoding cannot be handled, the raw
    payload is decoded as UTF-8 with replacement characters rather than
    aborting the whole extraction.
    """
    try:
        return str(part.get_content())
    except (LookupError, UnicodeError, ValueError) as e:
        logger.warning("EmailExtractor: could not decode %s part: %s", part.get_content_type(), e)
        raw = part.get_payload(decode=True)
        if isinstance(raw, bytes):
            return raw.decode("utf-8", errors="replace")
        return ""


def _buildHeaderText(msg) -> str:
    """Build a readable text summary of key email headers.

    Headers that are absent or empty are omitted; the rest appear one per
    line as ``Name: value`` in a fixed order.
    """
    lines = []
    for header in ("From", "To", "Cc", "Subject", "Date", "Message-ID"):
        value = msg.get(header)
        if value:
            lines.append(f"{header}: {value}")
    return "\n".join(lines)


def _delegateAttachment(attachData: bytes, attachName: str, parentId: str) -> List[ContentPart]:
    """Delegate an attachment to the appropriate type-specific extractor.

    The MIME type is guessed from the attachment name. When no extractor
    resolves, the extractor fails, or the extractor is itself an
    EmailExtractor, the attachment is returned as one base64-encoded binary
    part.
    """
    guessedMime, _ = mimetypes.guess_type(attachName)
    detectedMime = guessedMime or "application/octet-stream"

    from ..subRegistry import ExtractorRegistry
    registry = ExtractorRegistry()
    extractor = registry.resolve(detectedMime, attachName)

    # Attached emails are not parsed recursively; they take the binary path.
    if extractor and not isinstance(extractor, EmailExtractor):
        try:
            childParts = extractor.extract(
                attachData, {"fileName": attachName, "mimeType": detectedMime},
            )
            for part in childParts:
                part.parentId = parentId
                if not part.metadata:
                    part.metadata = {}
                part.metadata["emailAttachment"] = attachName
            return childParts
        except Exception as e:
            logger.warning("Extractor failed for email attachment %s: %s", attachName, e)

    import base64
    encodedData = base64.b64encode(attachData).decode("utf-8") if attachData else ""
    return [ContentPart(
        id=makeId(),
        parentId=parentId,
        label=attachName,
        typeGroup="binary",
        mimeType=detectedMime,
        data=encodedData,
        metadata={"size": len(attachData), "emailAttachment": attachName},
    )]