# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
"""Outlook bootstrap for the unified knowledge ingestion lane.

Unlike SharePoint, Outlook messages are "virtual documents" — we never persist
file bytes in the store. Each message becomes a `sourceKind="outlook_message"`
IngestionJob whose `contentObjects` carry the header, snippet and cleaned body
so retrieval can show a compact answer without fetching Graph again.

Attachments are optional (`includeAttachments` limit flag) and enqueued as
child jobs with `sourceKind="outlook_attachment"` + `provenance.parentId`.
"""

from __future__ import annotations

import asyncio
import hashlib
import logging
import time
from dataclasses import dataclass, field
from typing import Any, Callable, Dict, List, Optional

from modules.serviceCenter.services.serviceKnowledge.subTextClean import cleanEmailBody

logger = logging.getLogger(__name__)

MAX_MESSAGES_DEFAULT = 500
MAX_FOLDERS_DEFAULT = 5
MAX_BODY_CHARS_DEFAULT = 8000
MAX_ATTACHMENT_BYTES_DEFAULT = 10 * 1024 * 1024

WELL_KNOWN_FOLDERS = ("inbox", "sentitems")


@dataclass
class OutlookBootstrapLimits:
    """Caps and content options applied to one Outlook bootstrap run."""

    # Stop once indexed + duplicate messages reach this count.
    maxMessages: int = MAX_MESSAGES_DEFAULT
    # Maximum number of mail folders to walk.
    maxFolders: int = MAX_FOLDERS_DEFAULT
    # Passed to cleanEmailBody as `maxChars`.
    maxBodyChars: int = MAX_BODY_CHARS_DEFAULT
    # When True, file attachments are ingested as child jobs.
    includeAttachments: bool = False
    # Attachments whose reported size exceeds this are skipped.
    maxAttachmentBytes: int = MAX_ATTACHMENT_BYTES_DEFAULT
    # Only fetch messages newer than N days. None disables filter.
    maxAgeDays: Optional[int] = 90
    # Content depth: "metadata" | "snippet" | "full"
    mailContentDepth: str = "full"
    # Pass-through to IngestionJob.neutralize
    neutralize: bool = False


@dataclass
class OutlookBootstrapResult:
    """Mutable counters accumulated while a bootstrap run progresses."""

    connectionId: str
    indexed: int = 0
    skippedDuplicate: int = 0
    skippedPolicy: int = 0
    failed: int = 0
    attachmentsIndexed: int = 0
    errors: List[str] = field(default_factory=list)


def _syntheticMessageId(connectionId: str, messageId: str) -> str:
    """Return a deterministic source id (`om:` prefix) for a message."""
    token = hashlib.sha256(f"{connectionId}:{messageId}".encode("utf-8")).hexdigest()[:16]
    return f"om:{connectionId[:8]}:{token}"


def _syntheticAttachmentId(connectionId: str, messageId: str, attachmentId: str) -> str:
    """Return a deterministic source id (`oa:` prefix) for an attachment."""
    token = hashlib.sha256(
        f"{connectionId}:{messageId}:{attachmentId}".encode("utf-8")
    ).hexdigest()[:16]
    return f"oa:{connectionId[:8]}:{token}"


def _extractRecipient(recipient: Dict[str, Any]) -> str:
    """Format one recipient dict as `Name <address>`, or whichever part exists.

    Returns an empty string when neither name nor address is present.
    """
    email = (recipient or {}).get("emailAddress") or {}
    name = email.get("name") or ""
    addr = email.get("address") or ""
    if name and addr:
        return f"{name} <{addr}>"
    return addr or name


def _joinRecipients(recipients: List[Dict[str, Any]]) -> str:
    """Join the non-empty formatted recipients with `, `."""
    return ", ".join(filter(None, [_extractRecipient(r) for r in recipients or []]))


def _buildContentObjects(
    message: Dict[str, Any],
    maxBodyChars: int,
    mailContentDepth: str = "full",
) -> List[Dict[str, Any]]:
    """Build content objects for an Outlook message.

    `mailContentDepth` mirrors the Gmail walker:
      - "metadata": header only
      - "snippet":  header + bodyPreview (~255 chars)
      - "full":     header + snippet + cleaned body (default)

    The header part is always emitted, so the returned list is never empty.
    """
    subject = message.get("subject") or "(no subject)"
    fromAddr = _extractRecipient(message.get("from") or {})
    toAddr = _joinRecipients(message.get("toRecipients") or [])
    ccAddr = _joinRecipients(message.get("ccRecipients") or [])
    received = message.get("receivedDateTime") or ""
    snippet = message.get("bodyPreview") or ""

    parts: List[Dict[str, Any]] = []
    header = (
        f"Subject: {subject}\n"
        f"From: {fromAddr}\n"
        f"To: {toAddr}\n"
        + (f"Cc: {ccAddr}\n" if ccAddr else "")
        + f"Date: {received}"
    )
    parts.append({
        "contentObjectId": "header",
        "contentType": "text",
        "data": header,
        "contextRef": {"part": "header"},
    })

    if mailContentDepth in ("snippet", "full") and snippet:
        parts.append({
            "contentObjectId": "snippet",
            "contentType": "text",
            "data": snippet,
            "contextRef": {"part": "snippet"},
        })

    if mailContentDepth == "full":
        body = message.get("body") or {}
        bodyContent = body.get("content") or ""
        cleanedBody = cleanEmailBody(bodyContent, maxChars=maxBodyChars) if bodyContent else ""
        if cleanedBody:
            parts.append({
                "contentObjectId": "body",
                "contentType": "text",
                "data": cleanedBody,
                "contextRef": {"part": "body"},
            })
    return parts


async def bootstrapOutlook(
    connectionId: str,
    *,
    progressCb: Optional[Callable[[int, Optional[str]], None]] = None,
    adapter: Any = None,
    connection: Any = None,
    knowledgeService: Any = None,
    limits: Optional[OutlookBootstrapLimits] = None,
) -> Dict[str, Any]:
    """Enumerate Outlook folders (inbox + sent by default) and ingest messages.

    Args:
        connectionId: Id of the user connection to bootstrap.
        progressCb: Optional callback receiving `(percent, message)`.
        adapter, connection, knowledgeService: Optional injected dependencies;
            if any of the three is None, all three are resolved from
            `connectionId` via `_resolveDependencies`.
        limits: Optional explicit limits. When omitted they are derived from
            the stored connection preferences.

    Returns:
        The summary dict produced by `_finalizeResult`.

    Raises:
        ValueError: Propagated from `_resolveDependencies` when the connection
            or its token cannot be resolved.
    """
    if limits is None:
        # Preferences are only needed to derive limits, so skip the lookup
        # when the caller supplied limits explicitly.
        from modules.serviceCenter.services.serviceKnowledge.subConnectorPrefs import loadConnectionPrefs

        prefs = loadConnectionPrefs(connectionId)
        prefsMaxAge = prefs.maxAgeDays
        limits = OutlookBootstrapLimits(
            includeAttachments=prefs.mailIndexAttachments,
            # `or 0` keeps an unset (None) preference from raising on `>`.
            maxAgeDays=prefsMaxAge if (prefsMaxAge or 0) > 0 else None,
            mailContentDepth=prefs.mailContentDepth,
            neutralize=prefs.neutralizeBeforeEmbed,
        )

    # NOTE: despite the name this is time.time() in seconds; _finalizeResult
    # converts the elapsed time to milliseconds.
    startMs = time.time()
    result = OutlookBootstrapResult(connectionId=connectionId)

    logger.info(
        "ingestion.connection.bootstrap.started part=outlook connectionId=%s",
        connectionId,
        extra={
            "event": "ingestion.connection.bootstrap.started",
            "part": "outlook",
            "connectionId": connectionId,
        },
    )

    if adapter is None or knowledgeService is None or connection is None:
        adapter, connection, knowledgeService = await _resolveDependencies(connectionId)

    mandateId = str(getattr(connection, "mandateId", "") or "") if connection is not None else ""
    userId = str(getattr(connection, "userId", "") or "") if connection is not None else ""

    folderIds = await _selectFolderIds(adapter, limits)
    for folderId in folderIds:
        if result.indexed + result.skippedDuplicate >= limits.maxMessages:
            break
        try:
            await _ingestFolder(
                adapter=adapter,
                knowledgeService=knowledgeService,
                connectionId=connectionId,
                mandateId=mandateId,
                userId=userId,
                folderId=folderId,
                limits=limits,
                result=result,
                progressCb=progressCb,
            )
        except Exception as exc:
            # One broken folder must not abort the remaining folders.
            logger.error("outlook ingestion folder %s failed: %s", folderId, exc, exc_info=True)
            result.errors.append(f"folder({folderId}): {exc}")

    return _finalizeResult(connectionId, result, startMs)


async def _resolveDependencies(connectionId: str):
    """Resolve `(adapter, connection, knowledgeService)` for a connection.

    Raises:
        ValueError: If the connection does not exist or no access token is
            available for it.
    """
    from modules.interfaces.interfaceDbApp import getRootInterface
    from modules.auth import TokenManager
    from modules.connectors.providerMsft.connectorMsft import MsftConnector
    from modules.serviceCenter import getService
    from modules.serviceCenter.context import ServiceCenterContext
    from modules.security.rootAccess import getRootUser

    rootInterface = getRootInterface()
    connection = rootInterface.getUserConnectionById(connectionId)
    if connection is None:
        raise ValueError(f"UserConnection not found: {connectionId}")

    token = TokenManager().getFreshToken(connectionId)
    if not token or not token.tokenAccess:
        raise ValueError(f"No valid token for connection {connectionId}")

    provider = MsftConnector(connection, token.tokenAccess)
    adapter = provider.getServiceAdapter("outlook")

    rootUser = getRootUser()
    ctx = ServiceCenterContext(
        user=rootUser,
        mandate_id=str(getattr(connection, "mandateId", "") or ""),
    )
    knowledgeService = getService("knowledge", ctx)
    return adapter, connection, knowledgeService


async def _selectFolderIds(adapter: Any, limits: OutlookBootstrapLimits) -> List[str]:
    """Prefer well-known folders (inbox, sentitems); fall back to browse().

    Lookup failures are best-effort: a folder that cannot be resolved is
    skipped. At most `limits.maxFolders` ids are returned.
    """
    folderIds: List[str] = []
    for wellKnown in WELL_KNOWN_FOLDERS:
        if len(folderIds) >= limits.maxFolders:
            break
        try:
            row = await adapter._graphGet(f"me/mailFolders/{wellKnown}")
        except Exception as exc:
            logger.debug("outlook well-known folder %s lookup failed: %s", wellKnown, exc)
            row = None
        if isinstance(row, dict) and "error" not in row and row.get("id"):
            folderIds.append(row["id"])

    if len(folderIds) < limits.maxFolders:
        try:
            entries = await adapter.browse("/")
        except Exception as exc:
            logger.debug("outlook folder browse failed: %s", exc)
            entries = []
        for entry in entries:
            metadata = getattr(entry, "metadata", {}) or {}
            fid = metadata.get("id")
            if fid and fid not in folderIds:
                folderIds.append(fid)
            if len(folderIds) >= limits.maxFolders:
                break
    return folderIds


async def _ingestFolder(
    *,
    adapter: Any,
    knowledgeService: Any,
    connectionId: str,
    mandateId: str,
    userId: str,
    folderId: str,
    limits: OutlookBootstrapLimits,
    result: OutlookBootstrapResult,
    progressCb: Optional[Callable[[int, Optional[str]], None]],
) -> None:
    """Page through one folder (newest first) and ingest each message.

    Stops when the folder is exhausted, when indexed + duplicate messages
    reach `limits.maxMessages`, or on the first Graph error (which is
    recorded in `result.errors`). Counters in `result` are updated in place.
    """
    remaining = limits.maxMessages - (result.indexed + result.skippedDuplicate)
    if remaining <= 0:
        return

    pageSize = min(100, remaining)
    select = (
        "id,subject,from,toRecipients,ccRecipients,receivedDateTime,"
        "bodyPreview,body,internetMessageId,hasAttachments,changeKey"
    )
    endpoint: Optional[str] = (
        f"me/mailFolders/{folderId}/messages"
        f"?$top={pageSize}&$orderby=receivedDateTime desc&$select={select}"
    )
    # Keep header-based age filter in Graph itself to avoid shipping ancient
    # messages we'd discard client-side.
    if limits.maxAgeDays:
        from datetime import datetime, timezone, timedelta

        cutoff = datetime.now(timezone.utc) - timedelta(days=limits.maxAgeDays)
        cutoffIso = cutoff.strftime("%Y-%m-%dT%H:%M:%SZ")
        endpoint = f"{endpoint}&$filter=receivedDateTime ge {cutoffIso}"

    while endpoint and (result.indexed + result.skippedDuplicate) < limits.maxMessages:
        try:
            page = await adapter._graphGet(endpoint)
        except Exception as exc:
            logger.warning("outlook graph page failed for folder %s: %s", folderId, exc)
            result.errors.append(f"graph({folderId}): {exc}")
            return
        if not isinstance(page, dict) or "error" in page:
            err = (page or {}).get("error") if isinstance(page, dict) else "unknown"
            logger.warning("outlook graph page error for folder %s: %s", folderId, err)
            result.errors.append(f"graph({folderId}): {err}")
            return

        for message in page.get("value", []) or []:
            if result.indexed + result.skippedDuplicate >= limits.maxMessages:
                break
            await _ingestMessage(
                adapter=adapter,
                knowledgeService=knowledgeService,
                connectionId=connectionId,
                mandateId=mandateId,
                userId=userId,
                message=message,
                limits=limits,
                result=result,
                progressCb=progressCb,
            )

        nextLink = page.get("@odata.nextLink")
        if not nextLink:
            break
        # Strip Graph base so adapter._graphGet accepts the relative path.
        from modules.connectors.providerMsft.connectorMsft import _stripGraphBase

        endpoint = _stripGraphBase(nextLink)


async def _ingestMessage(
    *,
    adapter: Any,
    knowledgeService: Any,
    connectionId: str,
    mandateId: str,
    userId: str,
    message: Dict[str, Any],
    limits: OutlookBootstrapLimits,
    result: OutlookBootstrapResult,
    progressCb: Optional[Callable[[int, Optional[str]], None]],
) -> None:
    """Submit one message as an `outlook_message` ingestion job.

    Updates `result` counters according to the returned handle status,
    optionally ingests attachments, and reports progress every 50 counted
    (indexed or duplicate) messages. Ingestion errors are recorded in
    `result` rather than raised.
    """
    from modules.serviceCenter.services.serviceKnowledge.mainServiceKnowledge import IngestionJob

    messageId = message.get("id")
    if not messageId:
        result.skippedPolicy += 1
        return

    revision = message.get("changeKey") or message.get("internetMessageId")
    subject = message.get("subject") or "(no subject)"
    syntheticId = _syntheticMessageId(connectionId, messageId)
    # A whitespace-only subject trims to "", which would yield the bare name
    # ".eml"; fall back to the message id in that case.
    baseName = subject[:80].strip() or messageId
    fileName = f"{baseName}.eml"
    contentObjects = _buildContentObjects(
        message, limits.maxBodyChars, mailContentDepth=limits.mailContentDepth
    )
    # Always at least the header is emitted, so `contentObjects` is non-empty.

    try:
        handle = await knowledgeService.requestIngestion(
            IngestionJob(
                sourceKind="outlook_message",
                sourceId=syntheticId,
                fileName=fileName,
                mimeType="message/rfc822",
                userId=userId,
                mandateId=mandateId,
                contentObjects=contentObjects,
                contentVersion=revision,
                neutralize=limits.neutralize,
                provenance={
                    "connectionId": connectionId,
                    "authority": "msft",
                    "service": "outlook",
                    "externalItemId": messageId,
                    "internetMessageId": message.get("internetMessageId"),
                    "tier": limits.mailContentDepth,
                },
            )
        )
    except Exception as exc:
        logger.error("outlook ingestion %s failed: %s", messageId, exc, exc_info=True)
        result.failed += 1
        result.errors.append(f"ingest({messageId}): {exc}")
        return

    # `counted` is True only when this message advanced the progress counter.
    counted = True
    if handle.status == "duplicate":
        result.skippedDuplicate += 1
    elif handle.status == "indexed":
        result.indexed += 1
    else:
        result.failed += 1
        counted = False

    if limits.includeAttachments and message.get("hasAttachments"):
        try:
            await _ingestAttachments(
                adapter=adapter,
                knowledgeService=knowledgeService,
                connectionId=connectionId,
                mandateId=mandateId,
                userId=userId,
                messageId=messageId,
                parentSyntheticId=syntheticId,
                limits=limits,
                result=result,
            )
        except Exception as exc:
            logger.warning("outlook attachments %s failed: %s", messageId, exc)
            result.errors.append(f"attachments({messageId}): {exc}")

    processed = result.indexed + result.skippedDuplicate
    # Requiring `counted` avoids reporting at processed == 0 and re-reporting
    # the same milestone while subsequent messages fail.
    if progressCb is not None and counted and processed % 50 == 0:
        try:
            progressCb(
                min(90, 10 + int(80 * processed / max(1, limits.maxMessages))),
                f"outlook processed={processed}",
            )
        except Exception as exc:
            # Progress reporting is best-effort and must not abort ingestion.
            logger.debug("outlook progress callback failed: %s", exc)
        logger.info(
            "ingestion.connection.bootstrap.progress part=outlook processed=%d skippedDup=%d failed=%d",
            processed,
            result.skippedDuplicate,
            result.failed,
            extra={
                "event": "ingestion.connection.bootstrap.progress",
                "part": "outlook",
                "connectionId": connectionId,
                "processed": processed,
                "skippedDup": result.skippedDuplicate,
                "failed": result.failed,
            },
        )
    # Yield control so other tasks on the event loop can run.
    await asyncio.sleep(0)


async def _ingestAttachments(
    *,
    adapter: Any,
    knowledgeService: Any,
    connectionId: str,
    mandateId: str,
    userId: str,
    messageId: str,
    parentSyntheticId: str,
    limits: OutlookBootstrapLimits,
    result: OutlookBootstrapResult,
) -> None:
    """Child ingestion jobs for file attachments (skip inline & oversized).

    Only `#microsoft.graph.fileAttachment` items carrying `contentBytes` are
    processed. Each attachment is decoded, run through extraction and
    submitted as an `outlook_attachment` job linked to the parent message via
    `provenance.parentId`. Counters in `result` are updated in place.
    """
    from modules.serviceCenter.services.serviceKnowledge.mainServiceKnowledge import IngestionJob
    from modules.datamodels.datamodelExtraction import ExtractionOptions
    from modules.serviceCenter.services.serviceExtraction.subPipeline import runExtraction
    from modules.serviceCenter.services.serviceExtraction.subRegistry import (
        ExtractorRegistry,
        ChunkerRegistry,
    )
    import base64

    page = await adapter._graphGet(f"me/messages/{messageId}/attachments")
    if not isinstance(page, dict) or "error" in page:
        # Record the failure instead of returning silently, matching how
        # _ingestFolder reports Graph errors.
        err = page.get("error") if isinstance(page, dict) else "unknown"
        logger.warning("outlook attachments list error for message %s: %s", messageId, err)
        result.errors.append(f"attachments({messageId}): {err}")
        return

    extractorRegistry = ExtractorRegistry()
    chunkerRegistry = ChunkerRegistry()

    for attachment in page.get("value", []) or []:
        if attachment.get("@odata.type") != "#microsoft.graph.fileAttachment":
            continue
        if attachment.get("isInline"):
            continue
        size = int(attachment.get("size") or 0)
        if size and size > limits.maxAttachmentBytes:
            result.skippedPolicy += 1
            continue
        contentBytesB64 = attachment.get("contentBytes")
        if not contentBytesB64:
            continue
        try:
            rawBytes = base64.b64decode(contentBytesB64)
        except Exception:
            result.skippedPolicy += 1
            continue

        fileName = attachment.get("name") or "attachment"
        mimeType = attachment.get("contentType") or "application/octet-stream"
        attachmentId = attachment.get("id") or fileName
        syntheticId = _syntheticAttachmentId(connectionId, messageId, attachmentId)

        try:
            extracted = runExtraction(
                extractorRegistry,
                chunkerRegistry,
                rawBytes,
                fileName,
                mimeType,
                ExtractionOptions(mergeStrategy=None),
            )
        except Exception as exc:
            logger.warning("outlook attachment extract %s failed: %s", attachmentId, exc)
            result.failed += 1
            continue

        contentObjects: List[Dict[str, Any]] = []
        for part in getattr(extracted, "parts", None) or []:
            data = getattr(part, "data", None) or ""
            if not data or not str(data).strip():
                continue
            typeGroup = getattr(part, "typeGroup", "text") or "text"
            contentType = "text"
            if typeGroup == "image":
                contentType = "image"
            elif typeGroup in ("binary", "container"):
                contentType = "other"
            contentObjects.append({
                "contentObjectId": getattr(part, "id", ""),
                "contentType": contentType,
                "data": data,
                "contextRef": {
                    "containerPath": fileName,
                    "location": getattr(part, "label", None) or "attachment",
                    **(getattr(part, "metadata", None) or {}),
                },
            })
        if not contentObjects:
            result.skippedPolicy += 1
            continue

        try:
            await knowledgeService.requestIngestion(
                IngestionJob(
                    sourceKind="outlook_attachment",
                    sourceId=syntheticId,
                    fileName=fileName,
                    mimeType=mimeType,
                    userId=userId,
                    mandateId=mandateId,
                    contentObjects=contentObjects,
                    neutralize=limits.neutralize,
                    provenance={
                        "connectionId": connectionId,
                        "authority": "msft",
                        "service": "outlook",
                        "parentId": parentSyntheticId,
                        "externalItemId": attachmentId,
                        "parentMessageId": messageId,
                    },
                )
            )
            result.attachmentsIndexed += 1
        except Exception as exc:
            logger.warning("outlook attachment ingest %s failed: %s", attachmentId, exc)
            result.failed += 1


def _finalizeResult(connectionId: str, result: OutlookBootstrapResult, startMs: float) -> Dict[str, Any]:
    """Log the completion event and return the run summary as a dict.

    `startMs` is the `time.time()` value captured at start (seconds); the
    reported `durationMs` is in milliseconds. At most the first 20 errors are
    included in the returned dict.
    """
    durationMs = int((time.time() - startMs) * 1000)
    logger.info(
        "ingestion.connection.bootstrap.done part=outlook connectionId=%s indexed=%d skippedDup=%d skippedPolicy=%d attachments=%d failed=%d durationMs=%d",
        connectionId,
        result.indexed,
        result.skippedDuplicate,
        result.skippedPolicy,
        result.attachmentsIndexed,
        result.failed,
        durationMs,
        extra={
            "event": "ingestion.connection.bootstrap.done",
            "part": "outlook",
            "connectionId": connectionId,
            "indexed": result.indexed,
            "skippedDup": result.skippedDuplicate,
            "skippedPolicy": result.skippedPolicy,
            "attachmentsIndexed": result.attachmentsIndexed,
            "failed": result.failed,
            "durationMs": durationMs,
        },
    )
    return {
        "connectionId": result.connectionId,
        "indexed": result.indexed,
        "skippedDuplicate": result.skippedDuplicate,
        "skippedPolicy": result.skippedPolicy,
        "attachmentsIndexed": result.attachmentsIndexed,
        "failed": result.failed,
        "durationMs": durationMs,
        "errors": result.errors[:20],
    }