gateway/modules/serviceCenter/services/serviceKnowledge/subConnectorSyncOutlook.py
Ida 6a5ff1ff7c feat(rag): P1 user-connection hooks + retrieval threshold fix
- connection.established/revoked callbacks from OAuth routes and
  connection management endpoints
- KnowledgeIngestionConsumer dispatches bootstrap job (established)
  and synchronous purge (revoked)
- FileContentIndex: add connectionId + sourceKind columns
- SharePoint bootstrap with @odata.nextLink pagination and eTag-based
  idempotency
- Outlook bootstrap treats messages as virtual documents with
  cleanEmailBody for HTML/quote/signature stripping
- fix(rag): lower buildAgentContext minScore thresholds from
  0.55/0.65/0.70 to 0.35 — previous values blocked all real matches
  from text-embedding-3-small
- 24 new unit tests covering purge, consumer dispatch, email cleaning
  and both bootstrap paths
2026-04-29 14:39:40 +02:00

551 lines
20 KiB
Python

# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
"""Outlook bootstrap for the unified knowledge ingestion lane.
Unlike SharePoint, Outlook messages are "virtual documents" — we never persist
file bytes in the store. Each message becomes a `sourceKind="outlook_message"`
IngestionJob whose `contentObjects` carry the header, snippet and cleaned body
so retrieval can show a compact answer without fetching Graph again.
Attachments are optional (`includeAttachments` limit flag) and enqueued as
child jobs with `sourceKind="outlook_attachment"` + `provenance.parentId`.
"""
from __future__ import annotations
import asyncio
import hashlib
import logging
import time
from dataclasses import dataclass, field
from typing import Any, Callable, Dict, List, Optional
from modules.serviceCenter.services.serviceKnowledge.subTextClean import cleanEmailBody
# Module-level logger shared by every bootstrap helper in this file.
logger = logging.getLogger(__name__)
# Default for OutlookBootstrapLimits.maxMessages; the bootstrap compares it
# against indexed + duplicate-skipped messages.
MAX_MESSAGES_DEFAULT = 500
# Default for OutlookBootstrapLimits.maxFolders (number of folder ids selected).
MAX_FOLDERS_DEFAULT = 5
# Default for OutlookBootstrapLimits.maxBodyChars, forwarded to cleanEmailBody
# as `maxChars`.
MAX_BODY_CHARS_DEFAULT = 8000
# Default for OutlookBootstrapLimits.maxAttachmentBytes (10 MiB); attachments
# reporting a larger size are skipped.
MAX_ATTACHMENT_BYTES_DEFAULT = 10 * 1024 * 1024
# Folder names requested via `me/mailFolders/{name}` before falling back to
# adapter.browse("/").
WELL_KNOWN_FOLDERS = ("inbox", "sentitems")
@dataclass
class OutlookBootstrapLimits:
    """Caps and switches that bound a single Outlook bootstrap run."""

    # Enumeration stops once indexed + duplicate-skipped messages reach this
    # value (failed messages are not part of that sum).
    maxMessages: int = MAX_MESSAGES_DEFAULT
    # Upper bound on the number of mail folder ids selected for ingestion.
    maxFolders: int = MAX_FOLDERS_DEFAULT
    # Passed to cleanEmailBody as `maxChars` when building the body part.
    maxBodyChars: int = MAX_BODY_CHARS_DEFAULT
    # When True, messages flagged `hasAttachments` also get their file
    # attachments ingested as child jobs.
    includeAttachments: bool = False
    # Attachments whose reported `size` exceeds this are skipped.
    maxAttachmentBytes: int = MAX_ATTACHMENT_BYTES_DEFAULT
    # Only fetch messages newer than N days. None (or 0) disables the filter.
    maxAgeDays: Optional[int] = 90
@dataclass
class OutlookBootstrapResult:
    """Mutable counters accumulated while bootstrapping one connection."""

    # Connection the run belongs to; echoed back in the final result dict.
    connectionId: str
    # Messages whose ingestion handle reported status "indexed".
    indexed: int = 0
    # Messages whose ingestion handle reported status "duplicate".
    skippedDuplicate: int = 0
    # Items dropped by policy, e.g. messages without an id, oversized or
    # undecodable attachments, attachments with no extractable content.
    skippedPolicy: int = 0
    # Items whose ingestion or extraction raised or returned another status.
    failed: int = 0
    # Attachment child jobs counted as indexed by _ingestAttachments.
    attachmentsIndexed: int = 0
    # Human-readable error strings; only the first 20 are returned to callers.
    errors: List[str] = field(default_factory=list)
def _syntheticMessageId(connectionId: str, messageId: str) -> str:
    """Return the deterministic synthetic source id for an Outlook message.

    The id has the shape ``om:<connectionId[:8]>:<hash>`` where ``<hash>`` is
    the first 16 hex characters of the SHA-256 of ``"connectionId:messageId"``.
    """
    seed = f"{connectionId}:{messageId}"
    fullDigest = hashlib.sha256(seed.encode("utf-8")).hexdigest()
    shortHash = fullDigest[:16]
    connectionPrefix = connectionId[:8]
    return f"om:{connectionPrefix}:{shortHash}"
def _syntheticAttachmentId(connectionId: str, messageId: str, attachmentId: str) -> str:
    """Return the deterministic synthetic source id for a message attachment.

    The id has the shape ``oa:<connectionId[:8]>:<hash>`` where ``<hash>`` is
    the first 16 hex characters of the SHA-256 of
    ``"connectionId:messageId:attachmentId"``.
    """
    seed = f"{connectionId}:{messageId}:{attachmentId}"
    fullDigest = hashlib.sha256(seed.encode("utf-8")).hexdigest()
    shortHash = fullDigest[:16]
    connectionPrefix = connectionId[:8]
    return f"oa:{connectionPrefix}:{shortHash}"
def _extractRecipient(recipient: Dict[str, Any]) -> str:
    """Format one Graph recipient entry as display text.

    Returns ``"Name <address>"`` when both parts are present, otherwise
    whichever part exists, or ``""`` when the entry carries neither.
    """
    emailAddress = (recipient or {}).get("emailAddress") or {}
    displayName = emailAddress.get("name") or ""
    address = emailAddress.get("address") or ""
    if not (displayName and address):
        # At most one of the two is set; prefer the address.
        return address or displayName
    return f"{displayName} <{address}>"
def _joinRecipients(recipients: List[Dict[str, Any]]) -> str:
    """Render a list of Graph recipients as one comma-separated string.

    Entries that format to an empty string are dropped; ``None`` or an empty
    list yields ``""``.
    """
    rendered = (_extractRecipient(entry) for entry in recipients or [])
    return ", ".join(text for text in rendered if text)
def _buildContentObjects(message: Dict[str, Any], maxBodyChars: int) -> List[Dict[str, Any]]:
    """Convert a Graph message dict into ordered text content objects.

    A ``header`` part (subject, from, to, optional cc, date) is always
    emitted. A ``snippet`` part (from ``bodyPreview``) and a ``body`` part
    (output of ``cleanEmailBody``) follow only when they are non-empty.

    Args:
        message: Graph message payload.
        maxBodyChars: Forwarded to ``cleanEmailBody`` as ``maxChars``.

    Returns:
        The list of content-object dicts in header, snippet, body order.
    """

    def _textPart(partName: str, text: str) -> Dict[str, Any]:
        # Every part shares the same shape; only the name and payload differ.
        return {
            "contentObjectId": partName,
            "contentType": "text",
            "data": text,
            "contextRef": {"part": partName},
        }

    subject = message.get("subject") or "(no subject)"
    sender = _extractRecipient(message.get("from") or {})
    toLine = _joinRecipients(message.get("toRecipients") or [])
    ccLine = _joinRecipients(message.get("ccRecipients") or [])
    receivedAt = message.get("receivedDateTime") or ""
    preview = message.get("bodyPreview") or ""

    rawBody = message.get("body") or {}
    bodyText = rawBody.get("content") or ""
    isHtml = (rawBody.get("contentType") or "").lower() == "html"
    # HTML and plain-text bodies go through the same cleaner; the call is
    # skipped only for a non-HTML message without any content.
    if isHtml or bodyText:
        cleanedBody = cleanEmailBody(bodyText, maxChars=maxBodyChars)
    else:
        cleanedBody = ""

    headerLines = [
        f"Subject: {subject}",
        f"From: {sender}",
        f"To: {toLine}",
    ]
    if ccLine:
        headerLines.append(f"Cc: {ccLine}")
    headerLines.append(f"Date: {receivedAt}")

    contentObjects: List[Dict[str, Any]] = [_textPart("header", "\n".join(headerLines))]
    if preview:
        contentObjects.append(_textPart("snippet", preview))
    if cleanedBody:
        contentObjects.append(_textPart("body", cleanedBody))
    return contentObjects
async def bootstrapOutlook(
    connectionId: str,
    *,
    progressCb: Optional[Callable[[int, Optional[str]], None]] = None,
    adapter: Any = None,
    connection: Any = None,
    knowledgeService: Any = None,
    limits: Optional[OutlookBootstrapLimits] = None,
) -> Dict[str, Any]:
    """Enumerate Outlook folders (inbox + sent by default) and ingest messages.

    Args:
        connectionId: Id of the user connection being bootstrapped.
        progressCb: Optional callback forwarded to the per-message ingestion.
        adapter: Outlook service adapter; resolved when missing.
        connection: User connection record; resolved when missing.
        knowledgeService: Knowledge service; resolved when missing.
        limits: Caps for the run; defaults to ``OutlookBootstrapLimits()``.

    Returns:
        The summary dict produced by ``_finalizeResult``.

    Note:
        If any one of ``adapter``, ``connection`` or ``knowledgeService`` is
        ``None``, all three are replaced by ``_resolveDependencies``. Errors
        raised there propagate to the caller. A failure inside one folder is
        logged and recorded, and the remaining folders are still processed.
    """
    activeLimits = limits or OutlookBootstrapLimits()
    startedAt = time.time()
    outcome = OutlookBootstrapResult(connectionId=connectionId)

    logger.info(
        "ingestion.connection.bootstrap.started part=outlook connectionId=%s",
        connectionId,
        extra={
            "event": "ingestion.connection.bootstrap.started",
            "part": "outlook",
            "connectionId": connectionId,
        },
    )

    dependencyMissing = any(dep is None for dep in (adapter, knowledgeService, connection))
    if dependencyMissing:
        adapter, connection, knowledgeService = await _resolveDependencies(connectionId)

    # getattr with a default also copes with a None connection, so no
    # separate None guard is required here.
    mandateId = str(getattr(connection, "mandateId", "") or "")
    userId = str(getattr(connection, "userId", "") or "")

    for folderId in await _selectFolderIds(adapter, activeLimits):
        alreadyProcessed = outcome.indexed + outcome.skippedDuplicate
        if alreadyProcessed >= activeLimits.maxMessages:
            break
        try:
            await _ingestFolder(
                adapter=adapter,
                knowledgeService=knowledgeService,
                connectionId=connectionId,
                mandateId=mandateId,
                userId=userId,
                folderId=folderId,
                limits=activeLimits,
                result=outcome,
                progressCb=progressCb,
            )
        except Exception as exc:
            logger.error("outlook ingestion folder %s failed: %s", folderId, exc, exc_info=True)
            outcome.errors.append(f"folder({folderId}): {exc}")

    return _finalizeResult(connectionId, outcome, startedAt)
async def _resolveDependencies(connectionId: str):
    """Build the Outlook adapter, connection record and knowledge service.

    Project imports are done lazily inside the function.

    Args:
        connectionId: Id of the user connection to resolve.

    Returns:
        Tuple ``(adapter, connection, knowledgeService)``.

    Raises:
        ValueError: If the connection does not exist or no usable access
            token is available for it.
    """
    from modules.interfaces.interfaceDbApp import getRootInterface
    from modules.auth import TokenManager
    from modules.connectors.providerMsft.connectorMsft import MsftConnector
    from modules.serviceCenter import getService
    from modules.serviceCenter.context import ServiceCenterContext
    from modules.security.rootAccess import getRootUser

    connection = getRootInterface().getUserConnectionById(connectionId)
    if connection is None:
        raise ValueError(f"UserConnection not found: {connectionId}")

    freshToken = TokenManager().getFreshToken(connectionId)
    if not (freshToken and freshToken.tokenAccess):
        raise ValueError(f"No valid token for connection {connectionId}")

    connector = MsftConnector(connection, freshToken.tokenAccess)
    adapter = connector.getServiceAdapter("outlook")

    # The knowledge service is obtained under the root user, scoped to the
    # connection's mandate.
    serviceContext = ServiceCenterContext(
        user=getRootUser(),
        mandate_id=str(getattr(connection, "mandateId", "") or ""),
    )
    knowledgeService = getService("knowledge", serviceContext)
    return adapter, connection, knowledgeService
async def _selectFolderIds(adapter, limits: OutlookBootstrapLimits) -> List[str]:
    """Prefer well-known folders (inbox, sentitems); fall back to browse().

    Each name in ``WELL_KNOWN_FOLDERS`` is looked up through
    ``adapter._graphGet``. If fewer than ``limits.maxFolders`` ids were found,
    the entries returned by ``adapter.browse("/")`` fill the remaining slots,
    using the ``id`` key of each entry's ``metadata``.

    Lookups are best-effort: a failing call is logged and skipped rather than
    raised, so the function may return an empty list.

    Args:
        adapter: Outlook adapter exposing ``_graphGet`` and ``browse``.
        limits: Supplies ``maxFolders``.

    Returns:
        At most ``limits.maxFolders`` distinct folder ids, well-known first.
    """
    folderIds: List[str] = []

    for wellKnown in WELL_KNOWN_FOLDERS:
        if len(folderIds) >= limits.maxFolders:
            break
        try:
            row = await adapter._graphGet(f"me/mailFolders/{wellKnown}")
        except Exception as exc:
            # Deliberately non-fatal, but no longer silent: without this log
            # an empty bootstrap cannot be diagnosed.
            logger.warning("outlook well-known folder %s lookup failed: %s", wellKnown, exc)
            continue
        if not isinstance(row, dict) or "error" in row:
            logger.debug("outlook well-known folder %s not usable: %r", wellKnown, row)
            continue
        fid = row.get("id")
        if fid and fid not in folderIds:
            folderIds.append(fid)

    if len(folderIds) >= limits.maxFolders:
        return folderIds

    try:
        entries = await adapter.browse("/")
    except Exception as exc:
        logger.warning("outlook folder browse failed: %s", exc)
        return folderIds

    # `or []` guards against an adapter that returns None instead of a list.
    for entry in entries or []:
        metadata = getattr(entry, "metadata", {}) or {}
        fid = metadata.get("id")
        if fid and fid not in folderIds:
            folderIds.append(fid)
        if len(folderIds) >= limits.maxFolders:
            break
    return folderIds
async def _ingestFolder(
    *,
    adapter,
    knowledgeService,
    connectionId: str,
    mandateId: str,
    userId: str,
    folderId: str,
    limits: OutlookBootstrapLimits,
    result: OutlookBootstrapResult,
    progressCb: Optional[Callable[[int, Optional[str]], None]],
) -> None:
    """Page through one mail folder and pass each message to ``_ingestMessage``.

    Messages are requested newest first, at most 100 per page, optionally
    restricted to the last ``limits.maxAgeDays`` days. Paging follows
    ``@odata.nextLink`` until the folder is exhausted or the number of
    indexed + duplicate-skipped messages reaches ``limits.maxMessages``.

    A Graph exception or an error payload is logged, appended to
    ``result.errors`` and ends processing of this folder without raising.
    """

    def _processedCount() -> int:
        # Only indexed and duplicate-skipped messages count toward the cap.
        return result.indexed + result.skippedDuplicate

    budget = limits.maxMessages - _processedCount()
    if budget <= 0:
        return

    pageSize = min(100, budget)
    fields = (
        "id,subject,from,toRecipients,ccRecipients,receivedDateTime,"
        "bodyPreview,body,internetMessageId,hasAttachments,changeKey"
    )
    nextEndpoint: Optional[str] = (
        f"me/mailFolders/{folderId}/messages"
        f"?$top={pageSize}&$orderby=receivedDateTime desc&$select={fields}"
    )

    # Let Graph apply the age filter so old messages are never transferred
    # only to be discarded on this side.
    if limits.maxAgeDays:
        from datetime import datetime, timezone, timedelta

        oldestAllowed = datetime.now(timezone.utc) - timedelta(days=limits.maxAgeDays)
        cutoffIso = oldestAllowed.strftime("%Y-%m-%dT%H:%M:%SZ")
        nextEndpoint = f"{nextEndpoint}&$filter=receivedDateTime ge {cutoffIso}"

    while nextEndpoint and _processedCount() < limits.maxMessages:
        try:
            page = await adapter._graphGet(nextEndpoint)
        except Exception as exc:
            logger.warning("outlook graph page failed for folder %s: %s", folderId, exc)
            result.errors.append(f"graph({folderId}): {exc}")
            return

        pageIsDict = isinstance(page, dict)
        if not pageIsDict or "error" in page:
            err = page.get("error") if pageIsDict else "unknown"
            logger.warning("outlook graph page error for folder %s: %s", folderId, err)
            result.errors.append(f"graph({folderId}): {err}")
            return

        for message in page.get("value", []) or []:
            if _processedCount() >= limits.maxMessages:
                break
            await _ingestMessage(
                adapter=adapter,
                knowledgeService=knowledgeService,
                connectionId=connectionId,
                mandateId=mandateId,
                userId=userId,
                message=message,
                limits=limits,
                result=result,
                progressCb=progressCb,
            )

        nextLink = page.get("@odata.nextLink")
        if not nextLink:
            break
        # Strip Graph base so adapter._graphGet accepts the relative path.
        from modules.connectors.providerMsft.connectorMsft import _stripGraphBase

        nextEndpoint = _stripGraphBase(nextLink)
async def _ingestMessage(
    *,
    adapter,
    knowledgeService,
    connectionId: str,
    mandateId: str,
    userId: str,
    message: Dict[str, Any],
    limits: OutlookBootstrapLimits,
    result: OutlookBootstrapResult,
    progressCb: Optional[Callable[[int, Optional[str]], None]],
) -> None:
    """Submit one Outlook message as an ``outlook_message`` ingestion job.

    Steps:
      1. Messages without an ``id`` are counted as ``skippedPolicy``.
      2. The message becomes an ``IngestionJob`` whose ``contentVersion`` is
         ``changeKey`` (falling back to ``internetMessageId``).
      3. The returned handle's ``status`` updates ``result``: "duplicate",
         "indexed", anything else counts as failed.
      4. With ``limits.includeAttachments`` and ``hasAttachments`` set, the
         attachments are ingested; a failure there is logged and recorded.
      5. Progress is reported to ``progressCb`` (and logged) each time the
         processed counter reaches a new multiple of 50.

    An exception from ``requestIngestion`` is logged, counted as failed,
    appended to ``result.errors`` and ends handling of this message.
    """
    from modules.serviceCenter.services.serviceKnowledge.mainServiceKnowledge import IngestionJob

    messageId = message.get("id")
    if not messageId:
        result.skippedPolicy += 1
        return

    revision = message.get("changeKey") or message.get("internetMessageId")
    # Treat a whitespace-only subject like a missing one; otherwise the file
    # name would collapse to just ".eml".
    subject = (message.get("subject") or "").strip() or "(no subject)"
    syntheticId = _syntheticMessageId(connectionId, messageId)
    fileName = f"{subject[:80].strip()}.eml"
    contentObjects = _buildContentObjects(message, limits.maxBodyChars)
    # Always at least the header is emitted, so `contentObjects` is non-empty.
    try:
        handle = await knowledgeService.requestIngestion(
            IngestionJob(
                sourceKind="outlook_message",
                sourceId=syntheticId,
                fileName=fileName,
                mimeType="message/rfc822",
                userId=userId,
                mandateId=mandateId,
                contentObjects=contentObjects,
                contentVersion=revision,
                provenance={
                    "connectionId": connectionId,
                    "authority": "msft",
                    "service": "outlook",
                    "externalItemId": messageId,
                    "internetMessageId": message.get("internetMessageId"),
                    "tier": "body",
                },
            )
        )
    except Exception as exc:
        logger.error("outlook ingestion %s failed: %s", messageId, exc, exc_info=True)
        result.failed += 1
        result.errors.append(f"ingest({messageId}): {exc}")
        return

    # True only when this message moved the processed counter forward.
    advanced = True
    if handle.status == "duplicate":
        result.skippedDuplicate += 1
    elif handle.status == "indexed":
        result.indexed += 1
    else:
        result.failed += 1
        advanced = False

    if limits.includeAttachments and message.get("hasAttachments"):
        try:
            await _ingestAttachments(
                adapter=adapter,
                knowledgeService=knowledgeService,
                connectionId=connectionId,
                mandateId=mandateId,
                userId=userId,
                messageId=messageId,
                parentSyntheticId=syntheticId,
                limits=limits,
                result=result,
            )
        except Exception as exc:
            logger.warning("outlook attachments %s failed: %s", messageId, exc)
            result.errors.append(f"attachments({messageId}): {exc}")

    processed = result.indexed + result.skippedDuplicate
    # Report only when this message advanced the counter onto a multiple of
    # 50. This avoids a report at processed == 0 and repeated reports while
    # failing messages leave the counter unchanged.
    if progressCb is not None and advanced and processed % 50 == 0:
        try:
            progressCb(
                min(90, 10 + int(80 * processed / max(1, limits.maxMessages))),
                f"outlook processed={processed}",
            )
        except Exception:
            # A broken progress sink must not abort ingestion, but leave a
            # trace so it can be diagnosed.
            logger.debug("outlook progress callback failed", exc_info=True)
        logger.info(
            "ingestion.connection.bootstrap.progress part=outlook processed=%d skippedDup=%d failed=%d",
            processed, result.skippedDuplicate, result.failed,
            extra={
                "event": "ingestion.connection.bootstrap.progress",
                "part": "outlook",
                "connectionId": connectionId,
                "processed": processed,
                "skippedDup": result.skippedDuplicate,
                "failed": result.failed,
            },
        )
    # Yield to the event loop between messages.
    await asyncio.sleep(0)
async def _ingestAttachments(
    *,
    adapter,
    knowledgeService,
    connectionId: str,
    mandateId: str,
    userId: str,
    messageId: str,
    parentSyntheticId: str,
    limits: OutlookBootstrapLimits,
    result: OutlookBootstrapResult,
) -> None:
    """Child ingestion jobs for file attachments (skip inline & oversized).

    Fetches ``me/messages/{messageId}/attachments`` once (no paging) and, for
    every non-inline ``#microsoft.graph.fileAttachment`` within
    ``limits.maxAttachmentBytes``, decodes ``contentBytes``, runs the
    extraction pipeline and submits the extracted parts as an
    ``outlook_attachment`` job linked to the parent through
    ``provenance.parentId``.

    Counters:
      * ``skippedPolicy``: oversized, undecodable or content-less attachments.
      * ``failed``: extraction or ingestion raised, or the handle reported a
        status other than "indexed"/"duplicate".
      * ``attachmentsIndexed``: the handle reported "indexed" (a handle
        without a ``status`` attribute is counted the same way).
      * Duplicates are not counted: ``skippedDuplicate`` is part of the
        message cap and must not be consumed by attachments.

    A Graph error payload is logged and appended to ``result.errors``.
    Exceptions from the Graph call itself propagate to the caller.

    Note:
        ``runExtraction`` is called without ``await``, i.e. synchronously
        inside this coroutine.
    """
    from modules.serviceCenter.services.serviceKnowledge.mainServiceKnowledge import IngestionJob
    from modules.datamodels.datamodelExtraction import ExtractionOptions
    from modules.serviceCenter.services.serviceExtraction.subPipeline import runExtraction
    from modules.serviceCenter.services.serviceExtraction.subRegistry import (
        ExtractorRegistry, ChunkerRegistry,
    )
    import base64

    page = await adapter._graphGet(f"me/messages/{messageId}/attachments")
    if not isinstance(page, dict) or "error" in page:
        # Record the error instead of returning silently, matching how
        # folder page errors are handled.
        err = page.get("error") if isinstance(page, dict) else "unknown"
        logger.warning("outlook attachments graph error for message %s: %s", messageId, err)
        result.errors.append(f"attachments({messageId}): {err}")
        return

    extractorRegistry = ExtractorRegistry()
    chunkerRegistry = ChunkerRegistry()

    for attachment in page.get("value", []) or []:
        if attachment.get("@odata.type") != "#microsoft.graph.fileAttachment":
            continue
        if attachment.get("isInline"):
            continue
        size = int(attachment.get("size") or 0)
        if size and size > limits.maxAttachmentBytes:
            result.skippedPolicy += 1
            continue
        contentBytesB64 = attachment.get("contentBytes")
        if not contentBytesB64:
            continue
        try:
            rawBytes = base64.b64decode(contentBytesB64)
        except Exception:
            result.skippedPolicy += 1
            continue

        fileName = attachment.get("name") or "attachment"
        mimeType = attachment.get("contentType") or "application/octet-stream"
        # Fall back to the file name so the synthetic id stays deterministic.
        attachmentId = attachment.get("id") or fileName
        syntheticId = _syntheticAttachmentId(connectionId, messageId, attachmentId)

        try:
            extracted = runExtraction(
                extractorRegistry, chunkerRegistry,
                rawBytes, fileName, mimeType,
                ExtractionOptions(mergeStrategy=None),
            )
        except Exception as exc:
            logger.warning("outlook attachment extract %s failed: %s", attachmentId, exc)
            result.failed += 1
            continue

        contentObjects: List[Dict[str, Any]] = []
        for part in getattr(extracted, "parts", None) or []:
            data = getattr(part, "data", None) or ""
            if not data or not str(data).strip():
                continue
            typeGroup = getattr(part, "typeGroup", "text") or "text"
            contentType = "text"
            if typeGroup == "image":
                contentType = "image"
            elif typeGroup in ("binary", "container"):
                contentType = "other"
            contentObjects.append({
                "contentObjectId": getattr(part, "id", ""),
                "contentType": contentType,
                "data": data,
                "contextRef": {
                    "containerPath": fileName,
                    "location": getattr(part, "label", None) or "attachment",
                    # Part metadata is merged last and may override the keys above.
                    **(getattr(part, "metadata", None) or {}),
                },
            })
        if not contentObjects:
            result.skippedPolicy += 1
            continue

        try:
            handle = await knowledgeService.requestIngestion(
                IngestionJob(
                    sourceKind="outlook_attachment",
                    sourceId=syntheticId,
                    fileName=fileName,
                    mimeType=mimeType,
                    userId=userId,
                    mandateId=mandateId,
                    contentObjects=contentObjects,
                    provenance={
                        "connectionId": connectionId,
                        "authority": "msft",
                        "service": "outlook",
                        "parentId": parentSyntheticId,
                        "externalItemId": attachmentId,
                        "parentMessageId": messageId,
                    },
                )
            )
        except Exception as exc:
            logger.warning("outlook attachment ingest %s failed: %s", attachmentId, exc)
            result.failed += 1
            continue

        # Follow the handle status as the message path does, instead of
        # counting every submitted job as indexed.
        status = getattr(handle, "status", None)
        if status == "duplicate":
            continue
        if status is None or status == "indexed":
            result.attachmentsIndexed += 1
        else:
            result.failed += 1
def _finalizeResult(connectionId: str, result: OutlookBootstrapResult, startMs: float) -> Dict[str, Any]:
    """Log the "bootstrap done" event and return the run summary.

    Args:
        connectionId: Connection id used in the log record.
        result: Accumulated counters and errors of the run.
        startMs: Start timestamp as returned by ``time.time()``. Despite the
            name it is in seconds; it is converted to milliseconds here.

    Returns:
        Dict with ``connectionId``, the counters, ``durationMs`` and at most
        the first 20 entries of ``result.errors``.
    """
    durationMs = int((time.time() - startMs) * 1000)
    summary: Dict[str, Any] = {
        "connectionId": result.connectionId,
        "indexed": result.indexed,
        "skippedDuplicate": result.skippedDuplicate,
        "skippedPolicy": result.skippedPolicy,
        "attachmentsIndexed": result.attachmentsIndexed,
        "failed": result.failed,
        "durationMs": durationMs,
        "errors": result.errors[:20],
    }
    # The structured log payload uses the shorter "skippedDup" key and omits
    # the error list.
    logExtra = {
        "event": "ingestion.connection.bootstrap.done",
        "part": "outlook",
        "connectionId": connectionId,
        "indexed": summary["indexed"],
        "skippedDup": summary["skippedDuplicate"],
        "skippedPolicy": summary["skippedPolicy"],
        "attachmentsIndexed": summary["attachmentsIndexed"],
        "failed": summary["failed"],
        "durationMs": durationMs,
    }
    logger.info(
        "ingestion.connection.bootstrap.done part=outlook connectionId=%s indexed=%d skippedDup=%d skippedPolicy=%d attachments=%d failed=%d durationMs=%d",
        connectionId,
        summary["indexed"], summary["skippedDuplicate"], summary["skippedPolicy"],
        summary["attachmentsIndexed"], summary["failed"], durationMs,
        extra=logExtra,
    )
    return summary