gateway/tests/unit/services/test_bootstrap_outlook.py
Ida 6a5ff1ff7c feat(rag): P1 user-connection hooks + retrieval threshold fix
- connection.established/revoked callbacks from OAuth routes and
  connection management endpoints
- KnowledgeIngestionConsumer dispatches bootstrap job (established)
  and synchronous purge (revoked)
- FileContentIndex: add connectionId + sourceKind columns
- SharePoint bootstrap with @odata.nextLink pagination and eTag-based
  idempotency
- Outlook bootstrap treats messages as virtual documents with
  cleanEmailBody for HTML/quote/signature stripping
- fix(rag): lower buildAgentContext minScore thresholds from
  0.55/0.65/0.70 to 0.35 — previous values blocked all real matches
  from text-embedding-3-small
- 24 new unit tests covering purge, consumer dispatch, email cleaning
  and both bootstrap paths
2026-04-29 14:39:40 +02:00

190 lines
6.5 KiB
Python

#!/usr/bin/env python3
# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
"""Bootstrap Outlook tests with a fake adapter + knowledge service.
Verifies:
- Well-known folders (inbox, sentitems) are discovered via Graph.
- Each message produces a `requestIngestion` call with sourceKind=outlook_message
and structured contentObjects (header / snippet / body).
- Pagination via `@odata.nextLink` is followed.
- changeKey is forwarded as contentVersion → idempotency.
"""
import asyncio
import os
import sys
from types import SimpleNamespace
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "../../.."))
from modules.serviceCenter.services.serviceKnowledge.subConnectorSyncOutlook import (
bootstrapOutlook,
OutlookBootstrapLimits,
_syntheticMessageId,
_buildContentObjects,
)
class _FakeOutlookAdapter:
    """In-memory stand-in for the Outlook adapter handed to ``bootstrapOutlook``.

    Serves canned responses for two shapes of Graph request:

    * folder lookup ``me/mailFolders/<name>`` -- ``inbox`` and ``sentitems``
      resolve to the fixed ids ``INBOX-ID`` / ``SENT-ID``; any other name
      yields ``{"error": "not found"}``;
    * message listing -- any endpoint containing
      ``me/mailFolders/<folderId>/messages`` returns the messages configured
      for that folder id.

    Every endpoint passed to ``_graphGet`` is appended to
    ``requested_endpoints`` so tests can inspect the traffic.

    Args:
        messages_by_folder: Mapping of folder id to the list of message dicts
            returned as that folder's (first) page.
        paginated_folder: Optional folder id whose first page carries an
            ``@odata.nextLink``; requests for that folder whose endpoint
            contains ``skiptoken`` receive ``page2`` instead.
        page2: Messages of the second page; defaults to an empty list.
    """

    def __init__(self, messages_by_folder, paginated_folder=None, page2=None):
        self._folders = {"inbox": "INBOX-ID", "sentitems": "SENT-ID"}
        self._messages = messages_by_folder
        self._paginated_folder = paginated_folder
        self._page2 = page2 or []
        self.requested_endpoints = []

    async def _graphGet(self, endpoint: str):
        """Return the canned Graph payload for ``endpoint`` and record the call.

        Args:
            endpoint: Relative Graph path or a full ``@odata.nextLink`` URL.

        Returns:
            A folder dict (``id`` / ``displayName``), an error dict for an
            unknown folder name, or a page dict with a ``value`` list and,
            for the first page of the paginated folder, ``@odata.nextLink``.
        """
        self.requested_endpoints.append(endpoint)

        if endpoint.startswith("me/mailFolders/") and "/messages" not in endpoint:
            # Drop OData query options (e.g. "?$select=id") before reading the
            # folder name, otherwise "inbox?$select=id" would not resolve.
            folderPath = endpoint.split("?", 1)[0]
            wellKnown = folderPath.split("/")[-1]
            folderId = self._folders.get(wellKnown)
            if not folderId:
                return {"error": "not found"}
            return {"id": folderId, "displayName": wellKnown}

        # Message page request, e.g. me/mailFolders/INBOX-ID/messages?...
        for folderId, messages in self._messages.items():
            if f"me/mailFolders/{folderId}/messages" not in endpoint:
                continue
            if folderId != self._paginated_folder:
                return {"value": messages}
            if "skiptoken" in endpoint:
                # Follow-up request for the paginated folder: serve page two.
                return {"value": self._page2}
            # First page of the paginated folder: advertise the next page.
            return {
                "value": messages,
                "@odata.nextLink": (
                    "https://graph.microsoft.com/v1.0/"
                    f"me/mailFolders/{folderId}/messages?$skiptoken=abc"
                ),
            }
        return {"value": []}

    async def browse(self, path):
        """Stub that ignores ``path`` and always returns an empty list."""
        return []
class _FakeKnowledgeService:
    """Knowledge-service double that records ingestion jobs.

    Each job passed to ``requestIngestion`` is stored in ``calls``. The
    returned result reports ``status="duplicate"`` when the job's ``sourceId``
    is in ``duplicateIds`` and ``status="indexed"`` otherwise.
    """

    def __init__(self, duplicateIds=None):
        self._duplicates = duplicateIds or set()
        self.calls = []

    async def requestIngestion(self, job):
        """Record ``job`` and return a canned result keyed on its ``sourceId``."""
        self.calls.append(job)
        if job.sourceId in self._duplicates:
            outcome = "duplicate"
        else:
            outcome = "indexed"
        return SimpleNamespace(
            jobId=job.sourceId,
            status=outcome,
            contentHash="h",
            fileId=job.sourceId,
            index=None,
            error=None,
        )
def _msg(mid: str, subject: str = "Hi", change: str = "ck1"):
    """Build a Graph-style message dict for the tests.

    Args:
        mid: Value for ``id``; also embedded in ``internetMessageId``.
        subject: Value for ``subject``.
        change: Value for ``changeKey``.

    Returns:
        A message dict with a fixed sender (Alice), one recipient (Bob), no
        CC recipients, a plain-text body and no attachments.
    """
    sender = {"emailAddress": {"name": "Alice", "address": "a@x.com"}}
    recipient = {"emailAddress": {"name": "Bob", "address": "b@x.com"}}
    textBody = {"contentType": "text", "content": "Hello world\nThis is the body."}

    message = {"id": mid, "subject": subject}
    message["from"] = sender
    message["toRecipients"] = [recipient]
    message["ccRecipients"] = []
    message["receivedDateTime"] = "2025-01-01T10:00:00Z"
    message["bodyPreview"] = "Hello world"
    message["body"] = textBody
    message["internetMessageId"] = f"<{mid}@local>"
    message["hasAttachments"] = False
    message["changeKey"] = change
    return message
def test_buildContentObjects_emits_header_snippet_body():
    """A message yields header, snippet and body objects, in that order.

    The header object's ``data`` must contain the subject, sender and
    recipient lines.
    """
    contentObjects = _buildContentObjects(_msg("m1"), maxBodyChars=8000)

    objectIds = [co["contentObjectId"] for co in contentObjects]
    assert objectIds == ["header", "snippet", "body"]

    headerText = contentObjects[0]["data"]
    for expectedLine in (
        "Subject: Hi",
        "From: Alice <a@x.com>",
        "To: Bob <b@x.com>",
    ):
        assert expectedLine in headerText
def test_bootstrap_outlook_indexes_messages_from_inbox_and_sent():
    """Messages from both the inbox and sent folders become ingestion jobs.

    Checks the reported ``indexed`` count, the set of synthetic source ids,
    and the per-job metadata (source kind, MIME type, provenance, content
    version taken from the message's ``changeKey``, and a ``header`` object).
    """
    fakeAdapter = _FakeOutlookAdapter({
        "INBOX-ID": [_msg("m1"), _msg("m2")],
        "SENT-ID": [_msg("m3")],
    })
    fakeKnowledge = _FakeKnowledgeService()
    fakeConnection = SimpleNamespace(mandateId="m1", userId="u1")

    async def _bootstrap():
        summary = await bootstrapOutlook(
            connectionId="c1",
            adapter=fakeAdapter,
            connection=fakeConnection,
            knowledgeService=fakeKnowledge,
            limits=OutlookBootstrapLimits(maxAgeDays=None),
        )
        return summary

    outcome = asyncio.run(_bootstrap())
    assert outcome["indexed"] == 3

    seenIds = {job.sourceId for job in fakeKnowledge.calls}
    assert seenIds == {
        _syntheticMessageId("c1", "m1"),
        _syntheticMessageId("c1", "m2"),
        _syntheticMessageId("c1", "m3"),
    }

    for ingestionJob in fakeKnowledge.calls:
        assert ingestionJob.sourceKind == "outlook_message"
        assert ingestionJob.mimeType == "message/rfc822"
        assert ingestionJob.provenance["connectionId"] == "c1"
        assert ingestionJob.provenance["service"] == "outlook"
        assert ingestionJob.contentVersion == "ck1"
        assert any(
            part["contentObjectId"] == "header"
            for part in ingestionJob.contentObjects
        )
def test_bootstrap_outlook_follows_pagination():
    """All three messages are indexed when the inbox spans two pages.

    The fake serves one message on the first inbox page together with an
    ``@odata.nextLink`` and two more on the follow-up page, so a count of
    three is only reachable if the link is followed.
    """
    firstPage = [_msg("m1")]
    secondPage = [_msg("m2"), _msg("m3")]
    fakeAdapter = _FakeOutlookAdapter(
        messages_by_folder={"INBOX-ID": firstPage, "SENT-ID": []},
        paginated_folder="INBOX-ID",
        page2=secondPage,
    )
    fakeKnowledge = _FakeKnowledgeService()
    fakeConnection = SimpleNamespace(mandateId="m1", userId="u1")

    async def _bootstrap():
        summary = await bootstrapOutlook(
            connectionId="c1",
            adapter=fakeAdapter,
            connection=fakeConnection,
            knowledgeService=fakeKnowledge,
            limits=OutlookBootstrapLimits(maxAgeDays=None),
        )
        return summary

    outcome = asyncio.run(_bootstrap())
    assert outcome["indexed"] == 3
def test_bootstrap_outlook_reports_duplicates():
    """Jobs answered with status ``duplicate`` count as skipped, not indexed."""
    fakeAdapter = _FakeOutlookAdapter({
        "INBOX-ID": [_msg("m1"), _msg("m2")],
        "SENT-ID": [],
    })
    # Both inbox messages are pre-registered as already known to the service.
    knownIds = {
        _syntheticMessageId("c1", "m1"),
        _syntheticMessageId("c1", "m2"),
    }
    fakeKnowledge = _FakeKnowledgeService(duplicateIds=knownIds)
    fakeConnection = SimpleNamespace(mandateId="m1", userId="u1")

    async def _bootstrap():
        summary = await bootstrapOutlook(
            connectionId="c1",
            adapter=fakeAdapter,
            connection=fakeConnection,
            knowledgeService=fakeKnowledge,
            limits=OutlookBootstrapLimits(maxAgeDays=None),
        )
        return summary

    outcome = asyncio.run(_bootstrap())
    assert outcome["indexed"] == 0
    assert outcome["skippedDuplicate"] == 2
if __name__ == "__main__":
    # Direct execution: run every test in declaration order, then report.
    for _testCase in (
        test_buildContentObjects_emits_header_snippet_body,
        test_bootstrap_outlook_indexes_messages_from_inbox_and_sent,
        test_bootstrap_outlook_follows_pagination,
        test_bootstrap_outlook_reports_duplicates,
    ):
        _testCase()
    print("OK — bootstrapOutlook tests passed")