- connection.established/revoked callbacks from OAuth routes and connection management endpoints - KnowledgeIngestionConsumer dispatches bootstrap job (established) and synchronous purge (revoked) - FileContentIndex: add connectionId + sourceKind columns - SharePoint bootstrap with @odata.nextLink pagination and eTag-based idempotency - Outlook bootstrap treats messages as virtual documents with cleanEmailBody for HTML/quote/signature stripping - fix(rag): lower buildAgentContext minScore thresholds from 0.55/0.65/0.70 to 0.35 — previous values blocked all real matches from text-embedding-3-small - 24 new unit tests covering purge, consumer dispatch, email cleaning and both bootstrap paths
190 lines
6.5 KiB
Python
190 lines
6.5 KiB
Python
#!/usr/bin/env python3
|
|
# Copyright (c) 2025 Patrick Motsch
|
|
# All rights reserved.
|
|
"""Bootstrap Outlook tests with a fake adapter + knowledge service.
|
|
|
|
Verifies:
- Well-known folders (inbox, sentitems) are discovered via Graph.
- Each message produces a `requestIngestion` call with sourceKind=outlook_message
  and structured contentObjects (header / snippet / body).
- Pagination via `@odata.nextLink` is followed.
- changeKey is forwarded as contentVersion → idempotency.
|
|
"""
|
|
|
|
import asyncio
|
|
import os
|
|
import sys
|
|
from types import SimpleNamespace
|
|
|
|
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "../../.."))
|
|
|
|
from modules.serviceCenter.services.serviceKnowledge.subConnectorSyncOutlook import (
|
|
bootstrapOutlook,
|
|
OutlookBootstrapLimits,
|
|
_syntheticMessageId,
|
|
_buildContentObjects,
|
|
)
|
|
|
|
|
|
class _FakeOutlookAdapter:
    """In-memory stand-in for the Outlook adapter used by the bootstrap tests.

    Answers two kinds of Graph-style GET requests:

    * ``me/mailFolders/<wellKnownName>`` -> folder metadata for the two
      known folders (``inbox`` and ``sentitems``), or an error dict.
    * any endpoint containing ``me/mailFolders/<folderId>/messages`` -> a
      page of messages for that folder.

    Args:
        messages_by_folder: Mapping of folder id -> list of message dicts
            returned as the (first) page for that folder.
        paginated_folder: Optional folder id whose first page carries an
            ``@odata.nextLink``; the follow-up request (recognised by
            ``skiptoken`` in the endpoint) returns ``page2``.
        page2: Messages served as the second page of ``paginated_folder``.
    """

    def __init__(self, messages_by_folder, paginated_folder=None, page2=None):
        # Well-known folder name -> folder id handed back by the fake.
        self._folders = {"inbox": "INBOX-ID", "sentitems": "SENT-ID"}
        self._messages = messages_by_folder
        self._paginated_folder = paginated_folder
        self._page2 = page2 or []
        # Every endpoint passed to _graphGet, in call order, for inspection.
        self.requested_endpoints = []

    async def _graphGet(self, endpoint: str):
        """Return a canned response for ``endpoint`` and record the request."""
        self.requested_endpoints.append(endpoint)

        # Folder lookup: me/mailFolders/<wellKnownName> (no "/messages" part).
        if endpoint.startswith("me/mailFolders/") and "/messages" not in endpoint:
            wellKnown = endpoint.split("/")[-1]
            fid = self._folders.get(wellKnown)
            if not fid:
                return {"error": "not found"}
            return {"id": fid, "displayName": wellKnown}

        # Message page request, e.g. me/mailFolders/INBOX-ID/messages?...
        # Substring matching is used so that an absolute nextLink URL is
        # recognised as well as a relative endpoint.
        for fid, messages in self._messages.items():
            if f"me/mailFolders/{fid}/messages" in endpoint:
                page = {"value": messages}
                if fid == self._paginated_folder and "skiptoken" not in endpoint:
                    # First page of the paginated folder: advertise a next page.
                    page["@odata.nextLink"] = (
                        "https://graph.microsoft.com/v1.0/"
                        f"me/mailFolders/{fid}/messages?$skiptoken=abc"
                    )
                elif fid == self._paginated_folder and "skiptoken" in endpoint:
                    # Follow-up request: serve the second page, no nextLink.
                    page = {"value": self._page2}
                return page

        # Unknown folder id: empty page.
        return {"value": []}

    async def browse(self, path):
        """Return an empty listing regardless of ``path``."""
        return []
|
|
|
|
|
|
class _FakeKnowledgeService:
    """Knowledge-service double that records every ingestion job it receives.

    Args:
        duplicateIds: Optional collection of ``sourceId`` values that should
            be answered with status ``"duplicate"``; every other job is
            answered with status ``"indexed"``.
    """

    def __init__(self, duplicateIds=None):
        # Jobs passed to requestIngestion, in call order.
        self.calls = []
        self._duplicates = duplicateIds or set()

    async def requestIngestion(self, job):
        """Record ``job`` and return a result object keyed by its ``sourceId``."""
        self.calls.append(job)
        status = "duplicate" if job.sourceId in self._duplicates else "indexed"
        return SimpleNamespace(
            jobId=job.sourceId, status=status, contentHash="h",
            fileId=job.sourceId, index=None, error=None,
        )
|
|
|
|
|
|
def _msg(mid: str, subject: str = "Hi", change: str = "ck1"):
    """Build a minimal Graph-style message dict for the tests.

    Args:
        mid: Value used for ``id`` and embedded in ``internetMessageId``.
        subject: Value of the ``subject`` field.
        change: Value of the ``changeKey`` field.

    Returns:
        A new dict with fixed sender (Alice), one recipient (Bob), no CCs,
        a plain-text body and no attachments.
    """
    return {
        "id": mid,
        "subject": subject,
        "from": {"emailAddress": {"name": "Alice", "address": "a@x.com"}},
        "toRecipients": [{"emailAddress": {"name": "Bob", "address": "b@x.com"}}],
        "ccRecipients": [],
        "receivedDateTime": "2025-01-01T10:00:00Z",
        "bodyPreview": "Hello world",
        "body": {"contentType": "text", "content": "Hello world\nThis is the body."},
        "internetMessageId": f"<{mid}@local>",
        "hasAttachments": False,
        "changeKey": change,
    }
|
|
|
|
|
|
def test_buildContentObjects_emits_header_snippet_body():
    """A message yields header, snippet and body parts, in that order."""
    parts = _buildContentObjects(_msg("m1"), maxBodyChars=8000)
    ids = [p["contentObjectId"] for p in parts]
    assert ids == ["header", "snippet", "body"]

    # The header part carries subject, sender and recipient lines.
    header = parts[0]["data"]
    assert "Subject: Hi" in header
    assert "From: Alice <a@x.com>" in header
    assert "To: Bob <b@x.com>" in header
|
|
|
|
|
|
def test_bootstrap_outlook_indexes_messages_from_inbox_and_sent():
    """Messages from both well-known folders are each submitted for ingestion."""
    adapter = _FakeOutlookAdapter({
        "INBOX-ID": [_msg("m1"), _msg("m2")],
        "SENT-ID": [_msg("m3")],
    })
    knowledge = _FakeKnowledgeService()
    connection = SimpleNamespace(mandateId="m1", userId="u1")

    async def _run():
        return await bootstrapOutlook(
            connectionId="c1",
            adapter=adapter,
            connection=connection,
            knowledgeService=knowledge,
            limits=OutlookBootstrapLimits(maxAgeDays=None),
        )

    result = asyncio.run(_run())
    assert result["indexed"] == 3

    # One job per message, identified by the synthetic per-connection id.
    sourceIds = {c.sourceId for c in knowledge.calls}
    assert sourceIds == {
        _syntheticMessageId("c1", "m1"),
        _syntheticMessageId("c1", "m2"),
        _syntheticMessageId("c1", "m3"),
    }

    for job in knowledge.calls:
        assert job.sourceKind == "outlook_message"
        assert job.mimeType == "message/rfc822"
        assert job.provenance["connectionId"] == "c1"
        assert job.provenance["service"] == "outlook"
        # _msg() sets changeKey="ck1"; it must arrive as contentVersion.
        assert job.contentVersion == "ck1"
        assert any(co["contentObjectId"] == "header" for co in job.contentObjects)
|
|
|
|
|
|
def test_bootstrap_outlook_follows_pagination():
    """The inbox's ``@odata.nextLink`` is followed and page-two messages are ingested."""
    adapter = _FakeOutlookAdapter(
        messages_by_folder={"INBOX-ID": [_msg("m1")], "SENT-ID": []},
        paginated_folder="INBOX-ID",
        page2=[_msg("m2"), _msg("m3")],
    )
    knowledge = _FakeKnowledgeService()
    connection = SimpleNamespace(mandateId="m1", userId="u1")

    async def _run():
        return await bootstrapOutlook(
            connectionId="c1",
            adapter=adapter,
            connection=connection,
            knowledgeService=knowledge,
            limits=OutlookBootstrapLimits(maxAgeDays=None),
        )

    result = asyncio.run(_run())
    assert result["indexed"] == 3

    # The fake only serves page two for an endpoint containing "skiptoken",
    # so make the follow-up request explicit rather than implied by the count.
    assert any("skiptoken" in ep for ep in adapter.requested_endpoints)

    # m1 comes from page one, m2/m3 from page two.
    assert {c.sourceId for c in knowledge.calls} == {
        _syntheticMessageId("c1", "m1"),
        _syntheticMessageId("c1", "m2"),
        _syntheticMessageId("c1", "m3"),
    }
|
|
|
|
|
|
def test_bootstrap_outlook_reports_duplicates():
    """Jobs answered with status "duplicate" count as skipped, not indexed."""
    adapter = _FakeOutlookAdapter({
        "INBOX-ID": [_msg("m1"), _msg("m2")],
        "SENT-ID": [],
    })
    # Both inbox messages are pre-registered as duplicates in the fake service.
    duplicates = {
        _syntheticMessageId("c1", "m1"),
        _syntheticMessageId("c1", "m2"),
    }
    knowledge = _FakeKnowledgeService(duplicateIds=duplicates)
    connection = SimpleNamespace(mandateId="m1", userId="u1")

    async def _run():
        return await bootstrapOutlook(
            connectionId="c1",
            adapter=adapter,
            connection=connection,
            knowledgeService=knowledge,
            limits=OutlookBootstrapLimits(maxAgeDays=None),
        )

    result = asyncio.run(_run())
    assert result["indexed"] == 0
    assert result["skippedDuplicate"] == 2
|
|
|
|
|
|
if __name__ == "__main__":
    # Allow running this file directly, without pytest: execute each test in
    # order; an AssertionError aborts before the success line is printed.
    test_buildContentObjects_emits_header_snippet_body()
    test_bootstrap_outlook_indexes_messages_from_inbox_and_sent()
    test_bootstrap_outlook_follows_pagination()
    test_bootstrap_outlook_reports_duplicates()
    print("OK — bootstrapOutlook tests passed")
|