gateway/tests/unit/services/test_bootstrap_gmail.py
2026-04-29 14:39:40 +02:00

240 lines
8.2 KiB
Python

#!/usr/bin/env python3
# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
"""Bootstrap Gmail tests with a fake googleGet + knowledge service.

Verifies:
- Default labels (INBOX + SENT) are traversed.
- Each message produces a requestIngestion call with sourceKind=gmail_message
  and structured contentObjects (header / snippet / body).
- Pagination via `nextPageToken` is followed.
- historyId is forwarded as contentVersion → idempotency.
- MIME body extraction walks nested parts (multipart/alternative).
"""
import asyncio
import base64
import os
import sys
from types import SimpleNamespace
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "../../.."))
from modules.serviceCenter.services.serviceKnowledge.subConnectorSyncGmail import (
bootstrapGmail,
GmailBootstrapLimits,
_syntheticMessageId,
_buildContentObjects,
_walkPayloadForBody,
)
def _b64url(text: str) -> str:
return base64.urlsafe_b64encode(text.encode("utf-8")).decode("ascii").rstrip("=")
def _msg(mid: str, subject: str = "Hi", body: str = "Hello world", historyId: str = "h1"):
    """Build a single-part ``text/plain`` message dict used as a test fixture.

    Args:
        mid: message id; also used to derive ``threadId`` (``thread-<mid>``).
        subject: value of the ``Subject`` header.
        body: plain-text body; stored base64url-encoded in ``payload.body.data``
            and truncated to 120 characters for ``snippet``.
        historyId: value stored under ``historyId``.

    Returns:
        A dict with ``id``, ``threadId``, ``historyId``, ``internalDate``,
        ``snippet`` and a ``payload`` carrying headers, mime type and body.
    """
    # The Gmail API documents ``body.size`` as a byte count, so measure the
    # UTF-8 encoded body instead of counting characters (these differ for
    # non-ASCII text).
    bodySizeBytes = len(body.encode("utf-8"))
    headers = [
        {"name": "Subject", "value": subject},
        {"name": "From", "value": "Alice <a@x.com>"},
        {"name": "To", "value": "Bob <b@x.com>"},
        # NOTE(review): 2025-01-01 was a Wednesday, not a Tuesday; the value
        # is kept as-is because no test here depends on the weekday.
        {"name": "Date", "value": "Tue, 01 Jan 2025 10:00:00 +0000"},
    ]
    return {
        "id": mid,
        "threadId": f"thread-{mid}",
        "historyId": historyId,
        "internalDate": "1700000000000",
        "snippet": body[:120],
        "payload": {
            "headers": headers,
            "mimeType": "text/plain",
            "body": {"data": _b64url(body), "size": bodySizeBytes},
            "parts": [],
        },
    }
class _FakeGoogleGet:
"""Records URLs + returns the wired-up page or message response."""
def __init__(self, messages_by_label, paginated_label=None, page2=None):
self._messages = messages_by_label
self._paginated = paginated_label
self._page2 = page2 or []
self._served_first_page = set()
self.requested = []
async def __call__(self, url: str):
self.requested.append(url)
# List page: contains `/users/me/messages?labelIds=...`
if "/users/me/messages?" in url:
for label, msgs in self._messages.items():
if f"labelIds={label}" in url:
if (
label == self._paginated
and label not in self._served_first_page
):
self._served_first_page.add(label)
return {
"messages": [{"id": m["id"]} for m in msgs],
"nextPageToken": "token-2",
}
if label == self._paginated and "pageToken=token-2" in url:
return {
"messages": [{"id": m["id"]} for m in self._page2],
}
return {"messages": [{"id": m["id"]} for m in msgs]}
return {"messages": []}
# Detail fetch: /users/me/messages/{id}?format=full
if "/users/me/messages/" in url and "format=full" in url:
msgId = url.split("/users/me/messages/")[-1].split("?")[0]
for msgs in self._messages.values():
for m in msgs:
if m["id"] == msgId:
return m
for m in self._page2:
if m["id"] == msgId:
return m
return {"error": "not found"}
class _FakeKnowledgeService:
def __init__(self, duplicateIds=None):
self.calls = []
self._duplicates = duplicateIds or set()
async def requestIngestion(self, job):
self.calls.append(job)
status = "duplicate" if job.sourceId in self._duplicates else "indexed"
return SimpleNamespace(
jobId=job.sourceId, status=status, contentHash="h",
fileId=job.sourceId, index=None, error=None,
)
def test_buildContentObjects_emits_header_snippet_body():
    """A plain-text message yields header, snippet and body objects in that order."""
    message = _msg("m1", body="Hello\nWorld")
    contentObjects = _buildContentObjects(message, maxBodyChars=8000)

    objectIds = [obj["contentObjectId"] for obj in contentObjects]
    assert objectIds == ["header", "snippet", "body"]

    # The header object must carry the Subject / From / To lines.
    headerText = contentObjects[0]["data"]
    assert "Subject: Hi" in headerText
    assert "From: Alice <a@x.com>" in headerText
    assert "To: Bob <b@x.com>" in headerText
def test_walkPayloadForBody_prefers_plain_over_html():
    """With both alternatives present, text and html are each extracted."""
    plainPart = {"mimeType": "text/plain", "body": {"data": _b64url("plain body")}}
    htmlPart = {"mimeType": "text/html", "body": {"data": _b64url("<p>html body</p>")}}
    multipart = {
        "mimeType": "multipart/alternative",
        "parts": [plainPart, htmlPart],
    }

    extracted = _walkPayloadForBody(multipart)

    assert extracted["text"] == "plain body"
    assert extracted["html"] == "<p>html body</p>"
def test_walkPayloadForBody_falls_back_to_html():
    """An HTML-only multipart leaves ``text`` empty and fills ``html``."""
    htmlOnlyPart = {"mimeType": "text/html", "body": {"data": _b64url("<p>only html</p>")}}
    multipart = {"mimeType": "multipart/alternative", "parts": [htmlOnlyPart]}

    extracted = _walkPayloadForBody(multipart)

    assert extracted["text"] == ""
    assert "only html" in extracted["html"]
def test_bootstrap_gmail_indexes_messages_from_inbox_and_sent():
    """Messages from INBOX and SENT are each submitted as one gmail_message job."""
    googleGet = _FakeGoogleGet({
        "INBOX": [_msg("m1"), _msg("m2")],
        "SENT": [_msg("m3")],
    })
    knowledgeService = _FakeKnowledgeService()
    userConnection = SimpleNamespace(mandateId="m1", userId="u1")

    async def _bootstrap():
        return await bootstrapGmail(
            connectionId="c1",
            adapter=SimpleNamespace(_token="t"),
            connection=userConnection,
            knowledgeService=knowledgeService,
            limits=GmailBootstrapLimits(maxAgeDays=None),
            googleGetFn=googleGet,
        )

    summary = asyncio.run(_bootstrap())
    assert summary["indexed"] == 3

    # One ingestion job per Gmail message, keyed by the synthetic id.
    expectedSourceIds = {
        _syntheticMessageId("c1", gmailId) for gmailId in ("m1", "m2", "m3")
    }
    submittedSourceIds = {job.sourceId for job in knowledgeService.calls}
    assert submittedSourceIds == expectedSourceIds

    for job in knowledgeService.calls:
        assert job.sourceKind == "gmail_message"
        assert job.mimeType == "message/rfc822"
        provenance = job.provenance
        assert provenance["connectionId"] == "c1"
        assert provenance["authority"] == "google"
        assert provenance["service"] == "gmail"
        # The fixture's historyId ("h1") must arrive as contentVersion.
        assert job.contentVersion == "h1"
        objectIds = [co["contentObjectId"] for co in job.contentObjects]
        assert "header" in objectIds
def test_bootstrap_gmail_follows_pagination():
    """One INBOX message on page 1 plus two on page 2 gives three indexed."""
    googleGet = _FakeGoogleGet(
        messages_by_label={"INBOX": [_msg("m1")], "SENT": []},
        paginated_label="INBOX",
        page2=[_msg("m2"), _msg("m3")],
    )
    knowledgeService = _FakeKnowledgeService()
    userConnection = SimpleNamespace(mandateId="m1", userId="u1")

    async def _bootstrap():
        return await bootstrapGmail(
            connectionId="c1",
            adapter=SimpleNamespace(_token="t"),
            connection=userConnection,
            knowledgeService=knowledgeService,
            limits=GmailBootstrapLimits(maxAgeDays=None),
            googleGetFn=googleGet,
        )

    summary = asyncio.run(_bootstrap())
    # m2 and m3 are only listed on the second page.
    assert summary["indexed"] == 3
def test_bootstrap_gmail_reports_duplicates():
    """Jobs the knowledge service flags as duplicate count as skipped, not indexed."""
    googleGet = _FakeGoogleGet({"INBOX": [_msg("m1"), _msg("m2")], "SENT": []})
    alreadyKnown = {_syntheticMessageId("c1", gmailId) for gmailId in ("m1", "m2")}
    knowledgeService = _FakeKnowledgeService(duplicateIds=alreadyKnown)
    userConnection = SimpleNamespace(mandateId="m1", userId="u1")

    async def _bootstrap():
        return await bootstrapGmail(
            connectionId="c1",
            adapter=SimpleNamespace(_token="t"),
            connection=userConnection,
            knowledgeService=knowledgeService,
            limits=GmailBootstrapLimits(maxAgeDays=None),
            googleGetFn=googleGet,
        )

    summary = asyncio.run(_bootstrap())
    assert summary["indexed"] == 0
    assert summary["skippedDuplicate"] == 2
if __name__ == "__main__":
    # Direct execution (without pytest): run every test in file order and
    # report success only if none of them raised.
    for _testFn in (
        test_buildContentObjects_emits_header_snippet_body,
        test_walkPayloadForBody_prefers_plain_over_html,
        test_walkPayloadForBody_falls_back_to_html,
        test_bootstrap_gmail_indexes_messages_from_inbox_and_sent,
        test_bootstrap_gmail_follows_pagination,
        test_bootstrap_gmail_reports_duplicates,
    ):
        _testFn()
    print("OK — bootstrapGmail tests passed")