240 lines
8.2 KiB
Python
240 lines
8.2 KiB
Python
#!/usr/bin/env python3
|
|
# Copyright (c) 2025 Patrick Motsch
|
|
# All rights reserved.
|
|
"""Bootstrap Gmail tests with a fake googleGet + knowledge service.

Verifies:
- Default labels (INBOX + SENT) are traversed.
- Each message produces a requestIngestion call with sourceKind=gmail_message
  and structured contentObjects (header / snippet / body).
- Pagination via `nextPageToken` is followed.
- historyId is forwarded as contentVersion → idempotency.
- MIME body extraction walks nested parts (multipart/alternative).
"""
|
|
|
|
import asyncio
|
|
import base64
|
|
import os
|
|
import sys
|
|
from types import SimpleNamespace
|
|
|
|
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "../../.."))
|
|
|
|
from modules.serviceCenter.services.serviceKnowledge.subConnectorSyncGmail import (
|
|
bootstrapGmail,
|
|
GmailBootstrapLimits,
|
|
_syntheticMessageId,
|
|
_buildContentObjects,
|
|
_walkPayloadForBody,
|
|
)
|
|
|
|
|
|
def _b64url(text: str) -> str:
|
|
return base64.urlsafe_b64encode(text.encode("utf-8")).decode("ascii").rstrip("=")
|
|
|
|
|
|
def _msg(mid: str, subject: str = "Hi", body: str = "Hello world", historyId: str = "h1"):
    """Build a minimal single-part ``text/plain`` Gmail message dict for the tests.

    Args:
        mid: Message id; also used to derive ``threadId`` (``thread-<mid>``).
        subject: Value of the ``Subject`` header.
        body: Plain-text body; base64url-encoded into ``payload.body.data`` and
            truncated to 120 characters for ``snippet``.
        historyId: Value stored under ``historyId``.

    Returns:
        A dict with ``id``, ``threadId``, ``historyId``, ``internalDate``,
        ``snippet`` and a ``payload`` holding headers, mime type, body and an
        empty ``parts`` list.
    """
    headers = [
        {"name": "Subject", "value": subject},
        {"name": "From", "value": "Alice <a@x.com>"},
        {"name": "To", "value": "Bob <b@x.com>"},
        {"name": "Date", "value": "Tue, 01 Jan 2025 10:00:00 +0000"},
    ]
    # Gmail documents body.size as a byte count, so measure the UTF-8 encoded
    # body instead of the character count (identical for ASCII bodies).
    bodySize = len(body.encode("utf-8"))
    return {
        "id": mid,
        "threadId": f"thread-{mid}",
        "historyId": historyId,
        "internalDate": "1700000000000",
        "snippet": body[:120],
        "payload": {
            "headers": headers,
            "mimeType": "text/plain",
            "body": {"data": _b64url(body), "size": bodySize},
            "parts": [],
        },
    }
|
|
|
|
|
|
class _FakeGoogleGet:
|
|
"""Records URLs + returns the wired-up page or message response."""
|
|
|
|
def __init__(self, messages_by_label, paginated_label=None, page2=None):
|
|
self._messages = messages_by_label
|
|
self._paginated = paginated_label
|
|
self._page2 = page2 or []
|
|
self._served_first_page = set()
|
|
self.requested = []
|
|
|
|
async def __call__(self, url: str):
|
|
self.requested.append(url)
|
|
# List page: contains `/users/me/messages?labelIds=...`
|
|
if "/users/me/messages?" in url:
|
|
for label, msgs in self._messages.items():
|
|
if f"labelIds={label}" in url:
|
|
if (
|
|
label == self._paginated
|
|
and label not in self._served_first_page
|
|
):
|
|
self._served_first_page.add(label)
|
|
return {
|
|
"messages": [{"id": m["id"]} for m in msgs],
|
|
"nextPageToken": "token-2",
|
|
}
|
|
if label == self._paginated and "pageToken=token-2" in url:
|
|
return {
|
|
"messages": [{"id": m["id"]} for m in self._page2],
|
|
}
|
|
return {"messages": [{"id": m["id"]} for m in msgs]}
|
|
return {"messages": []}
|
|
# Detail fetch: /users/me/messages/{id}?format=full
|
|
if "/users/me/messages/" in url and "format=full" in url:
|
|
msgId = url.split("/users/me/messages/")[-1].split("?")[0]
|
|
for msgs in self._messages.values():
|
|
for m in msgs:
|
|
if m["id"] == msgId:
|
|
return m
|
|
for m in self._page2:
|
|
if m["id"] == msgId:
|
|
return m
|
|
return {"error": "not found"}
|
|
|
|
|
|
class _FakeKnowledgeService:
|
|
def __init__(self, duplicateIds=None):
|
|
self.calls = []
|
|
self._duplicates = duplicateIds or set()
|
|
|
|
async def requestIngestion(self, job):
|
|
self.calls.append(job)
|
|
status = "duplicate" if job.sourceId in self._duplicates else "indexed"
|
|
return SimpleNamespace(
|
|
jobId=job.sourceId, status=status, contentHash="h",
|
|
fileId=job.sourceId, index=None, error=None,
|
|
)
|
|
|
|
|
|
def test_buildContentObjects_emits_header_snippet_body():
    """A plain message yields header, snippet and body objects, in that order."""
    message = _msg("m1", body="Hello\nWorld")
    contentObjects = _buildContentObjects(message, maxBodyChars=8000)

    assert [obj["contentObjectId"] for obj in contentObjects] == ["header", "snippet", "body"]

    # The header object must carry the Subject / From / To lines.
    headerText = contentObjects[0]["data"]
    for expectedLine in (
        "Subject: Hi",
        "From: Alice <a@x.com>",
        "To: Bob <b@x.com>",
    ):
        assert expectedLine in headerText
|
|
|
|
|
|
def test_walkPayloadForBody_prefers_plain_over_html():
    """Both alternatives of a multipart/alternative payload are extracted."""
    plainPart = {"mimeType": "text/plain", "body": {"data": _b64url("plain body")}}
    htmlPart = {"mimeType": "text/html", "body": {"data": _b64url("<p>html body</p>")}}

    extracted = _walkPayloadForBody(
        {"mimeType": "multipart/alternative", "parts": [plainPart, htmlPart]}
    )

    assert extracted["text"] == "plain body"
    assert extracted["html"] == "<p>html body</p>"
|
|
|
|
|
|
def test_walkPayloadForBody_falls_back_to_html():
    """With only an HTML part, ``text`` stays empty and ``html`` holds the markup."""
    htmlOnlyPart = {"mimeType": "text/html", "body": {"data": _b64url("<p>only html</p>")}}

    extracted = _walkPayloadForBody(
        {"mimeType": "multipart/alternative", "parts": [htmlOnlyPart]}
    )

    assert extracted["text"] == ""
    assert "only html" in extracted["html"]
|
|
|
|
|
|
def test_bootstrap_gmail_indexes_messages_from_inbox_and_sent():
    """Two INBOX messages and one SENT message each produce one ingestion job."""
    fake_get = _FakeGoogleGet({
        "INBOX": [_msg("m1"), _msg("m2")],
        "SENT": [_msg("m3")],
    })
    knowledge = _FakeKnowledgeService()
    connection = SimpleNamespace(mandateId="m1", userId="u1")

    async def _bootstrap():
        outcome = await bootstrapGmail(
            connectionId="c1",
            adapter=SimpleNamespace(_token="t"),
            connection=connection,
            knowledgeService=knowledge,
            limits=GmailBootstrapLimits(maxAgeDays=None),
            googleGetFn=fake_get,
        )
        return outcome

    result = asyncio.run(_bootstrap())
    assert result["indexed"] == 3

    # Every message must have been submitted under its synthetic source id.
    expectedIds = {_syntheticMessageId("c1", mid) for mid in ("m1", "m2", "m3")}
    submittedIds = {job.sourceId for job in knowledge.calls}
    assert submittedIds == expectedIds

    for job in knowledge.calls:
        assert job.sourceKind == "gmail_message"
        assert job.mimeType == "message/rfc822"
        assert job.provenance["connectionId"] == "c1"
        assert job.provenance["authority"] == "google"
        assert job.provenance["service"] == "gmail"
        # historyId "h1" from the fixture is expected as the content version.
        assert job.contentVersion == "h1"
        assert any(part["contentObjectId"] == "header" for part in job.contentObjects)
|
|
|
|
|
|
def test_bootstrap_gmail_follows_pagination():
    """INBOX is served in two pages; all three messages must count as indexed."""
    firstPage = [_msg("m1")]
    secondPage = [_msg("m2"), _msg("m3")]
    fake_get = _FakeGoogleGet(
        messages_by_label={"INBOX": firstPage, "SENT": []},
        paginated_label="INBOX",
        page2=secondPage,
    )
    knowledge = _FakeKnowledgeService()
    connection = SimpleNamespace(mandateId="m1", userId="u1")

    async def _bootstrap():
        outcome = await bootstrapGmail(
            connectionId="c1",
            adapter=SimpleNamespace(_token="t"),
            connection=connection,
            knowledgeService=knowledge,
            limits=GmailBootstrapLimits(maxAgeDays=None),
            googleGetFn=fake_get,
        )
        return outcome

    result = asyncio.run(_bootstrap())
    # m2 and m3 exist only on the second page, so 3 implies the token was followed.
    assert result["indexed"] == 3
|
|
|
|
|
|
def test_bootstrap_gmail_reports_duplicates():
    """Messages the knowledge service flags as duplicate are counted as skipped."""
    fake_get = _FakeGoogleGet({"INBOX": [_msg("m1"), _msg("m2")], "SENT": []})
    # Mark both INBOX messages as already known to the knowledge service.
    duplicates = {_syntheticMessageId("c1", mid) for mid in ("m1", "m2")}
    knowledge = _FakeKnowledgeService(duplicateIds=duplicates)
    connection = SimpleNamespace(mandateId="m1", userId="u1")

    async def _bootstrap():
        outcome = await bootstrapGmail(
            connectionId="c1",
            adapter=SimpleNamespace(_token="t"),
            connection=connection,
            knowledgeService=knowledge,
            limits=GmailBootstrapLimits(maxAgeDays=None),
            googleGetFn=fake_get,
        )
        return outcome

    result = asyncio.run(_bootstrap())
    assert result["indexed"] == 0
    assert result["skippedDuplicate"] == 2
|
|
|
|
|
|
if __name__ == "__main__":
    # Allow running the module directly, without pytest: execute each test in
    # declaration order and report success once all of them have returned.
    for _testCase in (
        test_buildContentObjects_emits_header_snippet_body,
        test_walkPayloadForBody_prefers_plain_over_html,
        test_walkPayloadForBody_falls_back_to_html,
        test_bootstrap_gmail_indexes_messages_from_inbox_and_sent,
        test_bootstrap_gmail_follows_pagination,
        test_bootstrap_gmail_reports_duplicates,
    ):
        _testCase()
    print("OK — bootstrapGmail tests passed")
|