#!/usr/bin/env python3
# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
"""Bootstrap Gmail tests with a fake googleGet + knowledge service.

Verifies:
- Default labels (INBOX + SENT) are traversed.
- Each message produces a requestIngestion call with sourceKind=gmail_message
  and structured contentObjects (header / snippet / body).
- Pagination via `nextPageToken` is followed.
- historyId is forwarded as contentVersion → idempotency.
- MIME body extraction walks nested parts (multipart/alternative).
"""

import asyncio
import base64
import os
import sys
from types import SimpleNamespace

# Make the repository root importable when this file is run directly.
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "../../.."))

from modules.serviceCenter.services.serviceKnowledge.subConnectorSyncGmail import (
    bootstrapGmail,
    GmailBootstrapLimits,
    _syntheticMessageId,
    _buildContentObjects,
    _walkPayloadForBody,
)


def _b64url(text: str) -> str:
    """Return *text* as unpadded URL-safe base64 of its UTF-8 bytes."""
    return base64.urlsafe_b64encode(text.encode("utf-8")).decode("ascii").rstrip("=")


def _msg(mid: str, subject: str = "Hi", body: str = "Hello world", historyId: str = "h1"):
    """Build the message dict these tests feed to the connector.

    The payload is a single ``text/plain`` part whose body data is *body*
    encoded with ``_b64url``; the snippet is the first 120 characters of
    *body*.
    """
    return {
        "id": mid,
        "threadId": f"thread-{mid}",
        "historyId": historyId,
        "internalDate": "1700000000000",
        "snippet": body[:120],
        "payload": {
            "headers": [
                {"name": "Subject", "value": subject},
                # NOTE(review): the trailing space suggests an angle-bracketed
                # address was lost from these two values — confirm against VCS.
                {"name": "From", "value": "Alice "},
                {"name": "To", "value": "Bob "},
                {"name": "Date", "value": "Tue, 01 Jan 2025 10:00:00 +0000"},
            ],
            "mimeType": "text/plain",
            "body": {"data": _b64url(body), "size": len(body)},
            "parts": [],
        },
    }


class _FakeGoogleGet:
    """Records URLs + returns the wired-up page or message response."""

    def __init__(self, messages_by_label, paginated_label=None, page2=None):
        # label -> list of full message dicts served for that label
        self._messages = messages_by_label
        # label whose first list response carries a nextPageToken
        self._paginated = paginated_label
        # messages returned for the follow-up `pageToken=token-2` request
        self._page2 = page2 or []
        # labels for which the first (token-bearing) page was already served
        self._served_first_page = set()
        # every URL requested, in call order
        self.requested = []

    async def __call__(self, url: str):
        """Return the canned list page or message detail matching *url*."""
        self.requested.append(url)

        # List page: contains `/users/me/messages?labelIds=...`
        if "/users/me/messages?" in url:
            for label, msgs in self._messages.items():
                if f"labelIds={label}" not in url:
                    continue
                if label == self._paginated:
                    if label not in self._served_first_page:
                        # First hit on the paginated label: advertise page 2.
                        self._served_first_page.add(label)
                        return {
                            "messages": [{"id": m["id"]} for m in msgs],
                            "nextPageToken": "token-2",
                        }
                    if "pageToken=token-2" in url:
                        return {
                            "messages": [{"id": m["id"]} for m in self._page2],
                        }
                return {"messages": [{"id": m["id"]} for m in msgs]}
            return {"messages": []}

        # Detail fetch: /users/me/messages/{id}?format=full
        if "/users/me/messages/" in url and "format=full" in url:
            msgId = url.split("/users/me/messages/")[-1].split("?")[0]
            for msgs in self._messages.values():
                for m in msgs:
                    if m["id"] == msgId:
                        return m
            for m in self._page2:
                if m["id"] == msgId:
                    return m

        return {"error": "not found"}


class _FakeKnowledgeService:
    """Collects ingestion jobs and answers ``indexed`` or ``duplicate``."""

    def __init__(self, duplicateIds=None):
        # every job passed to requestIngestion, in call order
        self.calls = []
        # sourceIds that are answered with status "duplicate"
        self._duplicates = duplicateIds or set()

    async def requestIngestion(self, job):
        """Record *job* and return a result object keyed on its sourceId."""
        self.calls.append(job)
        status = "duplicate" if job.sourceId in self._duplicates else "indexed"
        return SimpleNamespace(
            jobId=job.sourceId,
            status=status,
            contentHash="h",
            fileId=job.sourceId,
            index=None,
            error=None,
        )


def _runBootstrap(fake_get, knowledge):
    """Run ``bootstrapGmail`` once with the fixture shared by the tests.

    Uses connectionId ``c1``, a dummy adapter/connection, and limits with
    ``maxAgeDays=None``; returns whatever ``bootstrapGmail`` returns.
    """
    connection = SimpleNamespace(mandateId="m1", userId="u1")

    async def _run():
        return await bootstrapGmail(
            connectionId="c1",
            adapter=SimpleNamespace(_token="t"),
            connection=connection,
            knowledgeService=knowledge,
            limits=GmailBootstrapLimits(maxAgeDays=None),
            googleGetFn=fake_get,
        )

    return asyncio.run(_run())


def test_buildContentObjects_emits_header_snippet_body():
    """Content objects come out as header, snippet, body — in that order."""
    parts = _buildContentObjects(_msg("m1", body="Hello\nWorld"), maxBodyChars=8000)
    ids = [p["contentObjectId"] for p in parts]
    assert ids == ["header", "snippet", "body"]
    header = parts[0]["data"]
    assert "Subject: Hi" in header
    assert "From: Alice " in header
    assert "To: Bob " in header


def test_walkPayloadForBody_prefers_plain_over_html():
    """Both the plain and the HTML alternative are extracted from the parts."""
    # NOTE(review): the HTML markup in this fixture was lost when the file was
    # flattened (only blank lines remained around the text); `<p>…</p>` is a
    # reconstruction — confirm against version control.
    payload = {
        "mimeType": "multipart/alternative",
        "parts": [
            {"mimeType": "text/plain", "body": {"data": _b64url("plain body")}},
            {"mimeType": "text/html", "body": {"data": _b64url("<p>html body</p>")}},
        ],
    }
    bodies = _walkPayloadForBody(payload)
    assert bodies["text"] == "plain body"
    assert bodies["html"] == "<p>html body</p>"


def test_walkPayloadForBody_falls_back_to_html():
    """With no text/plain part, text is empty and the HTML body is kept."""
    # NOTE(review): `<p>…</p>` is reconstructed markup — confirm against
    # version control.
    payload = {
        "mimeType": "multipart/alternative",
        "parts": [
            {"mimeType": "text/html", "body": {"data": _b64url("<p>only html</p>")}},
        ],
    }
    bodies = _walkPayloadForBody(payload)
    assert bodies["text"] == ""
    assert "only html" in bodies["html"]


def test_bootstrap_gmail_indexes_messages_from_inbox_and_sent():
    """Messages under INBOX and SENT each yield one gmail_message job."""
    fake_get = _FakeGoogleGet({
        "INBOX": [_msg("m1"), _msg("m2")],
        "SENT": [_msg("m3")],
    })
    knowledge = _FakeKnowledgeService()

    result = _runBootstrap(fake_get, knowledge)

    assert result["indexed"] == 3
    sourceIds = {c.sourceId for c in knowledge.calls}
    assert sourceIds == {
        _syntheticMessageId("c1", "m1"),
        _syntheticMessageId("c1", "m2"),
        _syntheticMessageId("c1", "m3"),
    }
    for job in knowledge.calls:
        assert job.sourceKind == "gmail_message"
        assert job.mimeType == "message/rfc822"
        assert job.provenance["connectionId"] == "c1"
        assert job.provenance["authority"] == "google"
        assert job.provenance["service"] == "gmail"
        # historyId ("h1" from _msg) is forwarded as the content version.
        assert job.contentVersion == "h1"
        assert any(co["contentObjectId"] == "header" for co in job.contentObjects)


def test_bootstrap_gmail_follows_pagination():
    """A nextPageToken on the first INBOX page leads to page 2 being indexed."""
    fake_get = _FakeGoogleGet(
        messages_by_label={"INBOX": [_msg("m1")], "SENT": []},
        paginated_label="INBOX",
        page2=[_msg("m2"), _msg("m3")],
    )
    knowledge = _FakeKnowledgeService()

    result = _runBootstrap(fake_get, knowledge)

    assert result["indexed"] == 3


def test_bootstrap_gmail_reports_duplicates():
    """Jobs answered as duplicate count as skippedDuplicate, not indexed."""
    fake_get = _FakeGoogleGet({"INBOX": [_msg("m1"), _msg("m2")], "SENT": []})
    duplicates = {
        _syntheticMessageId("c1", "m1"),
        _syntheticMessageId("c1", "m2"),
    }
    knowledge = _FakeKnowledgeService(duplicateIds=duplicates)

    result = _runBootstrap(fake_get, knowledge)

    assert result["indexed"] == 0
    assert result["skippedDuplicate"] == 2


if __name__ == "__main__":
    test_buildContentObjects_emits_header_snippet_body()
    test_walkPayloadForBody_prefers_plain_over_html()
    test_walkPayloadForBody_falls_back_to_html()
    test_bootstrap_gmail_indexes_messages_from_inbox_and_sent()
    test_bootstrap_gmail_follows_pagination()
    test_bootstrap_gmail_reports_duplicates()
    print("OK — bootstrapGmail tests passed")