# Copyright (c) 2025 Patrick Motsch # All rights reserved. """Gmail bootstrap for the unified knowledge ingestion lane. Mirrors the Outlook pilot (see subConnectorSyncOutlook.py) but talks to Google Mail's REST API. Messages become `sourceKind="gmail_message"` virtual documents with header / snippet / cleaned body content-objects; attachments are optional child jobs with `sourceKind="gmail_attachment"`. Idempotency: Gmail's stable `historyId` (or `internalDate` as fallback) is passed as `contentVersion`, so rerunning the bootstrap yields `ingestion.skipped.duplicate` for unchanged messages. """ from __future__ import annotations import asyncio import base64 import hashlib import logging import time from dataclasses import dataclass, field from datetime import datetime, timedelta, timezone from typing import Any, Callable, Dict, List, Optional from modules.serviceCenter.services.serviceKnowledge.subTextClean import cleanEmailBody logger = logging.getLogger(__name__) MAX_MESSAGES_DEFAULT = 500 MAX_BODY_CHARS_DEFAULT = 8000 MAX_ATTACHMENT_BYTES_DEFAULT = 10 * 1024 * 1024 DEFAULT_LABELS = ("INBOX", "SENT") @dataclass class GmailBootstrapLimits: maxMessages: int = MAX_MESSAGES_DEFAULT labels: tuple = DEFAULT_LABELS maxBodyChars: int = MAX_BODY_CHARS_DEFAULT includeAttachments: bool = False maxAttachmentBytes: int = MAX_ATTACHMENT_BYTES_DEFAULT # Only fetch messages newer than N days. None disables filter. maxAgeDays: Optional[int] = 90 # Content depth: "metadata" | "snippet" | "full" mailContentDepth: str = "full" # Pass-through to IngestionJob.neutralize neutralize: bool = False @dataclass class GmailBootstrapResult: connectionId: str indexed: int = 0 skippedDuplicate: int = 0 skippedPolicy: int = 0 failed: int = 0 attachmentsIndexed: int = 0 errors: List[str] = field(default_factory=list) def _syntheticMessageId(connectionId: str, messageId: str) -> str: token = hashlib.sha256(f"{connectionId}:{messageId}".encode("utf-8")).hexdigest()[:16] return f"gm:{connectionId[:8]}:{token}" def _syntheticAttachmentId(connectionId: str, messageId: str, attachmentId: str) -> str: token = hashlib.sha256( f"{connectionId}:{messageId}:{attachmentId}".encode("utf-8") ).hexdigest()[:16] return f"ga:{connectionId[:8]}:{token}" def _decodeBase64Url(data: str) -> bytes: if not data: return b"" # Gmail uses URL-safe base64 without padding. padding = 4 - (len(data) % 4) if padding != 4: data = data + ("=" * padding) try: return base64.urlsafe_b64decode(data) except Exception: return b"" def _walkPayloadForBody(payload: Dict[str, Any]) -> Dict[str, str]: """Return {"text": ..., "html": ...} by walking MIME parts. Gmail `payload` is a tree of parts. We prefer `text/plain` for the cleaned body, but capture `text/html` as a fallback so `cleanEmailBody` can strip markup if plain is missing. """ found: Dict[str, str] = {"text": "", "html": ""} def _walk(part: Dict[str, Any]) -> None: mime = (part.get("mimeType") or "").lower() body = part.get("body") or {} raw = body.get("data") or "" if raw and mime.startswith("text/"): decoded = _decodeBase64Url(raw).decode("utf-8", errors="replace") key = "text" if mime == "text/plain" else ("html" if mime == "text/html" else "") if key and not found[key]: found[key] = decoded for sub in part.get("parts") or []: _walk(sub) _walk(payload or {}) return found def _headerMap(payload: Dict[str, Any]) -> Dict[str, str]: return { (h.get("name") or "").lower(): (h.get("value") or "") for h in (payload.get("headers") or []) } def _buildContentObjects( message: Dict[str, Any], maxBodyChars: int, mailContentDepth: str = "full", ) -> List[Dict[str, Any]]: """Build content objects for a Gmail message. `mailContentDepth` controls how much is embedded: - "metadata": header only (subject, from, to, date) - "snippet": header + Gmail snippet (~155 chars, no full body) - "full": header + snippet + cleaned full body (default) """ payload = message.get("payload") or {} headers = _headerMap(payload) subject = headers.get("subject") or "(no subject)" fromAddr = headers.get("from") or "" toAddr = headers.get("to") or "" ccAddr = headers.get("cc") or "" date = headers.get("date") or "" snippet = message.get("snippet") or "" parts: List[Dict[str, Any]] = [] header = ( f"Subject: {subject}\n" f"From: {fromAddr}\n" f"To: {toAddr}\n" + (f"Cc: {ccAddr}\n" if ccAddr else "") + f"Date: {date}" ) parts.append({ "contentObjectId": "header", "contentType": "text", "data": header, "contextRef": {"part": "header"}, }) if mailContentDepth in ("snippet", "full") and snippet: parts.append({ "contentObjectId": "snippet", "contentType": "text", "data": snippet, "contextRef": {"part": "snippet"}, }) if mailContentDepth == "full": bodies = _walkPayloadForBody(payload) rawBody = bodies["text"] or bodies["html"] cleanedBody = cleanEmailBody(rawBody, maxChars=maxBodyChars) if rawBody else "" if cleanedBody: parts.append({ "contentObjectId": "body", "contentType": "text", "data": cleanedBody, "contextRef": {"part": "body"}, }) return parts async def bootstrapGmail( connectionId: str, *, progressCb: Optional[Callable[[int, Optional[str]], None]] = None, adapter: Any = None, connection: Any = None, knowledgeService: Any = None, limits: Optional[GmailBootstrapLimits] = None, googleGetFn: Optional[Callable[..., Any]] = None, ) -> Dict[str, Any]: """Enumerate Gmail labels (INBOX + SENT default) and ingest messages.""" from modules.serviceCenter.services.serviceKnowledge.subConnectorPrefs import loadConnectionPrefs prefs = loadConnectionPrefs(connectionId) if not limits: limits = GmailBootstrapLimits( includeAttachments=prefs.mailIndexAttachments, maxAgeDays=prefs.maxAgeDays if prefs.maxAgeDays > 0 else None, mailContentDepth=prefs.mailContentDepth, neutralize=prefs.neutralizeBeforeEmbed, ) startMs = time.time() result = GmailBootstrapResult(connectionId=connectionId) logger.info( "ingestion.connection.bootstrap.started part=gmail connectionId=%s", connectionId, extra={ "event": "ingestion.connection.bootstrap.started", "part": "gmail", "connectionId": connectionId, }, ) if adapter is None or knowledgeService is None or connection is None: adapter, connection, knowledgeService = await _resolveDependencies(connectionId) if googleGetFn is None: from modules.connectors.providerGoogle.connectorGoogle import _googleGet as _defaultGet token = getattr(adapter, "_token", "") async def googleGetFn(url: str) -> Dict[str, Any]: # type: ignore[no-redef] return await _defaultGet(token, url) mandateId = str(getattr(connection, "mandateId", "") or "") if connection is not None else "" userId = str(getattr(connection, "userId", "") or "") if connection is not None else "" for labelId in limits.labels: if result.indexed + result.skippedDuplicate >= limits.maxMessages: break try: await _ingestLabel( googleGetFn=googleGetFn, knowledgeService=knowledgeService, connectionId=connectionId, mandateId=mandateId, userId=userId, labelId=labelId, limits=limits, result=result, progressCb=progressCb, ) except Exception as exc: logger.error("gmail ingestion label %s failed: %s", labelId, exc, exc_info=True) result.errors.append(f"label({labelId}): {exc}") return _finalizeResult(connectionId, result, startMs) async def _resolveDependencies(connectionId: str): from modules.interfaces.interfaceDbApp import getRootInterface from modules.auth import TokenManager from modules.connectors.providerGoogle.connectorGoogle import GoogleConnector from modules.serviceCenter import getService from modules.serviceCenter.context import ServiceCenterContext from modules.security.rootAccess import getRootUser rootInterface = getRootInterface() connection = rootInterface.getUserConnectionById(connectionId) if connection is None: raise ValueError(f"UserConnection not found: {connectionId}") token = TokenManager().getFreshToken(connectionId) if not token or not token.tokenAccess: raise ValueError(f"No valid token for connection {connectionId}") provider = GoogleConnector(connection, token.tokenAccess) adapter = provider.getServiceAdapter("gmail") rootUser = getRootUser() ctx = ServiceCenterContext( user=rootUser, mandate_id=str(getattr(connection, "mandateId", "") or ""), ) knowledgeService = getService("knowledge", ctx) return adapter, connection, knowledgeService async def _ingestLabel( *, googleGetFn, knowledgeService, connectionId: str, mandateId: str, userId: str, labelId: str, limits: GmailBootstrapLimits, result: GmailBootstrapResult, progressCb: Optional[Callable[[int, Optional[str]], None]], ) -> None: remaining = limits.maxMessages - (result.indexed + result.skippedDuplicate) if remaining <= 0: return pageSize = min(100, remaining) query = "" if limits.maxAgeDays: cutoff = datetime.now(timezone.utc) - timedelta(days=limits.maxAgeDays) # Gmail uses YYYY/MM/DD. query = f"after:{cutoff.strftime('%Y/%m/%d')}" baseUrl = ( "https://gmail.googleapis.com/gmail/v1/users/me/messages" f"?labelIds={labelId}&maxResults={pageSize}" ) if query: baseUrl = f"{baseUrl}&q={query}" nextPageToken: Optional[str] = None while (result.indexed + result.skippedDuplicate) < limits.maxMessages: url = baseUrl if not nextPageToken else f"{baseUrl}&pageToken={nextPageToken}" page = await googleGetFn(url) if not isinstance(page, dict) or "error" in page: err = (page or {}).get("error") if isinstance(page, dict) else "unknown" logger.warning("gmail list page error for label %s: %s", labelId, err) result.errors.append(f"list({labelId}): {err}") return messageStubs = page.get("messages") or [] for stub in messageStubs: if result.indexed + result.skippedDuplicate >= limits.maxMessages: break msgId = stub.get("id") if not msgId: continue detailUrl = ( f"https://gmail.googleapis.com/gmail/v1/users/me/messages/{msgId}?format=full" ) detail = await googleGetFn(detailUrl) if not isinstance(detail, dict) or "error" in detail: result.failed += 1 continue await _ingestMessage( googleGetFn=googleGetFn, knowledgeService=knowledgeService, connectionId=connectionId, mandateId=mandateId, userId=userId, labelId=labelId, message=detail, limits=limits, result=result, progressCb=progressCb, ) nextPageToken = page.get("nextPageToken") if not nextPageToken: break async def _ingestMessage( *, googleGetFn, knowledgeService, connectionId: str, mandateId: str, userId: str, labelId: str, message: Dict[str, Any], limits: GmailBootstrapLimits, result: GmailBootstrapResult, progressCb: Optional[Callable[[int, Optional[str]], None]], ) -> None: from modules.serviceCenter.services.serviceKnowledge.mainServiceKnowledge import IngestionJob messageId = message.get("id") if not messageId: result.skippedPolicy += 1 return revision = message.get("historyId") or message.get("internalDate") headers = _headerMap(message.get("payload") or {}) subject = headers.get("subject") or "(no subject)" syntheticId = _syntheticMessageId(connectionId, messageId) fileName = f"{subject[:80].strip()}.eml" if subject else f"{messageId}.eml" contentObjects = _buildContentObjects( message, limits.maxBodyChars, mailContentDepth=limits.mailContentDepth ) try: handle = await knowledgeService.requestIngestion( IngestionJob( sourceKind="gmail_message", sourceId=syntheticId, fileName=fileName, mimeType="message/rfc822", userId=userId, mandateId=mandateId, contentObjects=contentObjects, contentVersion=str(revision) if revision else None, neutralize=limits.neutralize, provenance={ "connectionId": connectionId, "authority": "google", "service": "gmail", "externalItemId": messageId, "label": labelId, "threadId": message.get("threadId"), "tier": limits.mailContentDepth, }, ) ) except Exception as exc: logger.error("gmail ingestion %s failed: %s", messageId, exc, exc_info=True) result.failed += 1 result.errors.append(f"ingest({messageId}): {exc}") return if handle.status == "duplicate": result.skippedDuplicate += 1 elif handle.status == "indexed": result.indexed += 1 else: result.failed += 1 if limits.includeAttachments: try: await _ingestAttachments( googleGetFn=googleGetFn, knowledgeService=knowledgeService, connectionId=connectionId, mandateId=mandateId, userId=userId, message=message, parentSyntheticId=syntheticId, limits=limits, result=result, ) except Exception as exc: logger.warning("gmail attachments %s failed: %s", messageId, exc) result.errors.append(f"attachments({messageId}): {exc}") if progressCb is not None and (result.indexed + result.skippedDuplicate) % 50 == 0: processed = result.indexed + result.skippedDuplicate try: progressCb( min(90, 10 + int(80 * processed / max(1, limits.maxMessages))), f"gmail processed={processed}", ) except Exception: pass logger.info( "ingestion.connection.bootstrap.progress part=gmail processed=%d skippedDup=%d failed=%d", processed, result.skippedDuplicate, result.failed, extra={ "event": "ingestion.connection.bootstrap.progress", "part": "gmail", "connectionId": connectionId, "processed": processed, "skippedDup": result.skippedDuplicate, "failed": result.failed, }, ) await asyncio.sleep(0) async def _ingestAttachments( *, googleGetFn, knowledgeService, connectionId: str, mandateId: str, userId: str, message: Dict[str, Any], parentSyntheticId: str, limits: GmailBootstrapLimits, result: GmailBootstrapResult, ) -> None: """Child ingestion jobs for file attachments. Skips inline images (cid: refs).""" from modules.serviceCenter.services.serviceKnowledge.mainServiceKnowledge import IngestionJob from modules.datamodels.datamodelExtraction import ExtractionOptions from modules.serviceCenter.services.serviceExtraction.subPipeline import runExtraction from modules.serviceCenter.services.serviceExtraction.subRegistry import ( ExtractorRegistry, ChunkerRegistry, ) messageId = message.get("id") or "" def _collectAttachmentStubs(part: Dict[str, Any], acc: List[Dict[str, Any]]) -> None: filename = part.get("filename") or "" body = part.get("body") or {} attId = body.get("attachmentId") if filename and attId: acc.append({ "filename": filename, "mimeType": part.get("mimeType") or "application/octet-stream", "attachmentId": attId, "size": int(body.get("size") or 0), }) for sub in part.get("parts") or []: _collectAttachmentStubs(sub, acc) stubs: List[Dict[str, Any]] = [] _collectAttachmentStubs(message.get("payload") or {}, stubs) if not stubs: return extractorRegistry = ExtractorRegistry() chunkerRegistry = ChunkerRegistry() for stub in stubs: if stub["size"] and stub["size"] > limits.maxAttachmentBytes: result.skippedPolicy += 1 continue attUrl = ( f"https://gmail.googleapis.com/gmail/v1/users/me/messages/{messageId}" f"/attachments/{stub['attachmentId']}" ) detail = await googleGetFn(attUrl) if not isinstance(detail, dict) or "error" in detail: result.failed += 1 continue rawBytes = _decodeBase64Url(detail.get("data") or "") if not rawBytes: continue fileName = stub["filename"] mimeType = stub["mimeType"] syntheticId = _syntheticAttachmentId(connectionId, messageId, stub["attachmentId"]) try: extracted = runExtraction( extractorRegistry, chunkerRegistry, rawBytes, fileName, mimeType, ExtractionOptions(mergeStrategy=None), ) except Exception as exc: logger.warning("gmail attachment extract %s failed: %s", stub["attachmentId"], exc) result.failed += 1 continue contentObjects: List[Dict[str, Any]] = [] for part in getattr(extracted, "parts", None) or []: data = getattr(part, "data", None) or "" if not data or not str(data).strip(): continue typeGroup = getattr(part, "typeGroup", "text") or "text" contentType = "text" if typeGroup == "image": contentType = "image" elif typeGroup in ("binary", "container"): contentType = "other" contentObjects.append({ "contentObjectId": getattr(part, "id", ""), "contentType": contentType, "data": data, "contextRef": { "containerPath": fileName, "location": getattr(part, "label", None) or "attachment", **(getattr(part, "metadata", None) or {}), }, }) if not contentObjects: result.skippedPolicy += 1 continue try: await knowledgeService.requestIngestion( IngestionJob( sourceKind="gmail_attachment", sourceId=syntheticId, fileName=fileName, mimeType=mimeType, userId=userId, mandateId=mandateId, contentObjects=contentObjects, provenance={ "connectionId": connectionId, "authority": "google", "service": "gmail", "parentId": parentSyntheticId, "externalItemId": stub["attachmentId"], "parentMessageId": messageId, }, ) ) result.attachmentsIndexed += 1 except Exception as exc: logger.warning("gmail attachment ingest %s failed: %s", stub["attachmentId"], exc) result.failed += 1 def _finalizeResult(connectionId: str, result: GmailBootstrapResult, startMs: float) -> Dict[str, Any]: durationMs = int((time.time() - startMs) * 1000) logger.info( "ingestion.connection.bootstrap.done part=gmail connectionId=%s indexed=%d skippedDup=%d skippedPolicy=%d attachments=%d failed=%d durationMs=%d", connectionId, result.indexed, result.skippedDuplicate, result.skippedPolicy, result.attachmentsIndexed, result.failed, durationMs, extra={ "event": "ingestion.connection.bootstrap.done", "part": "gmail", "connectionId": connectionId, "indexed": result.indexed, "skippedDup": result.skippedDuplicate, "skippedPolicy": result.skippedPolicy, "attachmentsIndexed": result.attachmentsIndexed, "failed": result.failed, "durationMs": durationMs, }, ) return { "connectionId": result.connectionId, "indexed": result.indexed, "skippedDuplicate": result.skippedDuplicate, "skippedPolicy": result.skippedPolicy, "attachmentsIndexed": result.attachmentsIndexed, "failed": result.failed, "durationMs": durationMs, "errors": result.errors[:20], }