# Copyright (c) 2025 Patrick Motsch
|
|
# All rights reserved.
|
|
"""Gmail bootstrap for the unified knowledge ingestion lane.
|
|
|
|
Mirrors the Outlook pilot (see subConnectorSyncOutlook.py) but talks to Google
|
|
Mail's REST API. Messages become `sourceKind="gmail_message"` virtual documents
|
|
with header / snippet / cleaned body content-objects; attachments are optional
|
|
child jobs with `sourceKind="gmail_attachment"`.
|
|
|
|
Idempotency: Gmail's stable `historyId` (or `internalDate` as fallback) is
|
|
passed as `contentVersion`, so rerunning the bootstrap yields
|
|
`ingestion.skipped.duplicate` for unchanged messages.
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import asyncio
|
|
import base64
|
|
import hashlib
|
|
import logging
|
|
import time
|
|
from dataclasses import dataclass, field
|
|
from datetime import datetime, timedelta, timezone
|
|
from typing import Any, Callable, Dict, List, Optional
|
|
|
|
from modules.serviceCenter.services.serviceKnowledge.subTextClean import cleanEmailBody
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
# Hard default cap on messages processed per bootstrap run (all labels combined).
MAX_MESSAGES_DEFAULT = 500

# Truncation limit handed to cleanEmailBody for the message body content object.
MAX_BODY_CHARS_DEFAULT = 8000

# Attachments larger than this are skipped by policy (10 MiB).
MAX_ATTACHMENT_BYTES_DEFAULT = 10 * 1024 * 1024

# Gmail label IDs enumerated when no explicit label set is configured.
DEFAULT_LABELS = ("INBOX", "SENT")
|
|
|
|
|
|
@dataclass
class GmailBootstrapLimits:
    """Tunable caps and switches for one Gmail bootstrap run."""

    # Global cap on messages processed across all labels combined.
    maxMessages: int = MAX_MESSAGES_DEFAULT
    # Gmail label IDs to enumerate, in order.
    labels: tuple[str, ...] = DEFAULT_LABELS
    # Truncation limit passed to cleanEmailBody for the "body" content object.
    maxBodyChars: int = MAX_BODY_CHARS_DEFAULT
    # When True, file attachments become child ingestion jobs.
    includeAttachments: bool = False
    # Attachments above this size are skipped (counted as skippedPolicy).
    maxAttachmentBytes: int = MAX_ATTACHMENT_BYTES_DEFAULT
    # Only fetch messages newer than N days. None disables filter.
    maxAgeDays: Optional[int] = 90
    # Content depth: "metadata" | "snippet" | "full"
    mailContentDepth: str = "full"
    # Pass-through to IngestionJob.neutralize
    neutralize: bool = False
|
|
|
|
|
|
@dataclass
class GmailBootstrapResult:
    """Mutable counters accumulated while ingesting one Gmail connection."""

    # Connection this run belongs to (echoed back in the summary payload).
    connectionId: str
    # Messages newly indexed by the knowledge service.
    indexed: int = 0
    # Messages skipped because their contentVersion matched an earlier run.
    skippedDuplicate: int = 0
    # Items rejected by local policy (missing id, oversized or empty payload).
    skippedPolicy: int = 0
    # Messages/attachments that raised or returned an error payload.
    failed: int = 0
    # Attachment child jobs successfully submitted.
    attachmentsIndexed: int = 0
    # Human-readable error strings; the summary exposes at most the first 20.
    errors: List[str] = field(default_factory=list)
|
|
|
|
|
|
def _syntheticMessageId(connectionId: str, messageId: str) -> str:
|
|
token = hashlib.sha256(f"{connectionId}:{messageId}".encode("utf-8")).hexdigest()[:16]
|
|
return f"gm:{connectionId[:8]}:{token}"
|
|
|
|
|
|
def _syntheticAttachmentId(connectionId: str, messageId: str, attachmentId: str) -> str:
|
|
token = hashlib.sha256(
|
|
f"{connectionId}:{messageId}:{attachmentId}".encode("utf-8")
|
|
).hexdigest()[:16]
|
|
return f"ga:{connectionId[:8]}:{token}"
|
|
|
|
|
|
def _decodeBase64Url(data: str) -> bytes:
|
|
if not data:
|
|
return b""
|
|
# Gmail uses URL-safe base64 without padding.
|
|
padding = 4 - (len(data) % 4)
|
|
if padding != 4:
|
|
data = data + ("=" * padding)
|
|
try:
|
|
return base64.urlsafe_b64decode(data)
|
|
except Exception:
|
|
return b""
|
|
|
|
|
|
def _walkPayloadForBody(payload: Dict[str, Any]) -> Dict[str, str]:
    """Return {"text": ..., "html": ...} by walking MIME parts.

    Gmail `payload` is a tree of parts. We prefer `text/plain` for the cleaned
    body, but capture `text/html` as a fallback so `cleanEmailBody` can strip
    markup if plain is missing. The first matching part of each kind wins.
    """
    collected: Dict[str, str] = {"text": "", "html": ""}

    # Explicit-stack preorder DFS; `reversed` keeps children in document order.
    pending: List[Dict[str, Any]] = [payload or {}]
    while pending:
        node = pending.pop()
        mime = (node.get("mimeType") or "").lower()
        encoded = (node.get("body") or {}).get("data") or ""
        if encoded and mime.startswith("text/"):
            slot = ""
            if mime == "text/plain":
                slot = "text"
            elif mime == "text/html":
                slot = "html"
            if slot and not collected[slot]:
                collected[slot] = _decodeBase64Url(encoded).decode("utf-8", errors="replace")
        pending.extend(reversed(node.get("parts") or []))

    return collected
|
|
|
|
|
|
def _headerMap(payload: Dict[str, Any]) -> Dict[str, str]:
|
|
return {
|
|
(h.get("name") or "").lower(): (h.get("value") or "")
|
|
for h in (payload.get("headers") or [])
|
|
}
|
|
|
|
|
|
def _buildContentObjects(
    message: Dict[str, Any],
    maxBodyChars: int,
    mailContentDepth: str = "full",
) -> List[Dict[str, Any]]:
    """Build content objects for a Gmail message.

    `mailContentDepth` controls how much is embedded:
    - "metadata": header only (subject, from, to, date)
    - "snippet": header + Gmail snippet (~155 chars, no full body)
    - "full": header + snippet + cleaned full body (default)
    """
    payload = message.get("payload") or {}
    headers = _headerMap(payload)

    def _textObject(objectId: str, data: str) -> Dict[str, Any]:
        # All message-level parts share the same shape; only id/data differ.
        return {
            "contentObjectId": objectId,
            "contentType": "text",
            "data": data,
            "contextRef": {"part": objectId},
        }

    # Header block: Cc line only appears when the header is present.
    headerLines = [
        f"Subject: {headers.get('subject') or '(no subject)'}",
        f"From: {headers.get('from') or ''}",
        f"To: {headers.get('to') or ''}",
    ]
    ccAddr = headers.get("cc") or ""
    if ccAddr:
        headerLines.append(f"Cc: {ccAddr}")
    headerLines.append(f"Date: {headers.get('date') or ''}")

    objects: List[Dict[str, Any]] = [_textObject("header", "\n".join(headerLines))]

    snippet = message.get("snippet") or ""
    if snippet and mailContentDepth in ("snippet", "full"):
        objects.append(_textObject("snippet", snippet))

    if mailContentDepth == "full":
        bodies = _walkPayloadForBody(payload)
        rawBody = bodies["text"] or bodies["html"]
        cleanedBody = cleanEmailBody(rawBody, maxChars=maxBodyChars) if rawBody else ""
        if cleanedBody:
            objects.append(_textObject("body", cleanedBody))

    return objects
|
|
|
|
|
|
async def bootstrapGmail(
    connectionId: str,
    *,
    progressCb: Optional[Callable[[int, Optional[str]], None]] = None,
    adapter: Any = None,
    connection: Any = None,
    knowledgeService: Any = None,
    limits: Optional[GmailBootstrapLimits] = None,
    googleGetFn: Optional[Callable[..., Any]] = None,
) -> Dict[str, Any]:
    """Enumerate Gmail labels (INBOX + SENT default) and ingest messages.

    Args:
        connectionId: UserConnection id for the Google account.
        progressCb: optional callback(percent, message), invoked periodically.
        adapter / connection / knowledgeService: injectable collaborators; any
            left as None are resolved via _resolveDependencies.
        limits: run caps; when None they are built from per-connection prefs.
        googleGetFn: async GET for Gmail REST URLs returning the parsed JSON
            dict; defaults to the provider's _googleGet bound to the adapter's
            current token.

    Returns:
        Summary dict from _finalizeResult (counters, durationMs, errors).
    """
    from modules.serviceCenter.services.serviceKnowledge.subConnectorPrefs import loadConnectionPrefs
    prefs = loadConnectionPrefs(connectionId)

    # Explicitly-passed limits win; otherwise derive them from stored prefs.
    if not limits:
        limits = GmailBootstrapLimits(
            includeAttachments=prefs.mailIndexAttachments,
            maxAgeDays=prefs.maxAgeDays if prefs.maxAgeDays > 0 else None,
            mailContentDepth=prefs.mailContentDepth,
            neutralize=prefs.neutralizeBeforeEmbed,
        )

    startMs = time.time()
    result = GmailBootstrapResult(connectionId=connectionId)

    logger.info(
        "ingestion.connection.bootstrap.started part=gmail connectionId=%s",
        connectionId,
        extra={
            "event": "ingestion.connection.bootstrap.started",
            "part": "gmail",
            "connectionId": connectionId,
        },
    )

    # Resolve whatever collaborators the caller did not inject (tests inject
    # fakes; production resolves live services).
    if adapter is None or knowledgeService is None or connection is None:
        adapter, connection, knowledgeService = await _resolveDependencies(connectionId)

    if googleGetFn is None:
        from modules.connectors.providerGoogle.connectorGoogle import _googleGet as _defaultGet

        # NOTE(review): the token is read once here; a long bootstrap
        # presumably relies on it staying valid for the whole run — confirm
        # token refresh expectations.
        token = getattr(adapter, "_token", "")

        async def googleGetFn(url: str) -> Dict[str, Any]:  # type: ignore[no-redef]
            return await _defaultGet(token, url)

    mandateId = str(getattr(connection, "mandateId", "") or "") if connection is not None else ""
    userId = str(getattr(connection, "userId", "") or "") if connection is not None else ""

    # Labels are processed in order until the global message cap is reached;
    # one failing label is recorded but does not abort the others.
    for labelId in limits.labels:
        if result.indexed + result.skippedDuplicate >= limits.maxMessages:
            break
        try:
            await _ingestLabel(
                googleGetFn=googleGetFn,
                knowledgeService=knowledgeService,
                connectionId=connectionId,
                mandateId=mandateId,
                userId=userId,
                labelId=labelId,
                limits=limits,
                result=result,
                progressCb=progressCb,
            )
        except Exception as exc:
            logger.error("gmail ingestion label %s failed: %s", labelId, exc, exc_info=True)
            result.errors.append(f"label({labelId}): {exc}")

    return _finalizeResult(connectionId, result, startMs)
|
|
|
|
|
|
async def _resolveDependencies(connectionId: str):
    """Resolve the live Gmail adapter, connection record, and knowledge service.

    Used when the caller did not inject these collaborators (the normal
    production path). Raises ValueError when the connection is unknown or no
    valid token can be obtained.
    """
    # Imports are function-local, matching the file's style for project-level
    # dependencies (presumably to avoid import cycles — confirm).
    from modules.interfaces.interfaceDbApp import getRootInterface
    from modules.auth import TokenManager
    from modules.connectors.providerGoogle.connectorGoogle import GoogleConnector
    from modules.serviceCenter import getService
    from modules.serviceCenter.context import ServiceCenterContext
    from modules.security.rootAccess import getRootUser

    rootInterface = getRootInterface()
    connection = rootInterface.getUserConnectionById(connectionId)
    if connection is None:
        raise ValueError(f"UserConnection not found: {connectionId}")

    # A fresh OAuth token with an access token is required for the adapter.
    token = TokenManager().getFreshToken(connectionId)
    if not token or not token.tokenAccess:
        raise ValueError(f"No valid token for connection {connectionId}")

    provider = GoogleConnector(connection, token.tokenAccess)
    adapter = provider.getServiceAdapter("gmail")

    # The bootstrap runs as the root user, scoped to the connection's mandate.
    rootUser = getRootUser()
    ctx = ServiceCenterContext(
        user=rootUser,
        mandate_id=str(getattr(connection, "mandateId", "") or ""),
    )
    knowledgeService = getService("knowledge", ctx)
    return adapter, connection, knowledgeService
|
|
|
|
|
|
async def _ingestLabel(
    *,
    googleGetFn,
    knowledgeService,
    connectionId: str,
    mandateId: str,
    userId: str,
    labelId: str,
    limits: GmailBootstrapLimits,
    result: GmailBootstrapResult,
    progressCb: Optional[Callable[[int, Optional[str]], None]],
) -> None:
    """Page through one label's message list and ingest each message.

    Mutates `result` in place. A failing list page is recorded in
    `result.errors` and ends this label early instead of raising; failing
    detail fetches only increment `result.failed`.
    """
    # Respect whatever budget earlier labels have already consumed.
    remaining = limits.maxMessages - (result.indexed + result.skippedDuplicate)
    if remaining <= 0:
        return

    pageSize = min(100, remaining)
    query = ""
    if limits.maxAgeDays:
        cutoff = datetime.now(timezone.utc) - timedelta(days=limits.maxAgeDays)
        # Gmail uses YYYY/MM/DD.
        query = f"after:{cutoff.strftime('%Y/%m/%d')}"

    baseUrl = (
        "https://gmail.googleapis.com/gmail/v1/users/me/messages"
        f"?labelIds={labelId}&maxResults={pageSize}"
    )
    if query:
        # NOTE(review): label/query values are interpolated without URL
        # encoding. Fine for builtin label ids and the date filter, but
        # confirm before allowing arbitrary labels or free-text queries here.
        baseUrl = f"{baseUrl}&q={query}"

    nextPageToken: Optional[str] = None
    while (result.indexed + result.skippedDuplicate) < limits.maxMessages:
        url = baseUrl if not nextPageToken else f"{baseUrl}&pageToken={nextPageToken}"
        page = await googleGetFn(url)
        if not isinstance(page, dict) or "error" in page:
            err = (page or {}).get("error") if isinstance(page, dict) else "unknown"
            logger.warning("gmail list page error for label %s: %s", labelId, err)
            result.errors.append(f"list({labelId}): {err}")
            return

        # List responses carry only id stubs; each message needs a detail GET.
        messageStubs = page.get("messages") or []
        for stub in messageStubs:
            if result.indexed + result.skippedDuplicate >= limits.maxMessages:
                break
            msgId = stub.get("id")
            if not msgId:
                continue
            detailUrl = (
                f"https://gmail.googleapis.com/gmail/v1/users/me/messages/{msgId}?format=full"
            )
            detail = await googleGetFn(detailUrl)
            if not isinstance(detail, dict) or "error" in detail:
                result.failed += 1
                continue
            await _ingestMessage(
                googleGetFn=googleGetFn,
                knowledgeService=knowledgeService,
                connectionId=connectionId,
                mandateId=mandateId,
                userId=userId,
                labelId=labelId,
                message=detail,
                limits=limits,
                result=result,
                progressCb=progressCb,
            )

        nextPageToken = page.get("nextPageToken")
        if not nextPageToken:
            break
|
|
|
|
|
async def _ingestMessage(
    *,
    googleGetFn,
    knowledgeService,
    connectionId: str,
    mandateId: str,
    userId: str,
    labelId: str,
    message: Dict[str, Any],
    limits: GmailBootstrapLimits,
    result: GmailBootstrapResult,
    progressCb: Optional[Callable[[int, Optional[str]], None]],
) -> None:
    """Submit one Gmail message (a format=full detail dict) for ingestion.

    Updates `result` counters in place, optionally fans out attachment child
    jobs, and emits progress roughly every 50 processed messages. Never
    raises: ingestion failures are counted and recorded in `result.errors`.
    """
    from modules.serviceCenter.services.serviceKnowledge.mainServiceKnowledge import IngestionJob

    messageId = message.get("id")
    if not messageId:
        result.skippedPolicy += 1
        return
    # historyId is Gmail's stable revision marker (internalDate as fallback);
    # passed as contentVersion so reruns dedupe unchanged messages.
    revision = message.get("historyId") or message.get("internalDate")
    headers = _headerMap(message.get("payload") or {})
    subject = headers.get("subject") or "(no subject)"
    syntheticId = _syntheticMessageId(connectionId, messageId)
    # NOTE(review): subject is never falsy here (defaulted just above), so the
    # messageId-based fallback file name is effectively dead code.
    fileName = f"{subject[:80].strip()}.eml" if subject else f"{messageId}.eml"

    contentObjects = _buildContentObjects(
        message, limits.maxBodyChars, mailContentDepth=limits.mailContentDepth
    )
    try:
        handle = await knowledgeService.requestIngestion(
            IngestionJob(
                sourceKind="gmail_message",
                sourceId=syntheticId,
                fileName=fileName,
                mimeType="message/rfc822",
                userId=userId,
                mandateId=mandateId,
                contentObjects=contentObjects,
                contentVersion=str(revision) if revision else None,
                neutralize=limits.neutralize,
                provenance={
                    "connectionId": connectionId,
                    "authority": "google",
                    "service": "gmail",
                    "externalItemId": messageId,
                    "label": labelId,
                    "threadId": message.get("threadId"),
                    "tier": limits.mailContentDepth,
                },
            )
        )
    except Exception as exc:
        logger.error("gmail ingestion %s failed: %s", messageId, exc, exc_info=True)
        result.failed += 1
        result.errors.append(f"ingest({messageId}): {exc}")
        return

    # Map the service's ingestion status onto the run counters.
    if handle.status == "duplicate":
        result.skippedDuplicate += 1
    elif handle.status == "indexed":
        result.indexed += 1
    else:
        result.failed += 1

    if limits.includeAttachments:
        try:
            await _ingestAttachments(
                googleGetFn=googleGetFn,
                knowledgeService=knowledgeService,
                connectionId=connectionId,
                mandateId=mandateId,
                userId=userId,
                message=message,
                parentSyntheticId=syntheticId,
                limits=limits,
                result=result,
            )
        except Exception as exc:
            # Attachment failures degrade gracefully; the message stays indexed.
            logger.warning("gmail attachments %s failed: %s", messageId, exc)
            result.errors.append(f"attachments({messageId}): {exc}")

    # Progress callback + structured log every 50 processed messages.
    if progressCb is not None and (result.indexed + result.skippedDuplicate) % 50 == 0:
        processed = result.indexed + result.skippedDuplicate
        try:
            # Percentage is clamped to the 10-90 band for the message phase.
            progressCb(
                min(90, 10 + int(80 * processed / max(1, limits.maxMessages))),
                f"gmail processed={processed}",
            )
        except Exception:
            # Progress reporting is best-effort and must never break ingestion.
            pass
        logger.info(
            "ingestion.connection.bootstrap.progress part=gmail processed=%d skippedDup=%d failed=%d",
            processed, result.skippedDuplicate, result.failed,
            extra={
                "event": "ingestion.connection.bootstrap.progress",
                "part": "gmail",
                "connectionId": connectionId,
                "processed": processed,
                "skippedDup": result.skippedDuplicate,
                "failed": result.failed,
            },
        )

    # Yield to the event loop between messages to keep the service responsive.
    await asyncio.sleep(0)
|
|
|
|
|
|
async def _ingestAttachments(
    *,
    googleGetFn,
    knowledgeService,
    connectionId: str,
    mandateId: str,
    userId: str,
    message: Dict[str, Any],
    parentSyntheticId: str,
    limits: GmailBootstrapLimits,
    result: GmailBootstrapResult,
) -> None:
    """Child ingestion jobs for file attachments. Skips inline images (cid: refs).

    Each attachment is downloaded, run through the extraction pipeline, and
    submitted as a `gmail_attachment` job linked to the parent message via
    provenance. Per-attachment failures are counted; nothing raises.
    """
    from modules.serviceCenter.services.serviceKnowledge.mainServiceKnowledge import IngestionJob
    from modules.datamodels.datamodelExtraction import ExtractionOptions
    from modules.serviceCenter.services.serviceExtraction.subPipeline import runExtraction
    from modules.serviceCenter.services.serviceExtraction.subRegistry import (
        ExtractorRegistry, ChunkerRegistry,
    )

    messageId = message.get("id") or ""

    def _collectAttachmentStubs(part: Dict[str, Any], acc: List[Dict[str, Any]]) -> None:
        # Only parts with both a filename and an attachmentId count as real
        # file attachments; inline cid: images presumably lack a filename —
        # confirm against Gmail payloads if inline parts show up indexed.
        filename = part.get("filename") or ""
        body = part.get("body") or {}
        attId = body.get("attachmentId")
        if filename and attId:
            acc.append({
                "filename": filename,
                "mimeType": part.get("mimeType") or "application/octet-stream",
                "attachmentId": attId,
                "size": int(body.get("size") or 0),
            })
        for sub in part.get("parts") or []:
            _collectAttachmentStubs(sub, acc)

    stubs: List[Dict[str, Any]] = []
    _collectAttachmentStubs(message.get("payload") or {}, stubs)
    if not stubs:
        return

    extractorRegistry = ExtractorRegistry()
    chunkerRegistry = ChunkerRegistry()

    for stub in stubs:
        # Size policy: anything over the cap is skipped, not failed.
        if stub["size"] and stub["size"] > limits.maxAttachmentBytes:
            result.skippedPolicy += 1
            continue
        attUrl = (
            f"https://gmail.googleapis.com/gmail/v1/users/me/messages/{messageId}"
            f"/attachments/{stub['attachmentId']}"
        )
        detail = await googleGetFn(attUrl)
        if not isinstance(detail, dict) or "error" in detail:
            result.failed += 1
            continue
        rawBytes = _decodeBase64Url(detail.get("data") or "")
        if not rawBytes:
            # Empty or undecodable payloads are silently skipped (no counter).
            continue
        fileName = stub["filename"]
        mimeType = stub["mimeType"]
        syntheticId = _syntheticAttachmentId(connectionId, messageId, stub["attachmentId"])

        try:
            extracted = runExtraction(
                extractorRegistry, chunkerRegistry,
                rawBytes, fileName, mimeType,
                ExtractionOptions(mergeStrategy=None),
            )
        except Exception as exc:
            logger.warning("gmail attachment extract %s failed: %s", stub["attachmentId"], exc)
            result.failed += 1
            continue

        # Convert extraction parts into ingestion content objects, dropping
        # empty parts and mapping typeGroup -> contentType.
        contentObjects: List[Dict[str, Any]] = []
        for part in getattr(extracted, "parts", None) or []:
            data = getattr(part, "data", None) or ""
            if not data or not str(data).strip():
                continue
            typeGroup = getattr(part, "typeGroup", "text") or "text"
            contentType = "text"
            if typeGroup == "image":
                contentType = "image"
            elif typeGroup in ("binary", "container"):
                contentType = "other"
            contentObjects.append({
                "contentObjectId": getattr(part, "id", ""),
                "contentType": contentType,
                "data": data,
                "contextRef": {
                    "containerPath": fileName,
                    "location": getattr(part, "label", None) or "attachment",
                    **(getattr(part, "metadata", None) or {}),
                },
            })
        if not contentObjects:
            result.skippedPolicy += 1
            continue

        # NOTE(review): unlike messages, attachment jobs pass neither
        # contentVersion nor neutralize — confirm this is intended (reruns may
        # not dedupe attachments the way messages do).
        try:
            await knowledgeService.requestIngestion(
                IngestionJob(
                    sourceKind="gmail_attachment",
                    sourceId=syntheticId,
                    fileName=fileName,
                    mimeType=mimeType,
                    userId=userId,
                    mandateId=mandateId,
                    contentObjects=contentObjects,
                    provenance={
                        "connectionId": connectionId,
                        "authority": "google",
                        "service": "gmail",
                        "parentId": parentSyntheticId,
                        "externalItemId": stub["attachmentId"],
                        "parentMessageId": messageId,
                    },
                )
            )
            result.attachmentsIndexed += 1
        except Exception as exc:
            logger.warning("gmail attachment ingest %s failed: %s", stub["attachmentId"], exc)
            result.failed += 1
|
|
|
|
|
|
def _finalizeResult(connectionId: str, result: GmailBootstrapResult, startMs: float) -> Dict[str, Any]:
|
|
durationMs = int((time.time() - startMs) * 1000)
|
|
logger.info(
|
|
"ingestion.connection.bootstrap.done part=gmail connectionId=%s indexed=%d skippedDup=%d skippedPolicy=%d attachments=%d failed=%d durationMs=%d",
|
|
connectionId,
|
|
result.indexed, result.skippedDuplicate, result.skippedPolicy,
|
|
result.attachmentsIndexed, result.failed, durationMs,
|
|
extra={
|
|
"event": "ingestion.connection.bootstrap.done",
|
|
"part": "gmail",
|
|
"connectionId": connectionId,
|
|
"indexed": result.indexed,
|
|
"skippedDup": result.skippedDuplicate,
|
|
"skippedPolicy": result.skippedPolicy,
|
|
"attachmentsIndexed": result.attachmentsIndexed,
|
|
"failed": result.failed,
|
|
"durationMs": durationMs,
|
|
},
|
|
)
|
|
return {
|
|
"connectionId": result.connectionId,
|
|
"indexed": result.indexed,
|
|
"skippedDuplicate": result.skippedDuplicate,
|
|
"skippedPolicy": result.skippedPolicy,
|
|
"attachmentsIndexed": result.attachmentsIndexed,
|
|
"failed": result.failed,
|
|
"durationMs": durationMs,
|
|
"errors": result.errors[:20],
|
|
}
|