commit before rebase
parent 6a5ff1ff7c
commit 3add5c9a80
8 changed files with 2278 additions and 18 deletions
@@ -135,6 +135,15 @@ async def _bootstrapJobHandler(
progressCb(5, f"resolving {authority} connection")

def _normalize(res: Any, label: str) -> Dict[str, Any]:
if isinstance(res, Exception):
logger.error(
"ingestion.connection.bootstrap.failed part=%s connectionId=%s error=%s",
label, connectionId, res, exc_info=res,
)
return {"error": str(res)}
return res or {}

if authority == "msft":
from modules.serviceCenter.services.serviceKnowledge.subConnectorSyncSharepoint import (
bootstrapSharepoint,
@@ -149,16 +158,6 @@ async def _bootstrapJobHandler(
bootstrapOutlook(connectionId=connectionId, progressCb=progressCb),
return_exceptions=True,
)

def _normalize(res: Any, label: str) -> Dict[str, Any]:
if isinstance(res, Exception):
logger.error(
"ingestion.connection.bootstrap.failed part=%s connectionId=%s error=%s",
label, connectionId, res, exc_info=res,
)
return {"error": str(res)}
return res or {}

return {
"connectionId": connectionId,
"authority": authority,
@@ -166,21 +165,55 @@ async def _bootstrapJobHandler(
"outlook": _normalize(olResult, "outlook"),
}

if authority == "google":
from modules.serviceCenter.services.serviceKnowledge.subConnectorSyncGdrive import (
bootstrapGdrive,
)
from modules.serviceCenter.services.serviceKnowledge.subConnectorSyncGmail import (
bootstrapGmail,
)

progressCb(10, "drive + gmail")
gdResult, gmResult = await asyncio.gather(
bootstrapGdrive(connectionId=connectionId, progressCb=progressCb),
bootstrapGmail(connectionId=connectionId, progressCb=progressCb),
return_exceptions=True,
)
return {
"connectionId": connectionId,
"authority": authority,
"drive": _normalize(gdResult, "gdrive"),
"gmail": _normalize(gmResult, "gmail"),
}

if authority == "clickup":
from modules.serviceCenter.services.serviceKnowledge.subConnectorSyncClickup import (
bootstrapClickup,
)

progressCb(10, "clickup tasks")
cuResult = await bootstrapClickup(connectionId=connectionId, progressCb=progressCb)
return {
"connectionId": connectionId,
"authority": authority,
"clickup": _normalize(cuResult, "clickup"),
}

logger.info(
"ingestion.connection.bootstrap.skipped reason=P1_pilot_scope authority=%s connectionId=%s",
"ingestion.connection.bootstrap.skipped reason=unsupported_authority authority=%s connectionId=%s",
authority, connectionId,
extra={
"event": "ingestion.connection.bootstrap.skipped",
"authority": authority,
"connectionId": connectionId,
"reason": "P1_pilot_scope",
"reason": "unsupported_authority",
},
)
return {
"connectionId": connectionId,
"authority": authority,
"skipped": True,
"reason": "P1_pilot_scope",
"reason": "unsupported_authority",
}
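The handler above fans out per-authority bootstraps with asyncio.gather(return_exceptions=True) and funnels every result through _normalize, so one failed sub-bootstrap degrades to an error dict instead of aborting the whole job. A minimal, self-contained sketch of that pattern; the fakeBootstrap* coroutines and connection id are placeholders, not part of this commit:

import asyncio
from typing import Any, Dict

async def fakeBootstrapA(connectionId: str) -> Dict[str, Any]:
    # Stands in for e.g. bootstrapGdrive / bootstrapGmail.
    return {"indexed": 3}

async def fakeBootstrapB(connectionId: str) -> Dict[str, Any]:
    # Simulates a sub-bootstrap that blows up.
    raise RuntimeError("token expired")

def _normalize(res: Any, label: str) -> Dict[str, Any]:
    # return_exceptions=True hands exceptions back as values; map them to error dicts.
    if isinstance(res, Exception):
        return {"error": str(res)}
    return res or {}

async def main() -> None:
    aResult, bResult = await asyncio.gather(
        fakeBootstrapA("conn-1"),
        fakeBootstrapB("conn-1"),
        return_exceptions=True,
    )
    # -> {'a': {'indexed': 3}, 'b': {'error': 'token expired'}}
    print({"a": _normalize(aResult, "a"), "b": _normalize(bResult, "b")})

asyncio.run(main())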
@@ -0,0 +1,489 @@
|
|||
# Copyright (c) 2025 Patrick Motsch
|
||||
# All rights reserved.
|
||||
"""ClickUp bootstrap for the unified knowledge ingestion lane.
|
||||
|
||||
ClickUp tasks are ingested as *virtual documents* — we never download file
|
||||
bytes. Each task becomes a `sourceKind="clickup_task"` IngestionJob whose
|
||||
`contentObjects` carry a summary header (name + status + metadata) and the
|
||||
task description / text content so retrieval finds them without a live API
|
||||
call.
|
||||
|
||||
Hierarchy traversal: workspace (team) → spaces → folders / folderless lists →
|
||||
tasks. We cap the fan-out with `maxWorkspaces` / `maxListsPerWorkspace` /
|
||||
`maxTasks` and skip tasks older than `maxAgeDays` (default 180 d).
|
||||
|
||||
Idempotency: `date_updated` from the ClickUp task payload is a millisecond
|
||||
timestamp and strictly monotonic per revision — used as `contentVersion`.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import hashlib
|
||||
import logging
|
||||
import time
|
||||
from dataclasses import dataclass, field
|
||||
from datetime import datetime, timedelta, timezone
|
||||
from typing import Any, Callable, Dict, List, Optional
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
MAX_TASKS_DEFAULT = 500
|
||||
MAX_WORKSPACES_DEFAULT = 3
|
||||
MAX_LISTS_PER_WORKSPACE_DEFAULT = 20
|
||||
MAX_DESCRIPTION_CHARS_DEFAULT = 8000
|
||||
MAX_AGE_DAYS_DEFAULT = 180
|
||||
|
||||
|
||||
@dataclass
|
||||
class ClickupBootstrapLimits:
|
||||
maxTasks: int = MAX_TASKS_DEFAULT
|
||||
maxWorkspaces: int = MAX_WORKSPACES_DEFAULT
|
||||
maxListsPerWorkspace: int = MAX_LISTS_PER_WORKSPACE_DEFAULT
|
||||
maxDescriptionChars: int = MAX_DESCRIPTION_CHARS_DEFAULT
|
||||
# Only ingest tasks updated within the last N days. None disables filter.
|
||||
maxAgeDays: Optional[int] = MAX_AGE_DAYS_DEFAULT
|
||||
# Include closed/archived tasks if they still meet the recency filter.
|
||||
# ClickUp `closed` tasks often carry the most useful RAG context
|
||||
# ("why was this shipped the way it was?").
|
||||
includeClosed: bool = True
|
||||
|
||||
|
||||
@dataclass
|
||||
class ClickupBootstrapResult:
|
||||
connectionId: str
|
||||
indexed: int = 0
|
||||
skippedDuplicate: int = 0
|
||||
skippedPolicy: int = 0
|
||||
failed: int = 0
|
||||
workspaces: int = 0
|
||||
lists: int = 0
|
||||
errors: List[str] = field(default_factory=list)
|
||||
|
||||
|
||||
def _syntheticTaskId(connectionId: str, taskId: str) -> str:
|
||||
token = hashlib.sha256(f"{connectionId}:{taskId}".encode("utf-8")).hexdigest()[:16]
|
||||
return f"cu:{connectionId[:8]}:{token}"
|
||||
|
||||
|
||||
def _truncate(value: Any, limit: int) -> str:
|
||||
text = str(value or "").strip()
|
||||
if not text:
|
||||
return ""
|
||||
if len(text) <= limit:
|
||||
return text
|
||||
return text[:limit].rstrip() + "\n[truncated]"
|
||||
|
||||
|
||||
def _isRecent(dateUpdatedMs: Any, maxAgeDays: Optional[int]) -> bool:
|
||||
if not maxAgeDays:
|
||||
return True
|
||||
if not dateUpdatedMs:
|
||||
return True
|
||||
try:
|
||||
ts = datetime.fromtimestamp(int(dateUpdatedMs) / 1000.0, tz=timezone.utc)
|
||||
except Exception:
|
||||
return True
|
||||
cutoff = datetime.now(timezone.utc) - timedelta(days=maxAgeDays)
|
||||
return ts >= cutoff
|
||||
|
||||
|
||||
def _buildContentObjects(task: Dict[str, Any], limits: ClickupBootstrapLimits) -> List[Dict[str, Any]]:
|
||||
"""Header (name/status/metadata) + description + text_content, all text."""
|
||||
name = task.get("name") or f"Task {task.get('id', '')}"
|
||||
status = ((task.get("status") or {}).get("status")) or ""
|
||||
assignees = ", ".join(
|
||||
filter(None, [
|
||||
(a.get("username") or a.get("email") or "")
|
||||
for a in (task.get("assignees") or [])
|
||||
])
|
||||
)
|
||||
tags = ", ".join(filter(None, [t.get("name", "") for t in (task.get("tags") or [])]))
|
||||
listInfo = task.get("list") or {}
|
||||
folderInfo = task.get("folder") or {}
|
||||
spaceInfo = task.get("space") or {}
|
||||
dueMs = task.get("due_date")
|
||||
dueIso = ""
|
||||
if dueMs:
|
||||
try:
|
||||
dueIso = datetime.fromtimestamp(int(dueMs) / 1000.0, tz=timezone.utc).strftime("%Y-%m-%d")
|
||||
except Exception:
|
||||
dueIso = ""
|
||||
|
||||
headerLines = [
|
||||
f"Task: {name}",
|
||||
f"Status: {status}" if status else "",
|
||||
f"List: {listInfo.get('name', '')}" if listInfo else "",
|
||||
f"Folder: {folderInfo.get('name', '')}" if folderInfo else "",
|
||||
f"Space: {spaceInfo.get('name', '')}" if spaceInfo else "",
|
||||
f"Assignees: {assignees}" if assignees else "",
|
||||
f"Tags: {tags}" if tags else "",
|
||||
f"Due: {dueIso}" if dueIso else "",
|
||||
f"Url: {task.get('url', '')}" if task.get("url") else "",
|
||||
]
|
||||
header = "\n".join(line for line in headerLines if line)
|
||||
|
||||
parts: List[Dict[str, Any]] = [{
|
||||
"contentObjectId": "header",
|
||||
"contentType": "text",
|
||||
"data": header,
|
||||
"contextRef": {"part": "header"},
|
||||
}]
|
||||
|
||||
description = _truncate(task.get("description"), limits.maxDescriptionChars)
|
||||
if description:
|
||||
parts.append({
|
||||
"contentObjectId": "description",
|
||||
"contentType": "text",
|
||||
"data": description,
|
||||
"contextRef": {"part": "description"},
|
||||
})
|
||||
# text_content is ClickUp's rendered-markdown version; include if it adds
|
||||
# something beyond the plain description (common for bullet lists, checklists).
|
||||
textContent = _truncate(task.get("text_content"), limits.maxDescriptionChars)
|
||||
if textContent and textContent != description:
|
||||
parts.append({
|
||||
"contentObjectId": "text_content",
|
||||
"contentType": "text",
|
||||
"data": textContent,
|
||||
"contextRef": {"part": "text_content"},
|
||||
})
|
||||
return parts
|
||||
|
||||
|
||||
async def bootstrapClickup(
|
||||
connectionId: str,
|
||||
*,
|
||||
progressCb: Optional[Callable[[int, Optional[str]], None]] = None,
|
||||
adapter: Any = None,
|
||||
connection: Any = None,
|
||||
knowledgeService: Any = None,
|
||||
limits: Optional[ClickupBootstrapLimits] = None,
|
||||
) -> Dict[str, Any]:
|
||||
"""Walk workspaces → lists → tasks and ingest each task as a virtual doc."""
|
||||
limits = limits or ClickupBootstrapLimits()
|
||||
startMs = time.time()
|
||||
result = ClickupBootstrapResult(connectionId=connectionId)
|
||||
|
||||
logger.info(
|
||||
"ingestion.connection.bootstrap.started part=clickup connectionId=%s",
|
||||
connectionId,
|
||||
extra={
|
||||
"event": "ingestion.connection.bootstrap.started",
|
||||
"part": "clickup",
|
||||
"connectionId": connectionId,
|
||||
},
|
||||
)
|
||||
|
||||
if adapter is None or knowledgeService is None or connection is None:
|
||||
adapter, connection, knowledgeService = await _resolveDependencies(connectionId)
|
||||
|
||||
mandateId = str(getattr(connection, "mandateId", "") or "") if connection is not None else ""
|
||||
userId = str(getattr(connection, "userId", "") or "") if connection is not None else ""
|
||||
|
||||
svc = getattr(adapter, "_svc", None)
|
||||
if svc is None:
|
||||
result.errors.append("adapter missing _svc instance")
|
||||
return _finalizeResult(connectionId, result, startMs)
|
||||
|
||||
try:
|
||||
teamsResp = await svc.getAuthorizedTeams()
|
||||
except Exception as exc:
|
||||
logger.error("clickup team discovery failed for %s: %s", connectionId, exc, exc_info=True)
|
||||
result.errors.append(f"teams: {exc}")
|
||||
return _finalizeResult(connectionId, result, startMs)
|
||||
|
||||
teams = (teamsResp or {}).get("teams") or []
|
||||
for team in teams[: limits.maxWorkspaces]:
|
||||
if result.indexed + result.skippedDuplicate >= limits.maxTasks:
|
||||
break
|
||||
teamId = str(team.get("id", "") or "")
|
||||
if not teamId:
|
||||
continue
|
||||
result.workspaces += 1
|
||||
try:
|
||||
await _walkTeam(
|
||||
svc=svc,
|
||||
knowledgeService=knowledgeService,
|
||||
connectionId=connectionId,
|
||||
mandateId=mandateId,
|
||||
userId=userId,
|
||||
team=team,
|
||||
limits=limits,
|
||||
result=result,
|
||||
progressCb=progressCb,
|
||||
)
|
||||
except Exception as exc:
|
||||
logger.error("clickup team %s walk failed: %s", teamId, exc, exc_info=True)
|
||||
result.errors.append(f"team({teamId}): {exc}")
|
||||
|
||||
return _finalizeResult(connectionId, result, startMs)
|
||||
|
||||
|
||||
async def _resolveDependencies(connectionId: str):
|
||||
from modules.interfaces.interfaceDbApp import getRootInterface
|
||||
from modules.auth import TokenManager
|
||||
from modules.connectors.providerClickup.connectorClickup import ClickupConnector
|
||||
from modules.serviceCenter import getService
|
||||
from modules.serviceCenter.context import ServiceCenterContext
|
||||
from modules.security.rootAccess import getRootUser
|
||||
|
||||
rootInterface = getRootInterface()
|
||||
connection = rootInterface.getUserConnectionById(connectionId)
|
||||
if connection is None:
|
||||
raise ValueError(f"UserConnection not found: {connectionId}")
|
||||
|
||||
token = TokenManager().getFreshToken(connectionId)
|
||||
if not token or not token.tokenAccess:
|
||||
raise ValueError(f"No valid token for connection {connectionId}")
|
||||
|
||||
provider = ClickupConnector(connection, token.tokenAccess)
|
||||
adapter = provider.getServiceAdapter("clickup")
|
||||
|
||||
rootUser = getRootUser()
|
||||
ctx = ServiceCenterContext(
|
||||
user=rootUser,
|
||||
mandate_id=str(getattr(connection, "mandateId", "") or ""),
|
||||
)
|
||||
knowledgeService = getService("knowledge", ctx)
|
||||
return adapter, connection, knowledgeService
|
||||
|
||||
|
||||
async def _walkTeam(
|
||||
*,
|
||||
svc,
|
||||
knowledgeService,
|
||||
connectionId: str,
|
||||
mandateId: str,
|
||||
userId: str,
|
||||
team: Dict[str, Any],
|
||||
limits: ClickupBootstrapLimits,
|
||||
result: ClickupBootstrapResult,
|
||||
progressCb: Optional[Callable[[int, Optional[str]], None]],
|
||||
) -> None:
|
||||
teamId = str(team.get("id", "") or "")
|
||||
spacesResp = await svc.getSpaces(teamId)
|
||||
spaces = (spacesResp or {}).get("spaces") or []
|
||||
|
||||
listsCollected: List[Dict[str, Any]] = []
|
||||
for space in spaces:
|
||||
if len(listsCollected) >= limits.maxListsPerWorkspace:
|
||||
break
|
||||
spaceId = str(space.get("id", "") or "")
|
||||
if not spaceId:
|
||||
continue
|
||||
|
||||
# Folderless lists directly under the space
|
||||
folderless = await svc.getFolderlessLists(spaceId)
|
||||
for lst in (folderless or {}).get("lists") or []:
|
||||
if len(listsCollected) >= limits.maxListsPerWorkspace:
|
||||
break
|
||||
listsCollected.append({**lst, "_space": space})
|
||||
|
||||
# Lists inside folders
|
||||
foldersResp = await svc.getFolders(spaceId)
|
||||
for folder in (foldersResp or {}).get("folders") or []:
|
||||
if len(listsCollected) >= limits.maxListsPerWorkspace:
|
||||
break
|
||||
folderId = str(folder.get("id", "") or "")
|
||||
if not folderId:
|
||||
continue
|
||||
folderLists = await svc.getListsInFolder(folderId)
|
||||
for lst in (folderLists or {}).get("lists") or []:
|
||||
if len(listsCollected) >= limits.maxListsPerWorkspace:
|
||||
break
|
||||
listsCollected.append({**lst, "_space": space, "_folder": folder})
|
||||
|
||||
for lst in listsCollected:
|
||||
if result.indexed + result.skippedDuplicate >= limits.maxTasks:
|
||||
return
|
||||
result.lists += 1
|
||||
await _walkList(
|
||||
svc=svc,
|
||||
knowledgeService=knowledgeService,
|
||||
connectionId=connectionId,
|
||||
mandateId=mandateId,
|
||||
userId=userId,
|
||||
teamId=teamId,
|
||||
lst=lst,
|
||||
limits=limits,
|
||||
result=result,
|
||||
progressCb=progressCb,
|
||||
)
|
||||
|
||||
|
||||
async def _walkList(
|
||||
*,
|
||||
svc,
|
||||
knowledgeService,
|
||||
connectionId: str,
|
||||
mandateId: str,
|
||||
userId: str,
|
||||
teamId: str,
|
||||
lst: Dict[str, Any],
|
||||
limits: ClickupBootstrapLimits,
|
||||
result: ClickupBootstrapResult,
|
||||
progressCb: Optional[Callable[[int, Optional[str]], None]],
|
||||
) -> None:
|
||||
listId = str(lst.get("id", "") or "")
|
||||
if not listId:
|
||||
return
|
||||
page = 0
|
||||
while result.indexed + result.skippedDuplicate < limits.maxTasks:
|
||||
resp = await svc.getTasksInList(
|
||||
listId,
|
||||
page=page,
|
||||
include_closed=limits.includeClosed,
|
||||
subtasks=True,
|
||||
)
|
||||
if isinstance(resp, dict) and resp.get("error"):
|
||||
logger.warning("clickup tasks list=%s page=%d error: %s", listId, page, resp.get("error"))
|
||||
result.errors.append(f"list({listId}): {resp.get('error')}")
|
||||
return
|
||||
tasks = (resp or {}).get("tasks") or []
|
||||
if not tasks:
|
||||
return
|
||||
|
||||
for task in tasks:
|
||||
if result.indexed + result.skippedDuplicate >= limits.maxTasks:
|
||||
return
|
||||
if not _isRecent(task.get("date_updated"), limits.maxAgeDays):
|
||||
result.skippedPolicy += 1
|
||||
continue
|
||||
# Inject the list/folder/space metadata we already loaded.
|
||||
task["list"] = task.get("list") or {"id": listId, "name": lst.get("name")}
|
||||
task["folder"] = task.get("folder") or lst.get("_folder") or {}
|
||||
task["space"] = task.get("space") or lst.get("_space") or {}
|
||||
await _ingestTask(
|
||||
knowledgeService=knowledgeService,
|
||||
connectionId=connectionId,
|
||||
mandateId=mandateId,
|
||||
userId=userId,
|
||||
teamId=teamId,
|
||||
task=task,
|
||||
limits=limits,
|
||||
result=result,
|
||||
progressCb=progressCb,
|
||||
)
|
||||
|
||||
if len(tasks) < 100: # ClickUp page-size hint: fewer than 100 => last page
|
||||
return
|
||||
page += 1
|
||||
|
||||
|
||||
async def _ingestTask(
|
||||
*,
|
||||
knowledgeService,
|
||||
connectionId: str,
|
||||
mandateId: str,
|
||||
userId: str,
|
||||
teamId: str,
|
||||
task: Dict[str, Any],
|
||||
limits: ClickupBootstrapLimits,
|
||||
result: ClickupBootstrapResult,
|
||||
progressCb: Optional[Callable[[int, Optional[str]], None]],
|
||||
) -> None:
|
||||
from modules.serviceCenter.services.serviceKnowledge.mainServiceKnowledge import IngestionJob
|
||||
|
||||
taskId = str(task.get("id", "") or "")
|
||||
if not taskId:
|
||||
result.skippedPolicy += 1
|
||||
return
|
||||
revision = str(task.get("date_updated") or task.get("date_created") or "")
|
||||
name = task.get("name") or f"Task {taskId}"
|
||||
syntheticId = _syntheticTaskId(connectionId, taskId)
|
||||
fileName = f"{name[:80].strip() or taskId}.task.json"
|
||||
|
||||
contentObjects = _buildContentObjects(task, limits)
|
||||
|
||||
try:
|
||||
handle = await knowledgeService.requestIngestion(
|
||||
IngestionJob(
|
||||
sourceKind="clickup_task",
|
||||
sourceId=syntheticId,
|
||||
fileName=fileName,
|
||||
mimeType="application/vnd.clickup.task+json",
|
||||
userId=userId,
|
||||
mandateId=mandateId,
|
||||
contentObjects=contentObjects,
|
||||
contentVersion=revision or None,
|
||||
provenance={
|
||||
"connectionId": connectionId,
|
||||
"authority": "clickup",
|
||||
"service": "clickup",
|
||||
"externalItemId": taskId,
|
||||
"teamId": teamId,
|
||||
"listId": ((task.get("list") or {}).get("id")),
|
||||
"spaceId": ((task.get("space") or {}).get("id")),
|
||||
"url": task.get("url"),
|
||||
"status": ((task.get("status") or {}).get("status")),
|
||||
"tier": "body",
|
||||
},
|
||||
)
|
||||
)
|
||||
except Exception as exc:
|
||||
logger.error("clickup ingestion %s failed: %s", taskId, exc, exc_info=True)
|
||||
result.failed += 1
|
||||
result.errors.append(f"ingest({taskId}): {exc}")
|
||||
return
|
||||
|
||||
if handle.status == "duplicate":
|
||||
result.skippedDuplicate += 1
|
||||
elif handle.status == "indexed":
|
||||
result.indexed += 1
|
||||
else:
|
||||
result.failed += 1
|
||||
|
||||
if progressCb is not None and (result.indexed + result.skippedDuplicate) % 50 == 0:
|
||||
processed = result.indexed + result.skippedDuplicate
|
||||
try:
|
||||
progressCb(
|
||||
min(90, 10 + int(80 * processed / max(1, limits.maxTasks))),
|
||||
f"clickup processed={processed}",
|
||||
)
|
||||
except Exception:
|
||||
pass
|
||||
logger.info(
|
||||
"ingestion.connection.bootstrap.progress part=clickup processed=%d skippedDup=%d failed=%d",
|
||||
processed, result.skippedDuplicate, result.failed,
|
||||
extra={
|
||||
"event": "ingestion.connection.bootstrap.progress",
|
||||
"part": "clickup",
|
||||
"connectionId": connectionId,
|
||||
"processed": processed,
|
||||
"skippedDup": result.skippedDuplicate,
|
||||
"failed": result.failed,
|
||||
},
|
||||
)
|
||||
|
||||
|
||||
def _finalizeResult(connectionId: str, result: ClickupBootstrapResult, startMs: float) -> Dict[str, Any]:
|
||||
durationMs = int((time.time() - startMs) * 1000)
|
||||
logger.info(
|
||||
"ingestion.connection.bootstrap.done part=clickup connectionId=%s indexed=%d skippedDup=%d skippedPolicy=%d failed=%d workspaces=%d lists=%d durationMs=%d",
|
||||
connectionId,
|
||||
result.indexed, result.skippedDuplicate, result.skippedPolicy,
|
||||
result.failed, result.workspaces, result.lists, durationMs,
|
||||
extra={
|
||||
"event": "ingestion.connection.bootstrap.done",
|
||||
"part": "clickup",
|
||||
"connectionId": connectionId,
|
||||
"indexed": result.indexed,
|
||||
"skippedDup": result.skippedDuplicate,
|
||||
"skippedPolicy": result.skippedPolicy,
|
||||
"failed": result.failed,
|
||||
"workspaces": result.workspaces,
|
||||
"lists": result.lists,
|
||||
"durationMs": durationMs,
|
||||
},
|
||||
)
|
||||
return {
|
||||
"connectionId": result.connectionId,
|
||||
"indexed": result.indexed,
|
||||
"skippedDuplicate": result.skippedDuplicate,
|
||||
"skippedPolicy": result.skippedPolicy,
|
||||
"failed": result.failed,
|
||||
"workspaces": result.workspaces,
|
||||
"lists": result.lists,
|
||||
"durationMs": durationMs,
|
||||
"errors": result.errors[:20],
|
||||
}
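A possible caller-side sketch for the module above, not part of the commit: invoke bootstrapClickup with tightened limits and a progress callback. The connection id is a placeholder, and the call assumes a resolvable UserConnection plus a fresh token since dependencies are resolved internally; rerunning it should mostly report skippedDuplicate thanks to the date_updated contentVersion.

import asyncio
from typing import Optional

from modules.serviceCenter.services.serviceKnowledge.subConnectorSyncClickup import (
    bootstrapClickup,
    ClickupBootstrapLimits,
)

def onProgress(percent: int, note: Optional[str]) -> None:
    print(f"[{percent:3d}%] {note or ''}")

async def main() -> None:
    summary = await bootstrapClickup(
        "conn-123",  # placeholder connection id
        progressCb=onProgress,
        # Hypothetical tightened caps for a first smoke run.
        limits=ClickupBootstrapLimits(maxTasks=50, maxWorkspaces=1, maxAgeDays=30),
    )
    print(summary["indexed"], summary["skippedDuplicate"], summary["errors"][:3])

asyncio.run(main())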
@@ -0,0 +1,429 @@
|
|||
# Copyright (c) 2025 Patrick Motsch
|
||||
# All rights reserved.
|
||||
"""Google Drive bootstrap for the unified knowledge ingestion lane.
|
||||
|
||||
Mirrors the SharePoint pilot (see subConnectorSyncSharepoint.py). Walks the
|
||||
user's *My Drive* tree from the virtual `root` folder, downloads each
|
||||
file-like item via `DriveAdapter.download` (which handles native Google docs
|
||||
via export), runs the standard extraction pipeline and routes results through
|
||||
`KnowledgeService.requestIngestion` with `sourceKind="gdrive_item"` and
|
||||
`contentVersion = modifiedTime` (monotonic per-revision).
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import hashlib
|
||||
import logging
|
||||
import time
|
||||
from dataclasses import dataclass, field
|
||||
from datetime import datetime, timedelta, timezone
|
||||
from typing import Any, Callable, Dict, List, Optional
|
||||
|
||||
from modules.datamodels.datamodelExtraction import ExtractionOptions
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
MAX_ITEMS_DEFAULT = 500
|
||||
MAX_BYTES_DEFAULT = 200 * 1024 * 1024
|
||||
MAX_FILE_SIZE_DEFAULT = 25 * 1024 * 1024
|
||||
SKIP_MIME_PREFIXES_DEFAULT = ("video/", "audio/")
|
||||
MAX_DEPTH_DEFAULT = 4
|
||||
MAX_AGE_DAYS_DEFAULT = 365
|
||||
|
||||
# Google Drive uses virtual mime-types for folders and non-downloadable assets.
|
||||
FOLDER_MIME = "application/vnd.google-apps.folder"
|
||||
|
||||
|
||||
@dataclass
|
||||
class GdriveBootstrapLimits:
|
||||
maxItems: int = MAX_ITEMS_DEFAULT
|
||||
maxBytes: int = MAX_BYTES_DEFAULT
|
||||
maxFileSize: int = MAX_FILE_SIZE_DEFAULT
|
||||
skipMimePrefixes: tuple = SKIP_MIME_PREFIXES_DEFAULT
|
||||
maxDepth: int = MAX_DEPTH_DEFAULT
|
||||
# Only ingest files modified within the last N days. None disables filter.
|
||||
maxAgeDays: Optional[int] = MAX_AGE_DAYS_DEFAULT
|
||||
|
||||
|
||||
@dataclass
|
||||
class GdriveBootstrapResult:
|
||||
connectionId: str
|
||||
indexed: int = 0
|
||||
skippedDuplicate: int = 0
|
||||
skippedPolicy: int = 0
|
||||
failed: int = 0
|
||||
bytesProcessed: int = 0
|
||||
errors: List[str] = field(default_factory=list)
|
||||
|
||||
|
||||
def _syntheticFileId(connectionId: str, externalItemId: str) -> str:
|
||||
token = hashlib.sha256(f"{connectionId}:{externalItemId}".encode("utf-8")).hexdigest()[:16]
|
||||
return f"gd:{connectionId[:8]}:{token}"
|
||||
|
||||
|
||||
def _toContentObjects(extracted, fileName: str) -> List[Dict[str, Any]]:
|
||||
parts = getattr(extracted, "parts", None) or []
|
||||
out: List[Dict[str, Any]] = []
|
||||
for part in parts:
|
||||
data = getattr(part, "data", None) or ""
|
||||
if not data or not str(data).strip():
|
||||
continue
|
||||
typeGroup = getattr(part, "typeGroup", "text") or "text"
|
||||
contentType = "text"
|
||||
if typeGroup == "image":
|
||||
contentType = "image"
|
||||
elif typeGroup in ("binary", "container"):
|
||||
contentType = "other"
|
||||
out.append({
|
||||
"contentObjectId": getattr(part, "id", ""),
|
||||
"contentType": contentType,
|
||||
"data": data,
|
||||
"contextRef": {
|
||||
"containerPath": fileName,
|
||||
"location": getattr(part, "label", None) or "file",
|
||||
**(getattr(part, "metadata", None) or {}),
|
||||
},
|
||||
})
|
||||
return out
|
||||
|
||||
|
||||
def _isRecent(modifiedIso: Optional[str], maxAgeDays: Optional[int]) -> bool:
|
||||
if not maxAgeDays:
|
||||
return True
|
||||
if not modifiedIso:
|
||||
# No timestamp -> be permissive (Drive native docs sometimes omit it on export).
|
||||
return True
|
||||
try:
|
||||
# Google returns RFC 3339 with `Z` or offset; python 3.11+ parses both.
|
||||
ts = datetime.fromisoformat(modifiedIso.replace("Z", "+00:00"))
|
||||
except Exception:
|
||||
return True
|
||||
cutoff = datetime.now(timezone.utc) - timedelta(days=maxAgeDays)
|
||||
if ts.tzinfo is None:
|
||||
ts = ts.replace(tzinfo=timezone.utc)
|
||||
return ts >= cutoff
|
||||
|
||||
|
||||
async def bootstrapGdrive(
|
||||
connectionId: str,
|
||||
*,
|
||||
progressCb: Optional[Callable[[int, Optional[str]], None]] = None,
|
||||
adapter: Any = None,
|
||||
connection: Any = None,
|
||||
knowledgeService: Any = None,
|
||||
limits: Optional[GdriveBootstrapLimits] = None,
|
||||
runExtractionFn: Optional[Callable[..., Any]] = None,
|
||||
) -> Dict[str, Any]:
|
||||
"""Walk My Drive starting from the virtual root folder."""
|
||||
limits = limits or GdriveBootstrapLimits()
|
||||
startMs = time.time()
|
||||
result = GdriveBootstrapResult(connectionId=connectionId)
|
||||
|
||||
logger.info(
|
||||
"ingestion.connection.bootstrap.started part=gdrive connectionId=%s",
|
||||
connectionId,
|
||||
extra={
|
||||
"event": "ingestion.connection.bootstrap.started",
|
||||
"part": "gdrive",
|
||||
"connectionId": connectionId,
|
||||
},
|
||||
)
|
||||
|
||||
if adapter is None or knowledgeService is None or connection is None:
|
||||
adapter, connection, knowledgeService = await _resolveDependencies(connectionId)
|
||||
if runExtractionFn is None:
|
||||
from modules.serviceCenter.services.serviceExtraction.subPipeline import runExtraction
|
||||
from modules.serviceCenter.services.serviceExtraction.subRegistry import (
|
||||
ExtractorRegistry, ChunkerRegistry,
|
||||
)
|
||||
extractorRegistry = ExtractorRegistry()
|
||||
chunkerRegistry = ChunkerRegistry()
|
||||
|
||||
def runExtractionFn(bytesData, name, mime, options): # type: ignore[no-redef]
|
||||
return runExtraction(extractorRegistry, chunkerRegistry, bytesData, name, mime, options)
|
||||
|
||||
mandateId = str(getattr(connection, "mandateId", "") or "") if connection is not None else ""
|
||||
userId = str(getattr(connection, "userId", "") or "") if connection is not None else ""
|
||||
|
||||
try:
|
||||
await _walkFolder(
|
||||
adapter=adapter,
|
||||
knowledgeService=knowledgeService,
|
||||
runExtractionFn=runExtractionFn,
|
||||
connectionId=connectionId,
|
||||
mandateId=mandateId,
|
||||
userId=userId,
|
||||
folderPath="/", # DriveAdapter.browse maps "" / "/" -> "root"
|
||||
depth=0,
|
||||
limits=limits,
|
||||
result=result,
|
||||
progressCb=progressCb,
|
||||
)
|
||||
except Exception as exc:
|
||||
logger.error("gdrive walk failed for %s: %s", connectionId, exc, exc_info=True)
|
||||
result.errors.append(f"walk: {exc}")
|
||||
|
||||
return _finalizeResult(connectionId, result, startMs)
|
||||
|
||||
|
||||
async def _resolveDependencies(connectionId: str):
|
||||
from modules.interfaces.interfaceDbApp import getRootInterface
|
||||
from modules.auth import TokenManager
|
||||
from modules.connectors.providerGoogle.connectorGoogle import GoogleConnector
|
||||
from modules.serviceCenter import getService
|
||||
from modules.serviceCenter.context import ServiceCenterContext
|
||||
from modules.security.rootAccess import getRootUser
|
||||
|
||||
rootInterface = getRootInterface()
|
||||
connection = rootInterface.getUserConnectionById(connectionId)
|
||||
if connection is None:
|
||||
raise ValueError(f"UserConnection not found: {connectionId}")
|
||||
|
||||
token = TokenManager().getFreshToken(connectionId)
|
||||
if not token or not token.tokenAccess:
|
||||
raise ValueError(f"No valid token for connection {connectionId}")
|
||||
|
||||
provider = GoogleConnector(connection, token.tokenAccess)
|
||||
adapter = provider.getServiceAdapter("drive")
|
||||
|
||||
rootUser = getRootUser()
|
||||
ctx = ServiceCenterContext(
|
||||
user=rootUser,
|
||||
mandate_id=str(getattr(connection, "mandateId", "") or ""),
|
||||
)
|
||||
knowledgeService = getService("knowledge", ctx)
|
||||
return adapter, connection, knowledgeService
|
||||
|
||||
|
||||
async def _walkFolder(
|
||||
*,
|
||||
adapter,
|
||||
knowledgeService,
|
||||
runExtractionFn,
|
||||
connectionId: str,
|
||||
mandateId: str,
|
||||
userId: str,
|
||||
folderPath: str,
|
||||
depth: int,
|
||||
limits: GdriveBootstrapLimits,
|
||||
result: GdriveBootstrapResult,
|
||||
progressCb: Optional[Callable[[int, Optional[str]], None]],
|
||||
) -> None:
|
||||
if depth > limits.maxDepth:
|
||||
return
|
||||
try:
|
||||
entries = await adapter.browse(folderPath)
|
||||
except Exception as exc:
|
||||
logger.warning("gdrive browse %s failed: %s", folderPath, exc)
|
||||
result.errors.append(f"browse({folderPath}): {exc}")
|
||||
return
|
||||
|
||||
for entry in entries:
|
||||
if result.indexed + result.skippedDuplicate >= limits.maxItems:
|
||||
return
|
||||
if result.bytesProcessed >= limits.maxBytes:
|
||||
return
|
||||
|
||||
entryPath = getattr(entry, "path", "") or ""
|
||||
metadata = getattr(entry, "metadata", {}) or {}
|
||||
mimeType = getattr(entry, "mimeType", None) or metadata.get("mimeType")
|
||||
|
||||
if getattr(entry, "isFolder", False) or mimeType == FOLDER_MIME:
|
||||
await _walkFolder(
|
||||
adapter=adapter,
|
||||
knowledgeService=knowledgeService,
|
||||
runExtractionFn=runExtractionFn,
|
||||
connectionId=connectionId,
|
||||
mandateId=mandateId,
|
||||
userId=userId,
|
||||
folderPath=entryPath,
|
||||
depth=depth + 1,
|
||||
limits=limits,
|
||||
result=result,
|
||||
progressCb=progressCb,
|
||||
)
|
||||
continue
|
||||
|
||||
effectiveMime = mimeType or "application/octet-stream"
|
||||
if any(effectiveMime.startswith(prefix) for prefix in limits.skipMimePrefixes):
|
||||
result.skippedPolicy += 1
|
||||
continue
|
||||
size = int(getattr(entry, "size", 0) or 0)
|
||||
if size and size > limits.maxFileSize:
|
||||
result.skippedPolicy += 1
|
||||
continue
|
||||
modifiedTime = metadata.get("modifiedTime")
|
||||
if not _isRecent(modifiedTime, limits.maxAgeDays):
|
||||
result.skippedPolicy += 1
|
||||
continue
|
||||
|
||||
externalItemId = metadata.get("id") or entryPath
|
||||
revision = modifiedTime
|
||||
|
||||
await _ingestOne(
|
||||
adapter=adapter,
|
||||
knowledgeService=knowledgeService,
|
||||
runExtractionFn=runExtractionFn,
|
||||
connectionId=connectionId,
|
||||
mandateId=mandateId,
|
||||
userId=userId,
|
||||
entry=entry,
|
||||
entryPath=entryPath,
|
||||
mimeType=effectiveMime,
|
||||
externalItemId=externalItemId,
|
||||
revision=revision,
|
||||
limits=limits,
|
||||
result=result,
|
||||
progressCb=progressCb,
|
||||
)
|
||||
|
||||
|
||||
async def _ingestOne(
|
||||
*,
|
||||
adapter,
|
||||
knowledgeService,
|
||||
runExtractionFn,
|
||||
connectionId: str,
|
||||
mandateId: str,
|
||||
userId: str,
|
||||
entry,
|
||||
entryPath: str,
|
||||
mimeType: str,
|
||||
externalItemId: str,
|
||||
revision: Optional[str],
|
||||
limits: GdriveBootstrapLimits,
|
||||
result: GdriveBootstrapResult,
|
||||
progressCb: Optional[Callable[[int, Optional[str]], None]],
|
||||
) -> None:
|
||||
from modules.serviceCenter.services.serviceKnowledge.mainServiceKnowledge import IngestionJob
|
||||
|
||||
syntheticFileId = _syntheticFileId(connectionId, externalItemId)
|
||||
fileName = getattr(entry, "name", "") or externalItemId
|
||||
|
||||
try:
|
||||
downloaded = await adapter.download(entryPath)
|
||||
except Exception as exc:
|
||||
logger.warning("gdrive download %s failed: %s", entryPath, exc)
|
||||
result.failed += 1
|
||||
result.errors.append(f"download({entryPath}): {exc}")
|
||||
return
|
||||
|
||||
# Adapter.download returns raw bytes today; guard DownloadResult shape too.
|
||||
fileBytes: bytes
|
||||
if isinstance(downloaded, (bytes, bytearray)):
|
||||
fileBytes = bytes(downloaded)
|
||||
else:
|
||||
fileBytes = bytes(getattr(downloaded, "data", b"") or b"")
|
||||
if getattr(downloaded, "mimeType", None):
|
||||
mimeType = downloaded.mimeType # export may have changed the type
|
||||
if not fileBytes:
|
||||
result.failed += 1
|
||||
return
|
||||
if len(fileBytes) > limits.maxFileSize:
|
||||
result.skippedPolicy += 1
|
||||
return
|
||||
|
||||
result.bytesProcessed += len(fileBytes)
|
||||
|
||||
try:
|
||||
extracted = runExtractionFn(
|
||||
fileBytes, fileName, mimeType,
|
||||
ExtractionOptions(mergeStrategy=None),
|
||||
)
|
||||
except Exception as exc:
|
||||
logger.warning("gdrive extraction %s failed: %s", entryPath, exc)
|
||||
result.failed += 1
|
||||
result.errors.append(f"extract({entryPath}): {exc}")
|
||||
return
|
||||
|
||||
contentObjects = _toContentObjects(extracted, fileName)
|
||||
if not contentObjects:
|
||||
result.skippedPolicy += 1
|
||||
return
|
||||
|
||||
try:
|
||||
handle = await knowledgeService.requestIngestion(
|
||||
IngestionJob(
|
||||
sourceKind="gdrive_item",
|
||||
sourceId=syntheticFileId,
|
||||
fileName=fileName,
|
||||
mimeType=mimeType,
|
||||
userId=userId,
|
||||
mandateId=mandateId,
|
||||
contentObjects=contentObjects,
|
||||
contentVersion=revision,
|
||||
provenance={
|
||||
"connectionId": connectionId,
|
||||
"authority": "google",
|
||||
"service": "drive",
|
||||
"externalItemId": externalItemId,
|
||||
"entryPath": entryPath,
|
||||
"tier": "body",
|
||||
},
|
||||
)
|
||||
)
|
||||
except Exception as exc:
|
||||
logger.error("gdrive ingestion %s failed: %s", entryPath, exc, exc_info=True)
|
||||
result.failed += 1
|
||||
result.errors.append(f"ingest({entryPath}): {exc}")
|
||||
return
|
||||
|
||||
if handle.status == "duplicate":
|
||||
result.skippedDuplicate += 1
|
||||
elif handle.status == "indexed":
|
||||
result.indexed += 1
|
||||
else:
|
||||
result.failed += 1
|
||||
|
||||
if progressCb is not None and (result.indexed + result.skippedDuplicate) % 50 == 0:
|
||||
processed = result.indexed + result.skippedDuplicate
|
||||
try:
|
||||
progressCb(
|
||||
min(90, 10 + int(80 * processed / max(1, limits.maxItems))),
|
||||
f"gdrive processed={processed}",
|
||||
)
|
||||
except Exception:
|
||||
pass
|
||||
logger.info(
|
||||
"ingestion.connection.bootstrap.progress part=gdrive processed=%d skippedDup=%d failed=%d",
|
||||
processed, result.skippedDuplicate, result.failed,
|
||||
extra={
|
||||
"event": "ingestion.connection.bootstrap.progress",
|
||||
"part": "gdrive",
|
||||
"connectionId": connectionId,
|
||||
"processed": processed,
|
||||
"skippedDup": result.skippedDuplicate,
|
||||
"failed": result.failed,
|
||||
},
|
||||
)
|
||||
|
||||
|
||||
def _finalizeResult(connectionId: str, result: GdriveBootstrapResult, startMs: float) -> Dict[str, Any]:
|
||||
durationMs = int((time.time() - startMs) * 1000)
|
||||
logger.info(
|
||||
"ingestion.connection.bootstrap.done part=gdrive connectionId=%s indexed=%d skippedDup=%d skippedPolicy=%d failed=%d bytes=%d durationMs=%d",
|
||||
connectionId,
|
||||
result.indexed, result.skippedDuplicate, result.skippedPolicy,
|
||||
result.failed, result.bytesProcessed, durationMs,
|
||||
extra={
|
||||
"event": "ingestion.connection.bootstrap.done",
|
||||
"part": "gdrive",
|
||||
"connectionId": connectionId,
|
||||
"indexed": result.indexed,
|
||||
"skippedDup": result.skippedDuplicate,
|
||||
"skippedPolicy": result.skippedPolicy,
|
||||
"failed": result.failed,
|
||||
"bytes": result.bytesProcessed,
|
||||
"durationMs": durationMs,
|
||||
},
|
||||
)
|
||||
return {
|
||||
"connectionId": result.connectionId,
|
||||
"indexed": result.indexed,
|
||||
"skippedDuplicate": result.skippedDuplicate,
|
||||
"skippedPolicy": result.skippedPolicy,
|
||||
"failed": result.failed,
|
||||
"bytesProcessed": result.bytesProcessed,
|
||||
"durationMs": durationMs,
|
||||
"errors": result.errors[:20],
|
||||
}
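Because bootstrapGdrive accepts its adapter, connection, knowledgeService and runExtractionFn as keyword overrides, it can be exercised without Google credentials. A test-style sketch under that assumption, requiring only that the repository's modules are importable; the fake objects implement just the attributes the code above actually reads and are illustrations, not repository fixtures:

import asyncio
from types import SimpleNamespace

from modules.serviceCenter.services.serviceKnowledge.subConnectorSyncGdrive import (
    bootstrapGdrive,
    GdriveBootstrapLimits,
)

class FakeDriveAdapter:
    async def browse(self, path):
        # One text file directly under the virtual root.
        if path == "/":
            return [SimpleNamespace(
                path="/notes.txt", name="notes.txt", isFolder=False, size=11,
                mimeType="text/plain",
                metadata={"id": "file-1", "modifiedTime": "2025-01-01T00:00:00Z"},
            )]
        return []

    async def download(self, path):
        return b"hello drive"

class FakeKnowledgeService:
    async def requestIngestion(self, job):
        return SimpleNamespace(status="indexed")

def fakeExtraction(data, name, mime, options):
    # Shaped like the extraction pipeline output: one non-empty text part.
    return SimpleNamespace(parts=[SimpleNamespace(
        id="p1", data=data.decode("utf-8"), typeGroup="text", label="file", metadata={},
    )])

async def main() -> None:
    summary = await bootstrapGdrive(
        "conn-123",  # placeholder connection id
        adapter=FakeDriveAdapter(),
        connection=SimpleNamespace(mandateId="m-1", userId="u-1"),
        knowledgeService=FakeKnowledgeService(),
        limits=GdriveBootstrapLimits(maxItems=10, maxAgeDays=None),
        runExtractionFn=fakeExtraction,
    )
    print(summary["indexed"])  # expected: 1

asyncio.run(main())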
@@ -0,0 +1,578 @@
|
|||
# Copyright (c) 2025 Patrick Motsch
|
||||
# All rights reserved.
|
||||
"""Gmail bootstrap for the unified knowledge ingestion lane.
|
||||
|
||||
Mirrors the Outlook pilot (see subConnectorSyncOutlook.py) but talks to Google
|
||||
Mail's REST API. Messages become `sourceKind="gmail_message"` virtual documents
|
||||
with header / snippet / cleaned body content-objects; attachments are optional
|
||||
child jobs with `sourceKind="gmail_attachment"`.
|
||||
|
||||
Idempotency: Gmail's stable `historyId` (or `internalDate` as fallback) is
|
||||
passed as `contentVersion`, so rerunning the bootstrap yields
|
||||
`ingestion.skipped.duplicate` for unchanged messages.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
import base64
|
||||
import hashlib
|
||||
import logging
|
||||
import time
|
||||
from dataclasses import dataclass, field
|
||||
from datetime import datetime, timedelta, timezone
|
||||
from typing import Any, Callable, Dict, List, Optional
|
||||
|
||||
from modules.serviceCenter.services.serviceKnowledge.subTextClean import cleanEmailBody
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
MAX_MESSAGES_DEFAULT = 500
|
||||
MAX_BODY_CHARS_DEFAULT = 8000
|
||||
MAX_ATTACHMENT_BYTES_DEFAULT = 10 * 1024 * 1024
|
||||
DEFAULT_LABELS = ("INBOX", "SENT")
|
||||
|
||||
|
||||
@dataclass
|
||||
class GmailBootstrapLimits:
|
||||
maxMessages: int = MAX_MESSAGES_DEFAULT
|
||||
labels: tuple = DEFAULT_LABELS
|
||||
maxBodyChars: int = MAX_BODY_CHARS_DEFAULT
|
||||
includeAttachments: bool = False
|
||||
maxAttachmentBytes: int = MAX_ATTACHMENT_BYTES_DEFAULT
|
||||
# Only fetch messages newer than N days. None disables filter.
|
||||
maxAgeDays: Optional[int] = 90
|
||||
|
||||
|
||||
@dataclass
|
||||
class GmailBootstrapResult:
|
||||
connectionId: str
|
||||
indexed: int = 0
|
||||
skippedDuplicate: int = 0
|
||||
skippedPolicy: int = 0
|
||||
failed: int = 0
|
||||
attachmentsIndexed: int = 0
|
||||
errors: List[str] = field(default_factory=list)
|
||||
|
||||
|
||||
def _syntheticMessageId(connectionId: str, messageId: str) -> str:
|
||||
token = hashlib.sha256(f"{connectionId}:{messageId}".encode("utf-8")).hexdigest()[:16]
|
||||
return f"gm:{connectionId[:8]}:{token}"
|
||||
|
||||
|
||||
def _syntheticAttachmentId(connectionId: str, messageId: str, attachmentId: str) -> str:
|
||||
token = hashlib.sha256(
|
||||
f"{connectionId}:{messageId}:{attachmentId}".encode("utf-8")
|
||||
).hexdigest()[:16]
|
||||
return f"ga:{connectionId[:8]}:{token}"
|
||||
|
||||
|
||||
def _decodeBase64Url(data: str) -> bytes:
|
||||
if not data:
|
||||
return b""
|
||||
# Gmail uses URL-safe base64 without padding.
|
||||
padding = 4 - (len(data) % 4)
|
||||
if padding != 4:
|
||||
data = data + ("=" * padding)
|
||||
try:
|
||||
return base64.urlsafe_b64decode(data)
|
||||
except Exception:
|
||||
return b""
|
||||
|
||||
|
||||
def _walkPayloadForBody(payload: Dict[str, Any]) -> Dict[str, str]:
|
||||
"""Return {"text": ..., "html": ...} by walking MIME parts.
|
||||
|
||||
Gmail `payload` is a tree of parts. We prefer `text/plain` for the cleaned
|
||||
body, but capture `text/html` as a fallback so `cleanEmailBody` can strip
|
||||
markup if plain is missing.
|
||||
"""
|
||||
found: Dict[str, str] = {"text": "", "html": ""}
|
||||
|
||||
def _walk(part: Dict[str, Any]) -> None:
|
||||
mime = (part.get("mimeType") or "").lower()
|
||||
body = part.get("body") or {}
|
||||
raw = body.get("data") or ""
|
||||
if raw and mime.startswith("text/"):
|
||||
decoded = _decodeBase64Url(raw).decode("utf-8", errors="replace")
|
||||
key = "text" if mime == "text/plain" else ("html" if mime == "text/html" else "")
|
||||
if key and not found[key]:
|
||||
found[key] = decoded
|
||||
for sub in part.get("parts") or []:
|
||||
_walk(sub)
|
||||
|
||||
_walk(payload or {})
|
||||
return found
|
||||
|
||||
|
||||
def _headerMap(payload: Dict[str, Any]) -> Dict[str, str]:
|
||||
return {
|
||||
(h.get("name") or "").lower(): (h.get("value") or "")
|
||||
for h in (payload.get("headers") or [])
|
||||
}
|
||||
|
||||
|
||||
def _buildContentObjects(message: Dict[str, Any], maxBodyChars: int) -> List[Dict[str, Any]]:
|
||||
payload = message.get("payload") or {}
|
||||
headers = _headerMap(payload)
|
||||
subject = headers.get("subject") or "(no subject)"
|
||||
fromAddr = headers.get("from") or ""
|
||||
toAddr = headers.get("to") or ""
|
||||
ccAddr = headers.get("cc") or ""
|
||||
date = headers.get("date") or ""
|
||||
snippet = message.get("snippet") or ""
|
||||
|
||||
bodies = _walkPayloadForBody(payload)
|
||||
rawBody = bodies["text"] or bodies["html"]
|
||||
cleanedBody = cleanEmailBody(rawBody, maxChars=maxBodyChars) if rawBody else ""
|
||||
|
||||
parts: List[Dict[str, Any]] = []
|
||||
header = (
|
||||
f"Subject: {subject}\n"
|
||||
f"From: {fromAddr}\n"
|
||||
f"To: {toAddr}\n"
|
||||
+ (f"Cc: {ccAddr}\n" if ccAddr else "")
|
||||
+ f"Date: {date}"
|
||||
)
|
||||
parts.append({
|
||||
"contentObjectId": "header",
|
||||
"contentType": "text",
|
||||
"data": header,
|
||||
"contextRef": {"part": "header"},
|
||||
})
|
||||
if snippet:
|
||||
parts.append({
|
||||
"contentObjectId": "snippet",
|
||||
"contentType": "text",
|
||||
"data": snippet,
|
||||
"contextRef": {"part": "snippet"},
|
||||
})
|
||||
if cleanedBody:
|
||||
parts.append({
|
||||
"contentObjectId": "body",
|
||||
"contentType": "text",
|
||||
"data": cleanedBody,
|
||||
"contextRef": {"part": "body"},
|
||||
})
|
||||
return parts
|
||||
|
||||
|
||||
async def bootstrapGmail(
|
||||
connectionId: str,
|
||||
*,
|
||||
progressCb: Optional[Callable[[int, Optional[str]], None]] = None,
|
||||
adapter: Any = None,
|
||||
connection: Any = None,
|
||||
knowledgeService: Any = None,
|
||||
limits: Optional[GmailBootstrapLimits] = None,
|
||||
googleGetFn: Optional[Callable[..., Any]] = None,
|
||||
) -> Dict[str, Any]:
|
||||
"""Enumerate Gmail labels (INBOX + SENT default) and ingest messages."""
|
||||
limits = limits or GmailBootstrapLimits()
|
||||
startMs = time.time()
|
||||
result = GmailBootstrapResult(connectionId=connectionId)
|
||||
|
||||
logger.info(
|
||||
"ingestion.connection.bootstrap.started part=gmail connectionId=%s",
|
||||
connectionId,
|
||||
extra={
|
||||
"event": "ingestion.connection.bootstrap.started",
|
||||
"part": "gmail",
|
||||
"connectionId": connectionId,
|
||||
},
|
||||
)
|
||||
|
||||
if adapter is None or knowledgeService is None or connection is None:
|
||||
adapter, connection, knowledgeService = await _resolveDependencies(connectionId)
|
||||
|
||||
if googleGetFn is None:
|
||||
from modules.connectors.providerGoogle.connectorGoogle import _googleGet as _defaultGet
|
||||
|
||||
token = getattr(adapter, "_token", "")
|
||||
|
||||
async def googleGetFn(url: str) -> Dict[str, Any]: # type: ignore[no-redef]
|
||||
return await _defaultGet(token, url)
|
||||
|
||||
mandateId = str(getattr(connection, "mandateId", "") or "") if connection is not None else ""
|
||||
userId = str(getattr(connection, "userId", "") or "") if connection is not None else ""
|
||||
|
||||
for labelId in limits.labels:
|
||||
if result.indexed + result.skippedDuplicate >= limits.maxMessages:
|
||||
break
|
||||
try:
|
||||
await _ingestLabel(
|
||||
googleGetFn=googleGetFn,
|
||||
knowledgeService=knowledgeService,
|
||||
connectionId=connectionId,
|
||||
mandateId=mandateId,
|
||||
userId=userId,
|
||||
labelId=labelId,
|
||||
limits=limits,
|
||||
result=result,
|
||||
progressCb=progressCb,
|
||||
)
|
||||
except Exception as exc:
|
||||
logger.error("gmail ingestion label %s failed: %s", labelId, exc, exc_info=True)
|
||||
result.errors.append(f"label({labelId}): {exc}")
|
||||
|
||||
return _finalizeResult(connectionId, result, startMs)
|
||||
|
||||
|
||||
async def _resolveDependencies(connectionId: str):
|
||||
from modules.interfaces.interfaceDbApp import getRootInterface
|
||||
from modules.auth import TokenManager
|
||||
from modules.connectors.providerGoogle.connectorGoogle import GoogleConnector
|
||||
from modules.serviceCenter import getService
|
||||
from modules.serviceCenter.context import ServiceCenterContext
|
||||
from modules.security.rootAccess import getRootUser
|
||||
|
||||
rootInterface = getRootInterface()
|
||||
connection = rootInterface.getUserConnectionById(connectionId)
|
||||
if connection is None:
|
||||
raise ValueError(f"UserConnection not found: {connectionId}")
|
||||
|
||||
token = TokenManager().getFreshToken(connectionId)
|
||||
if not token or not token.tokenAccess:
|
||||
raise ValueError(f"No valid token for connection {connectionId}")
|
||||
|
||||
provider = GoogleConnector(connection, token.tokenAccess)
|
||||
adapter = provider.getServiceAdapter("gmail")
|
||||
|
||||
rootUser = getRootUser()
|
||||
ctx = ServiceCenterContext(
|
||||
user=rootUser,
|
||||
mandate_id=str(getattr(connection, "mandateId", "") or ""),
|
||||
)
|
||||
knowledgeService = getService("knowledge", ctx)
|
||||
return adapter, connection, knowledgeService
|
||||
|
||||
|
||||
async def _ingestLabel(
|
||||
*,
|
||||
googleGetFn,
|
||||
knowledgeService,
|
||||
connectionId: str,
|
||||
mandateId: str,
|
||||
userId: str,
|
||||
labelId: str,
|
||||
limits: GmailBootstrapLimits,
|
||||
result: GmailBootstrapResult,
|
||||
progressCb: Optional[Callable[[int, Optional[str]], None]],
|
||||
) -> None:
|
||||
remaining = limits.maxMessages - (result.indexed + result.skippedDuplicate)
|
||||
if remaining <= 0:
|
||||
return
|
||||
|
||||
pageSize = min(100, remaining)
|
||||
query = ""
|
||||
if limits.maxAgeDays:
|
||||
cutoff = datetime.now(timezone.utc) - timedelta(days=limits.maxAgeDays)
|
||||
# Gmail uses YYYY/MM/DD.
|
||||
query = f"after:{cutoff.strftime('%Y/%m/%d')}"
|
||||
|
||||
baseUrl = (
|
||||
"https://gmail.googleapis.com/gmail/v1/users/me/messages"
|
||||
f"?labelIds={labelId}&maxResults={pageSize}"
|
||||
)
|
||||
if query:
|
||||
baseUrl = f"{baseUrl}&q={query}"
|
||||
|
||||
nextPageToken: Optional[str] = None
|
||||
while (result.indexed + result.skippedDuplicate) < limits.maxMessages:
|
||||
url = baseUrl if not nextPageToken else f"{baseUrl}&pageToken={nextPageToken}"
|
||||
page = await googleGetFn(url)
|
||||
if not isinstance(page, dict) or "error" in page:
|
||||
err = (page or {}).get("error") if isinstance(page, dict) else "unknown"
|
||||
logger.warning("gmail list page error for label %s: %s", labelId, err)
|
||||
result.errors.append(f"list({labelId}): {err}")
|
||||
return
|
||||
|
||||
messageStubs = page.get("messages") or []
|
||||
for stub in messageStubs:
|
||||
if result.indexed + result.skippedDuplicate >= limits.maxMessages:
|
||||
break
|
||||
msgId = stub.get("id")
|
||||
if not msgId:
|
||||
continue
|
||||
detailUrl = (
|
||||
f"https://gmail.googleapis.com/gmail/v1/users/me/messages/{msgId}?format=full"
|
||||
)
|
||||
detail = await googleGetFn(detailUrl)
|
||||
if not isinstance(detail, dict) or "error" in detail:
|
||||
result.failed += 1
|
||||
continue
|
||||
await _ingestMessage(
|
||||
googleGetFn=googleGetFn,
|
||||
knowledgeService=knowledgeService,
|
||||
connectionId=connectionId,
|
||||
mandateId=mandateId,
|
||||
userId=userId,
|
||||
labelId=labelId,
|
||||
message=detail,
|
||||
limits=limits,
|
||||
result=result,
|
||||
progressCb=progressCb,
|
||||
)
|
||||
|
||||
nextPageToken = page.get("nextPageToken")
|
||||
if not nextPageToken:
|
||||
break
|
||||
|
||||
|
||||
async def _ingestMessage(
|
||||
*,
|
||||
googleGetFn,
|
||||
knowledgeService,
|
||||
connectionId: str,
|
||||
mandateId: str,
|
||||
userId: str,
|
||||
labelId: str,
|
||||
message: Dict[str, Any],
|
||||
limits: GmailBootstrapLimits,
|
||||
result: GmailBootstrapResult,
|
||||
progressCb: Optional[Callable[[int, Optional[str]], None]],
|
||||
) -> None:
|
||||
from modules.serviceCenter.services.serviceKnowledge.mainServiceKnowledge import IngestionJob
|
||||
|
||||
messageId = message.get("id")
|
||||
if not messageId:
|
||||
result.skippedPolicy += 1
|
||||
return
|
||||
revision = message.get("historyId") or message.get("internalDate")
|
||||
headers = _headerMap(message.get("payload") or {})
|
||||
subject = headers.get("subject") or "(no subject)"
|
||||
syntheticId = _syntheticMessageId(connectionId, messageId)
|
||||
fileName = f"{subject[:80].strip()}.eml" if subject else f"{messageId}.eml"
|
||||
|
||||
contentObjects = _buildContentObjects(message, limits.maxBodyChars)
|
||||
try:
|
||||
handle = await knowledgeService.requestIngestion(
|
||||
IngestionJob(
|
||||
sourceKind="gmail_message",
|
||||
sourceId=syntheticId,
|
||||
fileName=fileName,
|
||||
mimeType="message/rfc822",
|
||||
userId=userId,
|
||||
mandateId=mandateId,
|
||||
contentObjects=contentObjects,
|
||||
contentVersion=str(revision) if revision else None,
|
||||
provenance={
|
||||
"connectionId": connectionId,
|
||||
"authority": "google",
|
||||
"service": "gmail",
|
||||
"externalItemId": messageId,
|
||||
"label": labelId,
|
||||
"threadId": message.get("threadId"),
|
||||
"tier": "body",
|
||||
},
|
||||
)
|
||||
)
|
||||
except Exception as exc:
|
||||
logger.error("gmail ingestion %s failed: %s", messageId, exc, exc_info=True)
|
||||
result.failed += 1
|
||||
result.errors.append(f"ingest({messageId}): {exc}")
|
||||
return
|
||||
|
||||
if handle.status == "duplicate":
|
||||
result.skippedDuplicate += 1
|
||||
elif handle.status == "indexed":
|
||||
result.indexed += 1
|
||||
else:
|
||||
result.failed += 1
|
||||
|
||||
if limits.includeAttachments:
|
||||
try:
|
||||
await _ingestAttachments(
|
||||
googleGetFn=googleGetFn,
|
||||
knowledgeService=knowledgeService,
|
||||
connectionId=connectionId,
|
||||
mandateId=mandateId,
|
||||
userId=userId,
|
||||
message=message,
|
||||
parentSyntheticId=syntheticId,
|
||||
limits=limits,
|
||||
result=result,
|
||||
)
|
||||
except Exception as exc:
|
||||
logger.warning("gmail attachments %s failed: %s", messageId, exc)
|
||||
result.errors.append(f"attachments({messageId}): {exc}")
|
||||
|
||||
if progressCb is not None and (result.indexed + result.skippedDuplicate) % 50 == 0:
|
||||
processed = result.indexed + result.skippedDuplicate
|
||||
try:
|
||||
progressCb(
|
||||
min(90, 10 + int(80 * processed / max(1, limits.maxMessages))),
|
||||
f"gmail processed={processed}",
|
||||
)
|
||||
except Exception:
|
||||
pass
|
||||
logger.info(
|
||||
"ingestion.connection.bootstrap.progress part=gmail processed=%d skippedDup=%d failed=%d",
|
||||
processed, result.skippedDuplicate, result.failed,
|
||||
extra={
|
||||
"event": "ingestion.connection.bootstrap.progress",
|
||||
"part": "gmail",
|
||||
"connectionId": connectionId,
|
||||
"processed": processed,
|
||||
"skippedDup": result.skippedDuplicate,
|
||||
"failed": result.failed,
|
||||
},
|
||||
)
|
||||
|
||||
await asyncio.sleep(0)
|
||||
|
||||
|
||||
async def _ingestAttachments(
|
||||
*,
|
||||
googleGetFn,
|
||||
knowledgeService,
|
||||
connectionId: str,
|
||||
mandateId: str,
|
||||
userId: str,
|
||||
message: Dict[str, Any],
|
||||
parentSyntheticId: str,
|
||||
limits: GmailBootstrapLimits,
|
||||
result: GmailBootstrapResult,
|
||||
) -> None:
|
||||
"""Child ingestion jobs for file attachments. Skips inline images (cid: refs)."""
|
||||
from modules.serviceCenter.services.serviceKnowledge.mainServiceKnowledge import IngestionJob
|
||||
from modules.datamodels.datamodelExtraction import ExtractionOptions
|
||||
from modules.serviceCenter.services.serviceExtraction.subPipeline import runExtraction
|
||||
from modules.serviceCenter.services.serviceExtraction.subRegistry import (
|
||||
ExtractorRegistry, ChunkerRegistry,
|
||||
)
|
||||
|
||||
messageId = message.get("id") or ""
|
||||
|
||||
def _collectAttachmentStubs(part: Dict[str, Any], acc: List[Dict[str, Any]]) -> None:
|
||||
filename = part.get("filename") or ""
|
||||
body = part.get("body") or {}
|
||||
attId = body.get("attachmentId")
|
||||
if filename and attId:
|
||||
acc.append({
|
||||
"filename": filename,
|
||||
"mimeType": part.get("mimeType") or "application/octet-stream",
|
||||
"attachmentId": attId,
|
||||
"size": int(body.get("size") or 0),
|
||||
})
|
||||
for sub in part.get("parts") or []:
|
||||
_collectAttachmentStubs(sub, acc)
|
||||
|
||||
stubs: List[Dict[str, Any]] = []
|
||||
_collectAttachmentStubs(message.get("payload") or {}, stubs)
|
||||
if not stubs:
|
||||
return
|
||||
|
||||
extractorRegistry = ExtractorRegistry()
|
||||
chunkerRegistry = ChunkerRegistry()
|
||||
|
||||
for stub in stubs:
|
||||
if stub["size"] and stub["size"] > limits.maxAttachmentBytes:
|
||||
result.skippedPolicy += 1
|
||||
continue
|
||||
attUrl = (
|
||||
f"https://gmail.googleapis.com/gmail/v1/users/me/messages/{messageId}"
|
||||
f"/attachments/{stub['attachmentId']}"
|
||||
)
|
||||
detail = await googleGetFn(attUrl)
|
||||
if not isinstance(detail, dict) or "error" in detail:
|
||||
result.failed += 1
|
||||
continue
|
||||
rawBytes = _decodeBase64Url(detail.get("data") or "")
|
||||
if not rawBytes:
|
||||
continue
|
||||
fileName = stub["filename"]
|
||||
mimeType = stub["mimeType"]
|
||||
syntheticId = _syntheticAttachmentId(connectionId, messageId, stub["attachmentId"])
|
||||
|
||||
try:
|
||||
extracted = runExtraction(
|
||||
extractorRegistry, chunkerRegistry,
|
||||
rawBytes, fileName, mimeType,
|
||||
ExtractionOptions(mergeStrategy=None),
|
||||
)
|
||||
except Exception as exc:
|
||||
logger.warning("gmail attachment extract %s failed: %s", stub["attachmentId"], exc)
|
||||
result.failed += 1
|
||||
continue
|
||||
|
||||
contentObjects: List[Dict[str, Any]] = []
|
||||
for part in getattr(extracted, "parts", None) or []:
|
||||
data = getattr(part, "data", None) or ""
|
||||
if not data or not str(data).strip():
|
||||
continue
|
||||
typeGroup = getattr(part, "typeGroup", "text") or "text"
|
||||
contentType = "text"
|
||||
if typeGroup == "image":
|
||||
contentType = "image"
|
||||
elif typeGroup in ("binary", "container"):
|
||||
contentType = "other"
|
||||
contentObjects.append({
|
||||
"contentObjectId": getattr(part, "id", ""),
|
||||
"contentType": contentType,
|
||||
"data": data,
|
||||
"contextRef": {
|
||||
"containerPath": fileName,
|
||||
"location": getattr(part, "label", None) or "attachment",
|
||||
**(getattr(part, "metadata", None) or {}),
|
||||
},
|
||||
})
|
||||
if not contentObjects:
|
||||
result.skippedPolicy += 1
|
||||
continue
|
||||
|
||||
try:
|
||||
await knowledgeService.requestIngestion(
|
||||
IngestionJob(
|
||||
sourceKind="gmail_attachment",
|
||||
sourceId=syntheticId,
|
||||
fileName=fileName,
|
||||
mimeType=mimeType,
|
||||
userId=userId,
|
||||
mandateId=mandateId,
|
||||
contentObjects=contentObjects,
|
||||
provenance={
|
||||
"connectionId": connectionId,
|
||||
"authority": "google",
|
||||
"service": "gmail",
|
||||
"parentId": parentSyntheticId,
|
||||
"externalItemId": stub["attachmentId"],
|
||||
"parentMessageId": messageId,
|
||||
},
|
||||
)
|
||||
)
|
||||
result.attachmentsIndexed += 1
|
||||
except Exception as exc:
|
||||
logger.warning("gmail attachment ingest %s failed: %s", stub["attachmentId"], exc)
|
||||
result.failed += 1
|
||||
|
||||
|
||||
def _finalizeResult(connectionId: str, result: GmailBootstrapResult, startMs: float) -> Dict[str, Any]:
    durationMs = int((time.time() - startMs) * 1000)
    logger.info(
        "ingestion.connection.bootstrap.done part=gmail connectionId=%s indexed=%d skippedDup=%d skippedPolicy=%d attachments=%d failed=%d durationMs=%d",
        connectionId,
        result.indexed, result.skippedDuplicate, result.skippedPolicy,
        result.attachmentsIndexed, result.failed, durationMs,
        extra={
            "event": "ingestion.connection.bootstrap.done",
            "part": "gmail",
            "connectionId": connectionId,
            "indexed": result.indexed,
            "skippedDup": result.skippedDuplicate,
            "skippedPolicy": result.skippedPolicy,
            "attachmentsIndexed": result.attachmentsIndexed,
            "failed": result.failed,
            "durationMs": durationMs,
        },
    )
    return {
        "connectionId": result.connectionId,
        "indexed": result.indexed,
        "skippedDuplicate": result.skippedDuplicate,
        "skippedPolicy": result.skippedPolicy,
        "attachmentsIndexed": result.attachmentsIndexed,
        "failed": result.failed,
        "durationMs": durationMs,
        "errors": result.errors[:20],
    }
|
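The attachment loop above decodes Gmail's base64url payloads via `_decodeBase64Url`, which is defined earlier in the module and not shown in this hunk. A minimal sketch of such a helper, assuming Gmail's unpadded URL-safe encoding (names here are illustrative, not the module's actual code):

```python
import base64
from typing import Optional


def decodeBase64UrlSketch(data: str) -> Optional[bytes]:
    """Sketch only: decode URL-safe base64 that may omit '=' padding; None on failure."""
    if not data:
        return None
    try:
        padded = data + "=" * (-len(data) % 4)  # restore stripped padding
        return base64.urlsafe_b64decode(padded)
    except (ValueError, TypeError):
        return None
```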
||||
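`_syntheticAttachmentId` above, and the `_syntheticMessageId` / `_syntheticFileId` / `_syntheticTaskId` helpers the tests below import, are likewise defined in their own modules rather than in this hunk. Assuming the goal is a stable, connection-scoped sourceId so a second bootstrap run maps onto the same ingestion job, one plausible shape is a hash over the connection id plus the provider's identifiers:

```python
import hashlib


def syntheticIdSketch(*parts: str) -> str:
    """Sketch only: derive a stable id from connection + provider identifiers."""
    digest = hashlib.sha256(":".join(parts).encode("utf-8")).hexdigest()
    return digest[:32]

# e.g. syntheticIdSketch("c1", "msg-123", "att-456") always yields the same value,
# which is what makes duplicate detection across runs possible.
```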
203 tests/unit/services/test_bootstrap_clickup.py Normal file
|
|
@ -0,0 +1,203 @@
|
|||
#!/usr/bin/env python3
|
||||
# Copyright (c) 2025 Patrick Motsch
|
||||
# All rights reserved.
|
||||
"""Bootstrap ClickUp tests with a fake service + knowledge service.
|
||||
|
||||
Verifies:
|
||||
- Teams → spaces → lists (folderless + folder-based) → tasks traversal.
|
||||
- Each task produces a `requestIngestion` call with `sourceKind="clickup_task"`
|
||||
and header + description content-objects.
|
||||
- `date_updated` is forwarded as contentVersion → idempotency.
|
||||
- Recency filter drops tasks older than `maxAgeDays`.
|
||||
- maxWorkspaces / maxListsPerWorkspace / maxTasks caps are respected.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import os
|
||||
import sys
|
||||
import time
|
||||
from types import SimpleNamespace
|
||||
|
||||
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "../../.."))
|
||||
|
||||
from modules.serviceCenter.services.serviceKnowledge.subConnectorSyncClickup import (
|
||||
bootstrapClickup,
|
||||
ClickupBootstrapLimits,
|
||||
_syntheticTaskId,
|
||||
)
|
||||
|
||||
|
||||
def _nowMs(offsetDays: int = 0) -> str:
|
||||
return str(int((time.time() + offsetDays * 86400) * 1000))
|
||||
|
||||
|
||||
class _FakeClickupService:
|
||||
"""Records API calls; serves a canned 1-team / 1-space / 1-list / 2-task layout."""
|
||||
|
||||
def __init__(self, taskCount=2, oldTask=False):
|
||||
self._taskCount = taskCount
|
||||
self._oldTask = oldTask # when True, the second task is 400 days old
|
||||
self.calls = []
|
||||
|
||||
async def getAuthorizedTeams(self):
|
||||
self.calls.append(("getAuthorizedTeams",))
|
||||
return {"teams": [{"id": "team-1", "name": "Acme"}]}
|
||||
|
||||
async def getSpaces(self, team_id: str):
|
||||
self.calls.append(("getSpaces", team_id))
|
||||
return {"spaces": [{"id": "space-1", "name": "Engineering"}]}
|
||||
|
||||
async def getFolderlessLists(self, space_id: str):
|
||||
self.calls.append(("getFolderlessLists", space_id))
|
||||
return {"lists": [{"id": "list-1", "name": "Sprint 1"}]}
|
||||
|
||||
async def getFolders(self, space_id: str):
|
||||
self.calls.append(("getFolders", space_id))
|
||||
return {"folders": [{"id": "folder-1", "name": "Subproject"}]}
|
||||
|
||||
async def getListsInFolder(self, folder_id: str):
|
||||
self.calls.append(("getListsInFolder", folder_id))
|
||||
return {"lists": [{"id": "list-2", "name": "Sub-tasks"}]}
|
||||
|
||||
async def getTasksInList(self, list_id: str, *, page=0, include_closed=False, subtasks=True):
|
||||
self.calls.append(("getTasksInList", list_id, page, include_closed))
|
||||
if page > 0:
|
||||
return {"tasks": []}
|
||||
tasks = []
|
||||
for i in range(self._taskCount):
|
||||
tid = f"{list_id}-task-{i}"
|
||||
offsetDays = -400 if (self._oldTask and i == 1) else 0
|
||||
tasks.append({
|
||||
"id": tid,
|
||||
"name": f"Task {i} of {list_id}",
|
||||
"description": f"Plain description for task {i}",
|
||||
"text_content": f"Rich content for task {i}",
|
||||
"status": {"status": "open" if i == 0 else "closed"},
|
||||
"assignees": [{"username": "alice"}],
|
||||
"tags": [{"name": "urgent"}],
|
||||
"date_updated": _nowMs(offsetDays),
|
||||
"date_created": _nowMs(-1),
|
||||
"url": f"https://app.clickup.com/t/{tid}",
|
||||
})
|
||||
return {"tasks": tasks}
|
||||
|
||||
|
||||
class _FakeKnowledgeService:
|
||||
def __init__(self, duplicateIds=None):
|
||||
self.calls = []
|
||||
self._duplicates = duplicateIds or set()
|
||||
|
||||
async def requestIngestion(self, job):
|
||||
self.calls.append(job)
|
||||
status = "duplicate" if job.sourceId in self._duplicates else "indexed"
|
||||
return SimpleNamespace(
|
||||
jobId=job.sourceId, status=status, contentHash="h",
|
||||
fileId=job.sourceId, index=None, error=None,
|
||||
)
|
||||
|
||||
|
||||
def _adapter(svc):
|
||||
return SimpleNamespace(_svc=svc)
|
||||
|
||||
|
||||
def test_bootstrap_walks_team_space_lists_and_tasks():
|
||||
svc = _FakeClickupService(taskCount=2)
|
||||
knowledge = _FakeKnowledgeService()
|
||||
connection = SimpleNamespace(mandateId="m1", userId="u1")
|
||||
|
||||
async def _run():
|
||||
return await bootstrapClickup(
|
||||
connectionId="c1",
|
||||
adapter=_adapter(svc),
|
||||
connection=connection,
|
||||
knowledgeService=knowledge,
|
||||
limits=ClickupBootstrapLimits(maxAgeDays=None),
|
||||
)
|
||||
|
||||
result = asyncio.run(_run())
|
||||
# 2 lists (folderless list-1 + folder's list-2) × 2 tasks each = 4 tasks
|
||||
assert result["indexed"] == 4
|
||||
assert result["workspaces"] == 1
|
||||
assert result["lists"] == 2
|
||||
sourceIds = {c.sourceId for c in knowledge.calls}
|
||||
assert len(sourceIds) == 4
|
||||
for job in knowledge.calls:
|
||||
assert job.sourceKind == "clickup_task"
|
||||
assert job.mimeType == "application/vnd.clickup.task+json"
|
||||
assert job.mandateId == "m1"
|
||||
assert job.provenance["connectionId"] == "c1"
|
||||
assert job.provenance["authority"] == "clickup"
|
||||
assert job.provenance["teamId"] == "team-1"
|
||||
assert job.contentVersion # numeric millisecond string
|
||||
# At least the header content-object is present.
|
||||
ids = [co["contentObjectId"] for co in job.contentObjects]
|
||||
assert "header" in ids
|
||||
|
||||
|
||||
def test_bootstrap_reports_duplicates_on_second_run():
|
||||
svc = _FakeClickupService(taskCount=1)
|
||||
duplicates = {
|
||||
_syntheticTaskId("c1", "list-1-task-0"),
|
||||
_syntheticTaskId("c1", "list-2-task-0"),
|
||||
}
|
||||
knowledge = _FakeKnowledgeService(duplicateIds=duplicates)
|
||||
connection = SimpleNamespace(mandateId="m1", userId="u1")
|
||||
|
||||
async def _run():
|
||||
return await bootstrapClickup(
|
||||
connectionId="c1",
|
||||
adapter=_adapter(svc),
|
||||
connection=connection,
|
||||
knowledgeService=knowledge,
|
||||
limits=ClickupBootstrapLimits(maxAgeDays=None),
|
||||
)
|
||||
|
||||
result = asyncio.run(_run())
|
||||
assert result["indexed"] == 0
|
||||
assert result["skippedDuplicate"] == 2
|
||||
|
||||
|
||||
def test_bootstrap_skips_tasks_older_than_maxAgeDays():
|
||||
svc = _FakeClickupService(taskCount=2, oldTask=True)
|
||||
knowledge = _FakeKnowledgeService()
|
||||
connection = SimpleNamespace(mandateId="m1", userId="u1")
|
||||
|
||||
async def _run():
|
||||
return await bootstrapClickup(
|
||||
connectionId="c1",
|
||||
adapter=_adapter(svc),
|
||||
connection=connection,
|
||||
knowledgeService=knowledge,
|
||||
limits=ClickupBootstrapLimits(maxAgeDays=180),
|
||||
)
|
||||
|
||||
result = asyncio.run(_run())
|
||||
# 2 lists × (1 recent + 1 skipped old) = 2 indexed + 2 skippedPolicy
|
||||
assert result["indexed"] == 2
|
||||
assert result["skippedPolicy"] == 2
|
||||
|
||||
|
||||
def test_bootstrap_maxTasks_caps_ingestion():
|
||||
svc = _FakeClickupService(taskCount=2)
|
||||
knowledge = _FakeKnowledgeService()
|
||||
connection = SimpleNamespace(mandateId="m1", userId="u1")
|
||||
|
||||
async def _run():
|
||||
return await bootstrapClickup(
|
||||
connectionId="c1",
|
||||
adapter=_adapter(svc),
|
||||
connection=connection,
|
||||
knowledgeService=knowledge,
|
||||
limits=ClickupBootstrapLimits(maxAgeDays=None, maxTasks=3),
|
||||
)
|
||||
|
||||
result = asyncio.run(_run())
|
||||
assert result["indexed"] == 3
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
test_bootstrap_walks_team_space_lists_and_tasks()
|
||||
test_bootstrap_reports_duplicates_on_second_run()
|
||||
test_bootstrap_skips_tasks_older_than_maxAgeDays()
|
||||
test_bootstrap_maxTasks_caps_ingestion()
|
||||
print("OK — bootstrapClickup tests passed")
|
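ClickUp reports `date_updated` as an epoch-milliseconds string (see `_nowMs` above), so the `maxAgeDays` filter these tests exercise reduces to a millisecond cutoff comparison. A minimal sketch under that assumption (illustrative names, not the connector's actual internals):

```python
import time
from typing import Optional


def isTooOld(dateUpdatedMs: str, maxAgeDays: Optional[int]) -> bool:
    """Sketch only: True when date_updated (epoch ms, string) is older than the window."""
    if maxAgeDays is None:
        return False
    try:
        updatedMs = int(dateUpdatedMs)
    except (TypeError, ValueError):
        return False  # unparsable timestamps are kept rather than silently dropped
    cutoffMs = (time.time() - maxAgeDays * 86400) * 1000
    return updatedMs < cutoffMs
```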
||||
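The tests only assert that a `header` content-object is present on each job. A hedged illustration of how such a header could be assembled from the task fields the fake service returns (name, status, assignees, tags, url; field names as in the fake above, function name hypothetical):

```python
from typing import Any, Dict


def buildTaskHeaderSketch(task: Dict[str, Any]) -> Dict[str, Any]:
    """Sketch only: flatten a ClickUp task's key fields into a 'header' content-object."""
    status = (task.get("status") or {}).get("status", "")
    assignees = ", ".join(a.get("username", "") for a in task.get("assignees") or [])
    tags = ", ".join(t.get("name", "") for t in task.get("tags") or [])
    lines = [
        f"Task: {task.get('name', '')}",
        f"Status: {status}",
        f"Assignees: {assignees}",
        f"Tags: {tags}",
        f"URL: {task.get('url', '')}",
    ]
    return {"contentObjectId": "header", "contentType": "text", "data": "\n".join(lines)}
```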
225 tests/unit/services/test_bootstrap_gdrive.py Normal file
|
|
@ -0,0 +1,225 @@
|
|||
#!/usr/bin/env python3
|
||||
# Copyright (c) 2025 Patrick Motsch
|
||||
# All rights reserved.
|
||||
"""Bootstrap Google Drive tests with a fake adapter + knowledge service.
|
||||
|
||||
Verifies:
|
||||
- Drive walk traverses root → subfolders, respecting `maxDepth`.
|
||||
- Every file triggers `requestIngestion` with `sourceKind="gdrive_item"`.
|
||||
- Duplicate runs (same modifiedTime revision) report `skippedDuplicate`.
|
||||
- Provenance carries `authority="google"` and the Drive file id.
|
||||
- Recency filter skips files older than `maxAgeDays`.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import os
|
||||
import sys
|
||||
from dataclasses import dataclass
|
||||
from datetime import datetime, timedelta, timezone
|
||||
from types import SimpleNamespace
|
||||
from typing import Any, Dict, List, Optional
|
||||
|
||||
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "../../.."))
|
||||
|
||||
from modules.serviceCenter.services.serviceKnowledge.subConnectorSyncGdrive import (
|
||||
bootstrapGdrive,
|
||||
GdriveBootstrapLimits,
|
||||
_syntheticFileId,
|
||||
)
|
||||
|
||||
|
||||
@dataclass
|
||||
class _ExtEntry:
|
||||
name: str
|
||||
path: str
|
||||
isFolder: bool = False
|
||||
size: Optional[int] = None
|
||||
mimeType: Optional[str] = None
|
||||
metadata: Optional[Dict[str, Any]] = None
|
||||
|
||||
|
||||
def _today_iso(offsetDays: int = 0) -> str:
|
||||
return (datetime.now(timezone.utc) + timedelta(days=offsetDays)).strftime("%Y-%m-%dT%H:%M:%SZ")
|
||||
|
||||
|
||||
class _FakeDriveAdapter:
|
||||
"""Minimal DriveAdapter stand-in.
|
||||
|
||||
Layout:
|
||||
"/" (root) → 2 files + 1 folder (sub)
|
||||
"/sub_id" → 1 file
|
||||
"""
|
||||
|
||||
def __init__(self, recent_only: bool = True):
|
||||
self.downloaded: List[str] = []
|
||||
self._recent = _today_iso(0)
|
||||
self._old = _today_iso(-400)
|
||||
self._recent_only = recent_only
|
||||
|
||||
async def browse(self, path: str, filter=None, limit=None):
|
||||
if path in ("/", "", "root"):
|
||||
return [
|
||||
_ExtEntry(
|
||||
name="f1.txt", path="/f1", size=20,
|
||||
mimeType="text/plain",
|
||||
metadata={"id": "f1", "modifiedTime": self._recent},
|
||||
),
|
||||
_ExtEntry(
|
||||
name="f2.txt", path="/f2", size=20,
|
||||
mimeType="text/plain",
|
||||
metadata={"id": "f2", "modifiedTime": self._recent if self._recent_only else self._old},
|
||||
),
|
||||
_ExtEntry(
|
||||
name="Subfolder", path="/sub_id", isFolder=True,
|
||||
mimeType="application/vnd.google-apps.folder",
|
||||
metadata={"id": "sub_id", "modifiedTime": self._recent},
|
||||
),
|
||||
]
|
||||
if path == "/sub_id":
|
||||
return [
|
||||
_ExtEntry(
|
||||
name="f3.txt", path="/f3", size=20,
|
||||
mimeType="text/plain",
|
||||
metadata={"id": "f3", "modifiedTime": self._recent},
|
||||
),
|
||||
]
|
||||
return []
|
||||
|
||||
async def download(self, path: str) -> bytes:
|
||||
self.downloaded.append(path)
|
||||
return path.encode("utf-8")
|
||||
|
||||
|
||||
class _FakeKnowledgeService:
|
||||
def __init__(self, duplicateIds=None):
|
||||
self.calls: List[SimpleNamespace] = []
|
||||
self._duplicateIds = duplicateIds or set()
|
||||
|
||||
async def requestIngestion(self, job):
|
||||
self.calls.append(job)
|
||||
status = "duplicate" if job.sourceId in self._duplicateIds else "indexed"
|
||||
return SimpleNamespace(
|
||||
jobId=f"{job.sourceKind}:{job.sourceId}",
|
||||
status=status, contentHash="h",
|
||||
fileId=job.sourceId, index=None, error=None,
|
||||
)
|
||||
|
||||
|
||||
def _fakeRunExtraction(data, name, mime, options):
|
||||
return SimpleNamespace(
|
||||
parts=[
|
||||
SimpleNamespace(
|
||||
id="p1",
|
||||
data=data.decode("utf-8") if isinstance(data, bytes) else str(data),
|
||||
typeGroup="text",
|
||||
label="page:1",
|
||||
metadata={"pageIndex": 0},
|
||||
)
|
||||
]
|
||||
)
|
||||
|
||||
|
||||
def test_bootstrap_walks_drive_and_subfolders():
|
||||
adapter = _FakeDriveAdapter()
|
||||
knowledge = _FakeKnowledgeService()
|
||||
connection = SimpleNamespace(mandateId="m1", userId="u1")
|
||||
|
||||
async def _run():
|
||||
return await bootstrapGdrive(
|
||||
connectionId="c1",
|
||||
adapter=adapter,
|
||||
connection=connection,
|
||||
knowledgeService=knowledge,
|
||||
runExtractionFn=_fakeRunExtraction,
|
||||
limits=GdriveBootstrapLimits(maxAgeDays=None),
|
||||
)
|
||||
|
||||
result = asyncio.run(_run())
|
||||
assert len(knowledge.calls) == 3
|
||||
sourceIds = {c.sourceId for c in knowledge.calls}
|
||||
assert sourceIds == {
|
||||
_syntheticFileId("c1", "f1"),
|
||||
_syntheticFileId("c1", "f2"),
|
||||
_syntheticFileId("c1", "f3"),
|
||||
}
|
||||
assert result["indexed"] == 3
|
||||
assert result["skippedDuplicate"] == 0
|
||||
assert adapter.downloaded == ["/f1", "/f2", "/f3"]
|
||||
|
||||
|
||||
def test_bootstrap_reports_duplicates_on_second_run():
|
||||
adapter = _FakeDriveAdapter()
|
||||
duplicateIds = {
|
||||
_syntheticFileId("c1", "f1"),
|
||||
_syntheticFileId("c1", "f2"),
|
||||
_syntheticFileId("c1", "f3"),
|
||||
}
|
||||
knowledge = _FakeKnowledgeService(duplicateIds=duplicateIds)
|
||||
connection = SimpleNamespace(mandateId="m1", userId="u1")
|
||||
|
||||
async def _run():
|
||||
return await bootstrapGdrive(
|
||||
connectionId="c1",
|
||||
adapter=adapter,
|
||||
connection=connection,
|
||||
knowledgeService=knowledge,
|
||||
runExtractionFn=_fakeRunExtraction,
|
||||
limits=GdriveBootstrapLimits(maxAgeDays=None),
|
||||
)
|
||||
|
||||
result = asyncio.run(_run())
|
||||
assert result["indexed"] == 0
|
||||
assert result["skippedDuplicate"] == 3
|
||||
|
||||
|
||||
def test_bootstrap_skips_files_older_than_maxAgeDays():
|
||||
adapter = _FakeDriveAdapter(recent_only=False) # f2 is 400 days old
|
||||
knowledge = _FakeKnowledgeService()
|
||||
connection = SimpleNamespace(mandateId="m1", userId="u1")
|
||||
|
||||
async def _run():
|
||||
return await bootstrapGdrive(
|
||||
connectionId="c1",
|
||||
adapter=adapter,
|
||||
connection=connection,
|
||||
knowledgeService=knowledge,
|
||||
runExtractionFn=_fakeRunExtraction,
|
||||
limits=GdriveBootstrapLimits(maxAgeDays=180),
|
||||
)
|
||||
|
||||
result = asyncio.run(_run())
|
||||
assert result["indexed"] == 2 # f1, f3
|
||||
assert result["skippedPolicy"] == 1 # f2 filtered out
|
||||
|
||||
|
||||
def test_bootstrap_passes_connection_provenance():
|
||||
adapter = _FakeDriveAdapter()
|
||||
knowledge = _FakeKnowledgeService()
|
||||
connection = SimpleNamespace(mandateId="m1", userId="u1")
|
||||
|
||||
async def _run():
|
||||
return await bootstrapGdrive(
|
||||
connectionId="c1",
|
||||
adapter=adapter,
|
||||
connection=connection,
|
||||
knowledgeService=knowledge,
|
||||
runExtractionFn=_fakeRunExtraction,
|
||||
limits=GdriveBootstrapLimits(maxAgeDays=None),
|
||||
)
|
||||
|
||||
asyncio.run(_run())
|
||||
for job in knowledge.calls:
|
||||
assert job.sourceKind == "gdrive_item"
|
||||
assert job.mandateId == "m1"
|
||||
assert job.provenance["connectionId"] == "c1"
|
||||
assert job.provenance["authority"] == "google"
|
||||
assert job.provenance["service"] == "drive"
|
||||
assert job.contentVersion # modifiedTime ISO string
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
test_bootstrap_walks_drive_and_subfolders()
|
||||
test_bootstrap_reports_duplicates_on_second_run()
|
||||
test_bootstrap_skips_files_older_than_maxAgeDays()
|
||||
test_bootstrap_passes_connection_provenance()
|
||||
print("OK — bootstrapGdrive tests passed")
|
||||
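The fake adapter above models the two calls the bootstrap needs: `browse(path)` returning entries with `isFolder` and a `metadata` dict, and `download(path)`. The depth-limited traversal these tests describe can be sketched as an explicit stack walk, assuming those two methods and a `maxDepth` limit (a sketch, not the connector's actual code):

```python
from typing import Any, Dict, List, Tuple


async def walkDriveSketch(adapter: Any, maxDepth: int = 3) -> List[Dict[str, Any]]:
    """Sketch only: depth-limited walk from the Drive root, collecting file entries."""
    files: List[Dict[str, Any]] = []
    stack: List[Tuple[str, int]] = [("/", 0)]
    while stack:
        path, depth = stack.pop()
        for entry in await adapter.browse(path):
            if entry.isFolder:
                if depth + 1 <= maxDepth:
                    stack.append((entry.path, depth + 1))
                continue
            files.append({
                "id": (entry.metadata or {}).get("id"),
                "path": entry.path,
                "name": entry.name,
                "modifiedTime": (entry.metadata or {}).get("modifiedTime"),
            })
    return files
```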
240 tests/unit/services/test_bootstrap_gmail.py Normal file
|
|
@ -0,0 +1,240 @@
|
|||
#!/usr/bin/env python3
|
||||
# Copyright (c) 2025 Patrick Motsch
|
||||
# All rights reserved.
|
||||
"""Bootstrap Gmail tests with a fake googleGet + knowledge service.
|
||||
|
||||
Verifies:
|
||||
- Default labels (INBOX + SENT) are traversed.
|
||||
- Each message produces a requestIngestion call with sourceKind=gmail_message
|
||||
and structured contentObjects (header / snippet / body).
|
||||
- Pagination via `nextPageToken` is followed.
|
||||
- historyId is forwarded as contentVersion → idempotency.
|
||||
- MIME body extraction walks nested parts (multipart/alternative).
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import base64
|
||||
import os
|
||||
import sys
|
||||
from types import SimpleNamespace
|
||||
|
||||
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "../../.."))
|
||||
|
||||
from modules.serviceCenter.services.serviceKnowledge.subConnectorSyncGmail import (
|
||||
bootstrapGmail,
|
||||
GmailBootstrapLimits,
|
||||
_syntheticMessageId,
|
||||
_buildContentObjects,
|
||||
_walkPayloadForBody,
|
||||
)
|
||||
|
||||
|
||||
def _b64url(text: str) -> str:
|
||||
return base64.urlsafe_b64encode(text.encode("utf-8")).decode("ascii").rstrip("=")
|
||||
|
||||
|
||||
def _msg(mid: str, subject: str = "Hi", body: str = "Hello world", historyId: str = "h1"):
|
||||
return {
|
||||
"id": mid,
|
||||
"threadId": f"thread-{mid}",
|
||||
"historyId": historyId,
|
||||
"internalDate": "1700000000000",
|
||||
"snippet": body[:120],
|
||||
"payload": {
|
||||
"headers": [
|
||||
{"name": "Subject", "value": subject},
|
||||
{"name": "From", "value": "Alice <a@x.com>"},
|
||||
{"name": "To", "value": "Bob <b@x.com>"},
|
||||
{"name": "Date", "value": "Tue, 01 Jan 2025 10:00:00 +0000"},
|
||||
],
|
||||
"mimeType": "text/plain",
|
||||
"body": {"data": _b64url(body), "size": len(body)},
|
||||
"parts": [],
|
||||
},
|
||||
}
|
||||
|
||||
|
||||
class _FakeGoogleGet:
|
||||
"""Records URLs + returns the wired-up page or message response."""
|
||||
|
||||
def __init__(self, messages_by_label, paginated_label=None, page2=None):
|
||||
self._messages = messages_by_label
|
||||
self._paginated = paginated_label
|
||||
self._page2 = page2 or []
|
||||
self._served_first_page = set()
|
||||
self.requested = []
|
||||
|
||||
async def __call__(self, url: str):
|
||||
self.requested.append(url)
|
||||
# List page: contains `/users/me/messages?labelIds=...`
|
||||
if "/users/me/messages?" in url:
|
||||
for label, msgs in self._messages.items():
|
||||
if f"labelIds={label}" in url:
|
||||
if (
|
||||
label == self._paginated
|
||||
and label not in self._served_first_page
|
||||
):
|
||||
self._served_first_page.add(label)
|
||||
return {
|
||||
"messages": [{"id": m["id"]} for m in msgs],
|
||||
"nextPageToken": "token-2",
|
||||
}
|
||||
if label == self._paginated and "pageToken=token-2" in url:
|
||||
return {
|
||||
"messages": [{"id": m["id"]} for m in self._page2],
|
||||
}
|
||||
return {"messages": [{"id": m["id"]} for m in msgs]}
|
||||
return {"messages": []}
|
||||
# Detail fetch: /users/me/messages/{id}?format=full
|
||||
if "/users/me/messages/" in url and "format=full" in url:
|
||||
msgId = url.split("/users/me/messages/")[-1].split("?")[0]
|
||||
for msgs in self._messages.values():
|
||||
for m in msgs:
|
||||
if m["id"] == msgId:
|
||||
return m
|
||||
for m in self._page2:
|
||||
if m["id"] == msgId:
|
||||
return m
|
||||
return {"error": "not found"}
|
||||
|
||||
|
||||
class _FakeKnowledgeService:
|
||||
def __init__(self, duplicateIds=None):
|
||||
self.calls = []
|
||||
self._duplicates = duplicateIds or set()
|
||||
|
||||
async def requestIngestion(self, job):
|
||||
self.calls.append(job)
|
||||
status = "duplicate" if job.sourceId in self._duplicates else "indexed"
|
||||
return SimpleNamespace(
|
||||
jobId=job.sourceId, status=status, contentHash="h",
|
||||
fileId=job.sourceId, index=None, error=None,
|
||||
)
|
||||
|
||||
|
||||
def test_buildContentObjects_emits_header_snippet_body():
|
||||
parts = _buildContentObjects(_msg("m1", body="Hello\nWorld"), maxBodyChars=8000)
|
||||
ids = [p["contentObjectId"] for p in parts]
|
||||
assert ids == ["header", "snippet", "body"]
|
||||
header = parts[0]["data"]
|
||||
assert "Subject: Hi" in header
|
||||
assert "From: Alice <a@x.com>" in header
|
||||
assert "To: Bob <b@x.com>" in header
|
||||
|
||||
|
||||
def test_walkPayloadForBody_prefers_plain_over_html():
|
||||
payload = {
|
||||
"mimeType": "multipart/alternative",
|
||||
"parts": [
|
||||
{"mimeType": "text/plain", "body": {"data": _b64url("plain body")}},
|
||||
{"mimeType": "text/html", "body": {"data": _b64url("<p>html body</p>")}},
|
||||
],
|
||||
}
|
||||
bodies = _walkPayloadForBody(payload)
|
||||
assert bodies["text"] == "plain body"
|
||||
assert bodies["html"] == "<p>html body</p>"
|
||||
|
||||
|
||||
def test_walkPayloadForBody_falls_back_to_html():
|
||||
payload = {
|
||||
"mimeType": "multipart/alternative",
|
||||
"parts": [
|
||||
{"mimeType": "text/html", "body": {"data": _b64url("<p>only html</p>")}},
|
||||
],
|
||||
}
|
||||
bodies = _walkPayloadForBody(payload)
|
||||
assert bodies["text"] == ""
|
||||
assert "only html" in bodies["html"]
|
||||
|
||||
|
||||
def test_bootstrap_gmail_indexes_messages_from_inbox_and_sent():
|
||||
fake_get = _FakeGoogleGet({
|
||||
"INBOX": [_msg("m1"), _msg("m2")],
|
||||
"SENT": [_msg("m3")],
|
||||
})
|
||||
knowledge = _FakeKnowledgeService()
|
||||
connection = SimpleNamespace(mandateId="m1", userId="u1")
|
||||
|
||||
async def _run():
|
||||
return await bootstrapGmail(
|
||||
connectionId="c1",
|
||||
adapter=SimpleNamespace(_token="t"),
|
||||
connection=connection,
|
||||
knowledgeService=knowledge,
|
||||
limits=GmailBootstrapLimits(maxAgeDays=None),
|
||||
googleGetFn=fake_get,
|
||||
)
|
||||
|
||||
result = asyncio.run(_run())
|
||||
assert result["indexed"] == 3
|
||||
sourceIds = {c.sourceId for c in knowledge.calls}
|
||||
assert sourceIds == {
|
||||
_syntheticMessageId("c1", "m1"),
|
||||
_syntheticMessageId("c1", "m2"),
|
||||
_syntheticMessageId("c1", "m3"),
|
||||
}
|
||||
for job in knowledge.calls:
|
||||
assert job.sourceKind == "gmail_message"
|
||||
assert job.mimeType == "message/rfc822"
|
||||
assert job.provenance["connectionId"] == "c1"
|
||||
assert job.provenance["authority"] == "google"
|
||||
assert job.provenance["service"] == "gmail"
|
||||
assert job.contentVersion == "h1"
|
||||
assert any(co["contentObjectId"] == "header" for co in job.contentObjects)
|
||||
|
||||
|
||||
def test_bootstrap_gmail_follows_pagination():
|
||||
fake_get = _FakeGoogleGet(
|
||||
messages_by_label={"INBOX": [_msg("m1")], "SENT": []},
|
||||
paginated_label="INBOX",
|
||||
page2=[_msg("m2"), _msg("m3")],
|
||||
)
|
||||
knowledge = _FakeKnowledgeService()
|
||||
connection = SimpleNamespace(mandateId="m1", userId="u1")
|
||||
|
||||
async def _run():
|
||||
return await bootstrapGmail(
|
||||
connectionId="c1",
|
||||
adapter=SimpleNamespace(_token="t"),
|
||||
connection=connection,
|
||||
knowledgeService=knowledge,
|
||||
limits=GmailBootstrapLimits(maxAgeDays=None),
|
||||
googleGetFn=fake_get,
|
||||
)
|
||||
|
||||
result = asyncio.run(_run())
|
||||
assert result["indexed"] == 3
|
||||
|
||||
|
||||
def test_bootstrap_gmail_reports_duplicates():
|
||||
fake_get = _FakeGoogleGet({"INBOX": [_msg("m1"), _msg("m2")], "SENT": []})
|
||||
duplicates = {
|
||||
_syntheticMessageId("c1", "m1"),
|
||||
_syntheticMessageId("c1", "m2"),
|
||||
}
|
||||
knowledge = _FakeKnowledgeService(duplicateIds=duplicates)
|
||||
connection = SimpleNamespace(mandateId="m1", userId="u1")
|
||||
|
||||
async def _run():
|
||||
return await bootstrapGmail(
|
||||
connectionId="c1",
|
||||
adapter=SimpleNamespace(_token="t"),
|
||||
connection=connection,
|
||||
knowledgeService=knowledge,
|
||||
limits=GmailBootstrapLimits(maxAgeDays=None),
|
||||
googleGetFn=fake_get,
|
||||
)
|
||||
|
||||
result = asyncio.run(_run())
|
||||
assert result["indexed"] == 0
|
||||
assert result["skippedDuplicate"] == 2
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
test_buildContentObjects_emits_header_snippet_body()
|
||||
test_walkPayloadForBody_prefers_plain_over_html()
|
||||
test_walkPayloadForBody_falls_back_to_html()
|
||||
test_bootstrap_gmail_indexes_messages_from_inbox_and_sent()
|
||||
test_bootstrap_gmail_follows_pagination()
|
||||
test_bootstrap_gmail_reports_duplicates()
|
||||
print("OK — bootstrapGmail tests passed")
|
||||
|
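`test_bootstrap_gmail_follows_pagination` above only checks the total count. The list-then-detail loop it exercises can be sketched against the same URL shapes the fake recognises (`/users/me/messages?labelIds=...`, `pageToken=...`, and `format=full` for the detail fetch); the names below are illustrative, not the module's actual internals:

```python
from typing import Any, Callable, Dict, List


async def listMessagesSketch(
    googleGetFn: Callable[[str], Any], label: str, maxPages: int = 10
) -> List[Dict[str, Any]]:
    """Sketch only: follow nextPageToken for one label, then fetch each message in full."""
    base = "https://gmail.googleapis.com/gmail/v1/users/me"
    messages: List[Dict[str, Any]] = []
    pageToken = None
    for _ in range(maxPages):
        url = f"{base}/messages?labelIds={label}&maxResults=100"
        if pageToken:
            url += f"&pageToken={pageToken}"
        page = await googleGetFn(url)
        for stub in page.get("messages", []):
            detail = await googleGetFn(f"{base}/messages/{stub['id']}?format=full")
            if isinstance(detail, dict) and "error" not in detail:
                messages.append(detail)
        pageToken = page.get("nextPageToken")
        if not pageToken:
            break
    return messages
```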
|
@ -99,17 +99,18 @@ def test_onConnectionRevoked_ignores_missing_id(monkeypatch):
|
|||
assert seen == []
|
||||
|
||||
|
||||
def test_bootstrap_job_skips_non_pilot_authority(monkeypatch):
|
||||
def test_bootstrap_job_skips_unsupported_authority(monkeypatch):
|
||||
async def _run():
|
||||
result = await consumer._bootstrapJobHandler(
|
||||
{"payload": {"connectionId": "c1", "authority": "google"}},
|
||||
{"payload": {"connectionId": "c1", "authority": "slack"}},
|
||||
lambda *_: None,
|
||||
)
|
||||
return result
|
||||
|
||||
result = asyncio.run(_run())
|
||||
assert result["skipped"] is True
|
||||
assert result["authority"] == "google"
|
||||
assert result["authority"] == "slack"
|
||||
assert result["reason"] == "unsupported_authority"
|
||||
|
||||
|
||||
def test_bootstrap_job_dispatches_msft_parts(monkeypatch):
|
||||
|
|
@ -123,8 +124,6 @@ def test_bootstrap_job_dispatches_msft_parts(monkeypatch):
|
|||
calls["ol"] += 1
|
||||
return {"indexed": 2}
|
||||
|
||||
# subConnectorSync* are lazy-imported inside the handler; install fake
|
||||
# modules before invoking.
|
||||
fakeSharepoint = types.ModuleType("subConnectorSyncSharepoint")
|
||||
fakeSharepoint.bootstrapSharepoint = _fakeSp
|
||||
fakeOutlook = types.ModuleType("subConnectorSyncOutlook")
|
||||
|
|
@ -152,6 +151,70 @@ def test_bootstrap_job_dispatches_msft_parts(monkeypatch):
|
|||
assert result["outlook"] == {"indexed": 2}
|
||||
|
||||
|
||||
def test_bootstrap_job_dispatches_google_parts(monkeypatch):
|
||||
calls = {"gd": 0, "gm": 0}
|
||||
|
||||
async def _fakeGd(connectionId, progressCb=None):
|
||||
calls["gd"] += 1
|
||||
return {"indexed": 7}
|
||||
|
||||
async def _fakeGm(connectionId, progressCb=None):
|
||||
calls["gm"] += 1
|
||||
return {"indexed": 11}
|
||||
|
||||
fakeGdrive = types.ModuleType("subConnectorSyncGdrive")
|
||||
fakeGdrive.bootstrapGdrive = _fakeGd
|
||||
fakeGmail = types.ModuleType("subConnectorSyncGmail")
|
||||
fakeGmail.bootstrapGmail = _fakeGm
|
||||
monkeypatch.setitem(
|
||||
sys.modules,
|
||||
"modules.serviceCenter.services.serviceKnowledge.subConnectorSyncGdrive",
|
||||
fakeGdrive,
|
||||
)
|
||||
monkeypatch.setitem(
|
||||
sys.modules,
|
||||
"modules.serviceCenter.services.serviceKnowledge.subConnectorSyncGmail",
|
||||
fakeGmail,
|
||||
)
|
||||
|
||||
async def _run():
|
||||
return await consumer._bootstrapJobHandler(
|
||||
{"payload": {"connectionId": "c1", "authority": "google"}},
|
||||
lambda *_: None,
|
||||
)
|
||||
|
||||
result = asyncio.run(_run())
|
||||
assert calls == {"gd": 1, "gm": 1}
|
||||
assert result["drive"] == {"indexed": 7}
|
||||
assert result["gmail"] == {"indexed": 11}
|
||||
|
||||
|
||||
def test_bootstrap_job_dispatches_clickup_part(monkeypatch):
|
||||
calls = {"cu": 0}
|
||||
|
||||
async def _fakeCu(connectionId, progressCb=None):
|
||||
calls["cu"] += 1
|
||||
return {"indexed": 4}
|
||||
|
||||
fakeClickup = types.ModuleType("subConnectorSyncClickup")
|
||||
fakeClickup.bootstrapClickup = _fakeCu
|
||||
monkeypatch.setitem(
|
||||
sys.modules,
|
||||
"modules.serviceCenter.services.serviceKnowledge.subConnectorSyncClickup",
|
||||
fakeClickup,
|
||||
)
|
||||
|
||||
async def _run():
|
||||
return await consumer._bootstrapJobHandler(
|
||||
{"payload": {"connectionId": "c1", "authority": "clickup"}},
|
||||
lambda *_: None,
|
||||
)
|
||||
|
||||
result = asyncio.run(_run())
|
||||
assert calls == {"cu": 1}
|
||||
assert result["clickup"] == {"indexed": 4}
|
||||
|
||||
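The dispatch tests above work because the handler imports its sub-connector modules lazily, so installing a stand-in under the exact module path in `sys.modules` before the call is enough. The same pattern in isolation, with a hypothetical module path and helper name:

```python
import sys
import types


def installFakeModule(monkeypatch, modulePath: str, **attrs) -> types.ModuleType:
    """Sketch only: register a synthetic module so a later `from <modulePath> import ...` resolves to it."""
    fake = types.ModuleType(modulePath.rsplit(".", 1)[-1])
    for name, value in attrs.items():
        setattr(fake, name, value)
    monkeypatch.setitem(sys.modules, modulePath, fake)
    return fake

# usage (hypothetical path):
#   installFakeModule(monkeypatch, "pkg.sub.connector", bootstrapThing=fakeFn)
```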
|
||||
if __name__ == "__main__":
|
||||
# Usable without pytest fixtures for a quick smoke run.
|
||||
class _MP:
|
||||