commit before rebase
This commit is contained in:
parent
6a5ff1ff7c
commit
3add5c9a80
8 changed files with 2278 additions and 18 deletions
|
|
@ -135,6 +135,15 @@ async def _bootstrapJobHandler(
|
||||||
|
|
||||||
progressCb(5, f"resolving {authority} connection")
|
progressCb(5, f"resolving {authority} connection")
|
||||||
|
|
||||||
|
def _normalize(res: Any, label: str) -> Dict[str, Any]:
|
||||||
|
if isinstance(res, Exception):
|
||||||
|
logger.error(
|
||||||
|
"ingestion.connection.bootstrap.failed part=%s connectionId=%s error=%s",
|
||||||
|
label, connectionId, res, exc_info=res,
|
||||||
|
)
|
||||||
|
return {"error": str(res)}
|
||||||
|
return res or {}
|
||||||
|
|
||||||
if authority == "msft":
|
if authority == "msft":
|
||||||
from modules.serviceCenter.services.serviceKnowledge.subConnectorSyncSharepoint import (
|
from modules.serviceCenter.services.serviceKnowledge.subConnectorSyncSharepoint import (
|
||||||
bootstrapSharepoint,
|
bootstrapSharepoint,
|
||||||
|
|
@ -149,16 +158,6 @@ async def _bootstrapJobHandler(
|
||||||
bootstrapOutlook(connectionId=connectionId, progressCb=progressCb),
|
bootstrapOutlook(connectionId=connectionId, progressCb=progressCb),
|
||||||
return_exceptions=True,
|
return_exceptions=True,
|
||||||
)
|
)
|
||||||
|
|
||||||
def _normalize(res: Any, label: str) -> Dict[str, Any]:
|
|
||||||
if isinstance(res, Exception):
|
|
||||||
logger.error(
|
|
||||||
"ingestion.connection.bootstrap.failed part=%s connectionId=%s error=%s",
|
|
||||||
label, connectionId, res, exc_info=res,
|
|
||||||
)
|
|
||||||
return {"error": str(res)}
|
|
||||||
return res or {}
|
|
||||||
|
|
||||||
return {
|
return {
|
||||||
"connectionId": connectionId,
|
"connectionId": connectionId,
|
||||||
"authority": authority,
|
"authority": authority,
|
||||||
|
|
@ -166,21 +165,55 @@ async def _bootstrapJobHandler(
|
||||||
"outlook": _normalize(olResult, "outlook"),
|
"outlook": _normalize(olResult, "outlook"),
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if authority == "google":
|
||||||
|
from modules.serviceCenter.services.serviceKnowledge.subConnectorSyncGdrive import (
|
||||||
|
bootstrapGdrive,
|
||||||
|
)
|
||||||
|
from modules.serviceCenter.services.serviceKnowledge.subConnectorSyncGmail import (
|
||||||
|
bootstrapGmail,
|
||||||
|
)
|
||||||
|
|
||||||
|
progressCb(10, "drive + gmail")
|
||||||
|
gdResult, gmResult = await asyncio.gather(
|
||||||
|
bootstrapGdrive(connectionId=connectionId, progressCb=progressCb),
|
||||||
|
bootstrapGmail(connectionId=connectionId, progressCb=progressCb),
|
||||||
|
return_exceptions=True,
|
||||||
|
)
|
||||||
|
return {
|
||||||
|
"connectionId": connectionId,
|
||||||
|
"authority": authority,
|
||||||
|
"drive": _normalize(gdResult, "gdrive"),
|
||||||
|
"gmail": _normalize(gmResult, "gmail"),
|
||||||
|
}
|
||||||
|
|
||||||
|
if authority == "clickup":
|
||||||
|
from modules.serviceCenter.services.serviceKnowledge.subConnectorSyncClickup import (
|
||||||
|
bootstrapClickup,
|
||||||
|
)
|
||||||
|
|
||||||
|
progressCb(10, "clickup tasks")
|
||||||
|
cuResult = await bootstrapClickup(connectionId=connectionId, progressCb=progressCb)
|
||||||
|
return {
|
||||||
|
"connectionId": connectionId,
|
||||||
|
"authority": authority,
|
||||||
|
"clickup": _normalize(cuResult, "clickup"),
|
||||||
|
}
|
||||||
|
|
||||||
logger.info(
|
logger.info(
|
||||||
"ingestion.connection.bootstrap.skipped reason=P1_pilot_scope authority=%s connectionId=%s",
|
"ingestion.connection.bootstrap.skipped reason=unsupported_authority authority=%s connectionId=%s",
|
||||||
authority, connectionId,
|
authority, connectionId,
|
||||||
extra={
|
extra={
|
||||||
"event": "ingestion.connection.bootstrap.skipped",
|
"event": "ingestion.connection.bootstrap.skipped",
|
||||||
"authority": authority,
|
"authority": authority,
|
||||||
"connectionId": connectionId,
|
"connectionId": connectionId,
|
||||||
"reason": "P1_pilot_scope",
|
"reason": "unsupported_authority",
|
||||||
},
|
},
|
||||||
)
|
)
|
||||||
return {
|
return {
|
||||||
"connectionId": connectionId,
|
"connectionId": connectionId,
|
||||||
"authority": authority,
|
"authority": authority,
|
||||||
"skipped": True,
|
"skipped": True,
|
||||||
"reason": "P1_pilot_scope",
|
"reason": "unsupported_authority",
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,489 @@
|
||||||
|
# Copyright (c) 2025 Patrick Motsch
|
||||||
|
# All rights reserved.
|
||||||
|
"""ClickUp bootstrap for the unified knowledge ingestion lane.
|
||||||
|
|
||||||
|
ClickUp tasks are ingested as *virtual documents* — we never download file
|
||||||
|
bytes. Each task becomes a `sourceKind="clickup_task"` IngestionJob whose
|
||||||
|
`contentObjects` carry a summary header (name + status + metadata) and the
|
||||||
|
task description / text content so retrieval finds them without a live API
|
||||||
|
call.
|
||||||
|
|
||||||
|
Hierarchy traversal: workspace (team) → spaces → folders / folderless lists →
|
||||||
|
tasks. We cap the fan-out with `maxWorkspaces` / `maxListsPerWorkspace` /
|
||||||
|
`maxTasks` and skip tasks older than `maxAgeDays` (default 180 d).
|
||||||
|
|
||||||
|
Idempotency: `date_updated` from the ClickUp task payload is a millisecond
|
||||||
|
timestamp and strictly monotonic per revision — used as `contentVersion`.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import hashlib
|
||||||
|
import logging
|
||||||
|
import time
|
||||||
|
from dataclasses import dataclass, field
|
||||||
|
from datetime import datetime, timedelta, timezone
|
||||||
|
from typing import Any, Callable, Dict, List, Optional
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
MAX_TASKS_DEFAULT = 500
|
||||||
|
MAX_WORKSPACES_DEFAULT = 3
|
||||||
|
MAX_LISTS_PER_WORKSPACE_DEFAULT = 20
|
||||||
|
MAX_DESCRIPTION_CHARS_DEFAULT = 8000
|
||||||
|
MAX_AGE_DAYS_DEFAULT = 180
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
|
||||||
|
class ClickupBootstrapLimits:
|
||||||
|
maxTasks: int = MAX_TASKS_DEFAULT
|
||||||
|
maxWorkspaces: int = MAX_WORKSPACES_DEFAULT
|
||||||
|
maxListsPerWorkspace: int = MAX_LISTS_PER_WORKSPACE_DEFAULT
|
||||||
|
maxDescriptionChars: int = MAX_DESCRIPTION_CHARS_DEFAULT
|
||||||
|
# Only ingest tasks updated within the last N days. None disables filter.
|
||||||
|
maxAgeDays: Optional[int] = MAX_AGE_DAYS_DEFAULT
|
||||||
|
# Include closed/archived tasks if they still meet the recency filter.
|
||||||
|
# ClickUp `closed` tasks often carry the most useful RAG context
|
||||||
|
# ("why was this shipped the way it was?").
|
||||||
|
includeClosed: bool = True
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
|
||||||
|
class ClickupBootstrapResult:
|
||||||
|
connectionId: str
|
||||||
|
indexed: int = 0
|
||||||
|
skippedDuplicate: int = 0
|
||||||
|
skippedPolicy: int = 0
|
||||||
|
failed: int = 0
|
||||||
|
workspaces: int = 0
|
||||||
|
lists: int = 0
|
||||||
|
errors: List[str] = field(default_factory=list)
|
||||||
|
|
||||||
|
|
||||||
|
def _syntheticTaskId(connectionId: str, taskId: str) -> str:
|
||||||
|
token = hashlib.sha256(f"{connectionId}:{taskId}".encode("utf-8")).hexdigest()[:16]
|
||||||
|
return f"cu:{connectionId[:8]}:{token}"
|
||||||
|
|
||||||
|
|
||||||
|
def _truncate(value: Any, limit: int) -> str:
|
||||||
|
text = str(value or "").strip()
|
||||||
|
if not text:
|
||||||
|
return ""
|
||||||
|
if len(text) <= limit:
|
||||||
|
return text
|
||||||
|
return text[:limit].rstrip() + "\n[truncated]"
|
||||||
|
|
||||||
|
|
||||||
|
def _isRecent(dateUpdatedMs: Any, maxAgeDays: Optional[int]) -> bool:
|
||||||
|
if not maxAgeDays:
|
||||||
|
return True
|
||||||
|
if not dateUpdatedMs:
|
||||||
|
return True
|
||||||
|
try:
|
||||||
|
ts = datetime.fromtimestamp(int(dateUpdatedMs) / 1000.0, tz=timezone.utc)
|
||||||
|
except Exception:
|
||||||
|
return True
|
||||||
|
cutoff = datetime.now(timezone.utc) - timedelta(days=maxAgeDays)
|
||||||
|
return ts >= cutoff
|
||||||
|
|
||||||
|
|
||||||
|
def _buildContentObjects(task: Dict[str, Any], limits: ClickupBootstrapLimits) -> List[Dict[str, Any]]:
|
||||||
|
"""Header (name/status/metadata) + description + text_content, all text."""
|
||||||
|
name = task.get("name") or f"Task {task.get('id', '')}"
|
||||||
|
status = ((task.get("status") or {}).get("status")) or ""
|
||||||
|
assignees = ", ".join(
|
||||||
|
filter(None, [
|
||||||
|
(a.get("username") or a.get("email") or "")
|
||||||
|
for a in (task.get("assignees") or [])
|
||||||
|
])
|
||||||
|
)
|
||||||
|
tags = ", ".join(filter(None, [t.get("name", "") for t in (task.get("tags") or [])]))
|
||||||
|
listInfo = task.get("list") or {}
|
||||||
|
folderInfo = task.get("folder") or {}
|
||||||
|
spaceInfo = task.get("space") or {}
|
||||||
|
dueMs = task.get("due_date")
|
||||||
|
dueIso = ""
|
||||||
|
if dueMs:
|
||||||
|
try:
|
||||||
|
dueIso = datetime.fromtimestamp(int(dueMs) / 1000.0, tz=timezone.utc).strftime("%Y-%m-%d")
|
||||||
|
except Exception:
|
||||||
|
dueIso = ""
|
||||||
|
|
||||||
|
headerLines = [
|
||||||
|
f"Task: {name}",
|
||||||
|
f"Status: {status}" if status else "",
|
||||||
|
f"List: {listInfo.get('name', '')}" if listInfo else "",
|
||||||
|
f"Folder: {folderInfo.get('name', '')}" if folderInfo else "",
|
||||||
|
f"Space: {spaceInfo.get('name', '')}" if spaceInfo else "",
|
||||||
|
f"Assignees: {assignees}" if assignees else "",
|
||||||
|
f"Tags: {tags}" if tags else "",
|
||||||
|
f"Due: {dueIso}" if dueIso else "",
|
||||||
|
f"Url: {task.get('url', '')}" if task.get("url") else "",
|
||||||
|
]
|
||||||
|
header = "\n".join(line for line in headerLines if line)
|
||||||
|
|
||||||
|
parts: List[Dict[str, Any]] = [{
|
||||||
|
"contentObjectId": "header",
|
||||||
|
"contentType": "text",
|
||||||
|
"data": header,
|
||||||
|
"contextRef": {"part": "header"},
|
||||||
|
}]
|
||||||
|
|
||||||
|
description = _truncate(task.get("description"), limits.maxDescriptionChars)
|
||||||
|
if description:
|
||||||
|
parts.append({
|
||||||
|
"contentObjectId": "description",
|
||||||
|
"contentType": "text",
|
||||||
|
"data": description,
|
||||||
|
"contextRef": {"part": "description"},
|
||||||
|
})
|
||||||
|
# text_content is ClickUp's rendered-markdown version; include if it adds
|
||||||
|
# something beyond the plain description (common for bullet lists, checklists).
|
||||||
|
textContent = _truncate(task.get("text_content"), limits.maxDescriptionChars)
|
||||||
|
if textContent and textContent != description:
|
||||||
|
parts.append({
|
||||||
|
"contentObjectId": "text_content",
|
||||||
|
"contentType": "text",
|
||||||
|
"data": textContent,
|
||||||
|
"contextRef": {"part": "text_content"},
|
||||||
|
})
|
||||||
|
return parts
|
||||||
|
|
||||||
|
|
||||||
|
async def bootstrapClickup(
|
||||||
|
connectionId: str,
|
||||||
|
*,
|
||||||
|
progressCb: Optional[Callable[[int, Optional[str]], None]] = None,
|
||||||
|
adapter: Any = None,
|
||||||
|
connection: Any = None,
|
||||||
|
knowledgeService: Any = None,
|
||||||
|
limits: Optional[ClickupBootstrapLimits] = None,
|
||||||
|
) -> Dict[str, Any]:
|
||||||
|
"""Walk workspaces → lists → tasks and ingest each task as a virtual doc."""
|
||||||
|
limits = limits or ClickupBootstrapLimits()
|
||||||
|
startMs = time.time()
|
||||||
|
result = ClickupBootstrapResult(connectionId=connectionId)
|
||||||
|
|
||||||
|
logger.info(
|
||||||
|
"ingestion.connection.bootstrap.started part=clickup connectionId=%s",
|
||||||
|
connectionId,
|
||||||
|
extra={
|
||||||
|
"event": "ingestion.connection.bootstrap.started",
|
||||||
|
"part": "clickup",
|
||||||
|
"connectionId": connectionId,
|
||||||
|
},
|
||||||
|
)
|
||||||
|
|
||||||
|
if adapter is None or knowledgeService is None or connection is None:
|
||||||
|
adapter, connection, knowledgeService = await _resolveDependencies(connectionId)
|
||||||
|
|
||||||
|
mandateId = str(getattr(connection, "mandateId", "") or "") if connection is not None else ""
|
||||||
|
userId = str(getattr(connection, "userId", "") or "") if connection is not None else ""
|
||||||
|
|
||||||
|
svc = getattr(adapter, "_svc", None)
|
||||||
|
if svc is None:
|
||||||
|
result.errors.append("adapter missing _svc instance")
|
||||||
|
return _finalizeResult(connectionId, result, startMs)
|
||||||
|
|
||||||
|
try:
|
||||||
|
teamsResp = await svc.getAuthorizedTeams()
|
||||||
|
except Exception as exc:
|
||||||
|
logger.error("clickup team discovery failed for %s: %s", connectionId, exc, exc_info=True)
|
||||||
|
result.errors.append(f"teams: {exc}")
|
||||||
|
return _finalizeResult(connectionId, result, startMs)
|
||||||
|
|
||||||
|
teams = (teamsResp or {}).get("teams") or []
|
||||||
|
for team in teams[: limits.maxWorkspaces]:
|
||||||
|
if result.indexed + result.skippedDuplicate >= limits.maxTasks:
|
||||||
|
break
|
||||||
|
teamId = str(team.get("id", "") or "")
|
||||||
|
if not teamId:
|
||||||
|
continue
|
||||||
|
result.workspaces += 1
|
||||||
|
try:
|
||||||
|
await _walkTeam(
|
||||||
|
svc=svc,
|
||||||
|
knowledgeService=knowledgeService,
|
||||||
|
connectionId=connectionId,
|
||||||
|
mandateId=mandateId,
|
||||||
|
userId=userId,
|
||||||
|
team=team,
|
||||||
|
limits=limits,
|
||||||
|
result=result,
|
||||||
|
progressCb=progressCb,
|
||||||
|
)
|
||||||
|
except Exception as exc:
|
||||||
|
logger.error("clickup team %s walk failed: %s", teamId, exc, exc_info=True)
|
||||||
|
result.errors.append(f"team({teamId}): {exc}")
|
||||||
|
|
||||||
|
return _finalizeResult(connectionId, result, startMs)
|
||||||
|
|
||||||
|
|
||||||
|
async def _resolveDependencies(connectionId: str):
|
||||||
|
from modules.interfaces.interfaceDbApp import getRootInterface
|
||||||
|
from modules.auth import TokenManager
|
||||||
|
from modules.connectors.providerClickup.connectorClickup import ClickupConnector
|
||||||
|
from modules.serviceCenter import getService
|
||||||
|
from modules.serviceCenter.context import ServiceCenterContext
|
||||||
|
from modules.security.rootAccess import getRootUser
|
||||||
|
|
||||||
|
rootInterface = getRootInterface()
|
||||||
|
connection = rootInterface.getUserConnectionById(connectionId)
|
||||||
|
if connection is None:
|
||||||
|
raise ValueError(f"UserConnection not found: {connectionId}")
|
||||||
|
|
||||||
|
token = TokenManager().getFreshToken(connectionId)
|
||||||
|
if not token or not token.tokenAccess:
|
||||||
|
raise ValueError(f"No valid token for connection {connectionId}")
|
||||||
|
|
||||||
|
provider = ClickupConnector(connection, token.tokenAccess)
|
||||||
|
adapter = provider.getServiceAdapter("clickup")
|
||||||
|
|
||||||
|
rootUser = getRootUser()
|
||||||
|
ctx = ServiceCenterContext(
|
||||||
|
user=rootUser,
|
||||||
|
mandate_id=str(getattr(connection, "mandateId", "") or ""),
|
||||||
|
)
|
||||||
|
knowledgeService = getService("knowledge", ctx)
|
||||||
|
return adapter, connection, knowledgeService
|
||||||
|
|
||||||
|
|
||||||
|
async def _walkTeam(
|
||||||
|
*,
|
||||||
|
svc,
|
||||||
|
knowledgeService,
|
||||||
|
connectionId: str,
|
||||||
|
mandateId: str,
|
||||||
|
userId: str,
|
||||||
|
team: Dict[str, Any],
|
||||||
|
limits: ClickupBootstrapLimits,
|
||||||
|
result: ClickupBootstrapResult,
|
||||||
|
progressCb: Optional[Callable[[int, Optional[str]], None]],
|
||||||
|
) -> None:
|
||||||
|
teamId = str(team.get("id", "") or "")
|
||||||
|
spacesResp = await svc.getSpaces(teamId)
|
||||||
|
spaces = (spacesResp or {}).get("spaces") or []
|
||||||
|
|
||||||
|
listsCollected: List[Dict[str, Any]] = []
|
||||||
|
for space in spaces:
|
||||||
|
if len(listsCollected) >= limits.maxListsPerWorkspace:
|
||||||
|
break
|
||||||
|
spaceId = str(space.get("id", "") or "")
|
||||||
|
if not spaceId:
|
||||||
|
continue
|
||||||
|
|
||||||
|
# Folderless lists directly under the space
|
||||||
|
folderless = await svc.getFolderlessLists(spaceId)
|
||||||
|
for lst in (folderless or {}).get("lists") or []:
|
||||||
|
if len(listsCollected) >= limits.maxListsPerWorkspace:
|
||||||
|
break
|
||||||
|
listsCollected.append({**lst, "_space": space})
|
||||||
|
|
||||||
|
# Lists inside folders
|
||||||
|
foldersResp = await svc.getFolders(spaceId)
|
||||||
|
for folder in (foldersResp or {}).get("folders") or []:
|
||||||
|
if len(listsCollected) >= limits.maxListsPerWorkspace:
|
||||||
|
break
|
||||||
|
folderId = str(folder.get("id", "") or "")
|
||||||
|
if not folderId:
|
||||||
|
continue
|
||||||
|
folderLists = await svc.getListsInFolder(folderId)
|
||||||
|
for lst in (folderLists or {}).get("lists") or []:
|
||||||
|
if len(listsCollected) >= limits.maxListsPerWorkspace:
|
||||||
|
break
|
||||||
|
listsCollected.append({**lst, "_space": space, "_folder": folder})
|
||||||
|
|
||||||
|
for lst in listsCollected:
|
||||||
|
if result.indexed + result.skippedDuplicate >= limits.maxTasks:
|
||||||
|
return
|
||||||
|
result.lists += 1
|
||||||
|
await _walkList(
|
||||||
|
svc=svc,
|
||||||
|
knowledgeService=knowledgeService,
|
||||||
|
connectionId=connectionId,
|
||||||
|
mandateId=mandateId,
|
||||||
|
userId=userId,
|
||||||
|
teamId=teamId,
|
||||||
|
lst=lst,
|
||||||
|
limits=limits,
|
||||||
|
result=result,
|
||||||
|
progressCb=progressCb,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
async def _walkList(
|
||||||
|
*,
|
||||||
|
svc,
|
||||||
|
knowledgeService,
|
||||||
|
connectionId: str,
|
||||||
|
mandateId: str,
|
||||||
|
userId: str,
|
||||||
|
teamId: str,
|
||||||
|
lst: Dict[str, Any],
|
||||||
|
limits: ClickupBootstrapLimits,
|
||||||
|
result: ClickupBootstrapResult,
|
||||||
|
progressCb: Optional[Callable[[int, Optional[str]], None]],
|
||||||
|
) -> None:
|
||||||
|
listId = str(lst.get("id", "") or "")
|
||||||
|
if not listId:
|
||||||
|
return
|
||||||
|
page = 0
|
||||||
|
while result.indexed + result.skippedDuplicate < limits.maxTasks:
|
||||||
|
resp = await svc.getTasksInList(
|
||||||
|
listId,
|
||||||
|
page=page,
|
||||||
|
include_closed=limits.includeClosed,
|
||||||
|
subtasks=True,
|
||||||
|
)
|
||||||
|
if isinstance(resp, dict) and resp.get("error"):
|
||||||
|
logger.warning("clickup tasks list=%s page=%d error: %s", listId, page, resp.get("error"))
|
||||||
|
result.errors.append(f"list({listId}): {resp.get('error')}")
|
||||||
|
return
|
||||||
|
tasks = (resp or {}).get("tasks") or []
|
||||||
|
if not tasks:
|
||||||
|
return
|
||||||
|
|
||||||
|
for task in tasks:
|
||||||
|
if result.indexed + result.skippedDuplicate >= limits.maxTasks:
|
||||||
|
return
|
||||||
|
if not _isRecent(task.get("date_updated"), limits.maxAgeDays):
|
||||||
|
result.skippedPolicy += 1
|
||||||
|
continue
|
||||||
|
# Inject the list/folder/space metadata we already loaded.
|
||||||
|
task["list"] = task.get("list") or {"id": listId, "name": lst.get("name")}
|
||||||
|
task["folder"] = task.get("folder") or lst.get("_folder") or {}
|
||||||
|
task["space"] = task.get("space") or lst.get("_space") or {}
|
||||||
|
await _ingestTask(
|
||||||
|
knowledgeService=knowledgeService,
|
||||||
|
connectionId=connectionId,
|
||||||
|
mandateId=mandateId,
|
||||||
|
userId=userId,
|
||||||
|
teamId=teamId,
|
||||||
|
task=task,
|
||||||
|
limits=limits,
|
||||||
|
result=result,
|
||||||
|
progressCb=progressCb,
|
||||||
|
)
|
||||||
|
|
||||||
|
if len(tasks) < 100: # ClickUp page-size hint: fewer than 100 => last page
|
||||||
|
return
|
||||||
|
page += 1
|
||||||
|
|
||||||
|
|
||||||
|
async def _ingestTask(
|
||||||
|
*,
|
||||||
|
knowledgeService,
|
||||||
|
connectionId: str,
|
||||||
|
mandateId: str,
|
||||||
|
userId: str,
|
||||||
|
teamId: str,
|
||||||
|
task: Dict[str, Any],
|
||||||
|
limits: ClickupBootstrapLimits,
|
||||||
|
result: ClickupBootstrapResult,
|
||||||
|
progressCb: Optional[Callable[[int, Optional[str]], None]],
|
||||||
|
) -> None:
|
||||||
|
from modules.serviceCenter.services.serviceKnowledge.mainServiceKnowledge import IngestionJob
|
||||||
|
|
||||||
|
taskId = str(task.get("id", "") or "")
|
||||||
|
if not taskId:
|
||||||
|
result.skippedPolicy += 1
|
||||||
|
return
|
||||||
|
revision = str(task.get("date_updated") or task.get("date_created") or "")
|
||||||
|
name = task.get("name") or f"Task {taskId}"
|
||||||
|
syntheticId = _syntheticTaskId(connectionId, taskId)
|
||||||
|
fileName = f"{name[:80].strip() or taskId}.task.json"
|
||||||
|
|
||||||
|
contentObjects = _buildContentObjects(task, limits)
|
||||||
|
|
||||||
|
try:
|
||||||
|
handle = await knowledgeService.requestIngestion(
|
||||||
|
IngestionJob(
|
||||||
|
sourceKind="clickup_task",
|
||||||
|
sourceId=syntheticId,
|
||||||
|
fileName=fileName,
|
||||||
|
mimeType="application/vnd.clickup.task+json",
|
||||||
|
userId=userId,
|
||||||
|
mandateId=mandateId,
|
||||||
|
contentObjects=contentObjects,
|
||||||
|
contentVersion=revision or None,
|
||||||
|
provenance={
|
||||||
|
"connectionId": connectionId,
|
||||||
|
"authority": "clickup",
|
||||||
|
"service": "clickup",
|
||||||
|
"externalItemId": taskId,
|
||||||
|
"teamId": teamId,
|
||||||
|
"listId": ((task.get("list") or {}).get("id")),
|
||||||
|
"spaceId": ((task.get("space") or {}).get("id")),
|
||||||
|
"url": task.get("url"),
|
||||||
|
"status": ((task.get("status") or {}).get("status")),
|
||||||
|
"tier": "body",
|
||||||
|
},
|
||||||
|
)
|
||||||
|
)
|
||||||
|
except Exception as exc:
|
||||||
|
logger.error("clickup ingestion %s failed: %s", taskId, exc, exc_info=True)
|
||||||
|
result.failed += 1
|
||||||
|
result.errors.append(f"ingest({taskId}): {exc}")
|
||||||
|
return
|
||||||
|
|
||||||
|
if handle.status == "duplicate":
|
||||||
|
result.skippedDuplicate += 1
|
||||||
|
elif handle.status == "indexed":
|
||||||
|
result.indexed += 1
|
||||||
|
else:
|
||||||
|
result.failed += 1
|
||||||
|
|
||||||
|
if progressCb is not None and (result.indexed + result.skippedDuplicate) % 50 == 0:
|
||||||
|
processed = result.indexed + result.skippedDuplicate
|
||||||
|
try:
|
||||||
|
progressCb(
|
||||||
|
min(90, 10 + int(80 * processed / max(1, limits.maxTasks))),
|
||||||
|
f"clickup processed={processed}",
|
||||||
|
)
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
logger.info(
|
||||||
|
"ingestion.connection.bootstrap.progress part=clickup processed=%d skippedDup=%d failed=%d",
|
||||||
|
processed, result.skippedDuplicate, result.failed,
|
||||||
|
extra={
|
||||||
|
"event": "ingestion.connection.bootstrap.progress",
|
||||||
|
"part": "clickup",
|
||||||
|
"connectionId": connectionId,
|
||||||
|
"processed": processed,
|
||||||
|
"skippedDup": result.skippedDuplicate,
|
||||||
|
"failed": result.failed,
|
||||||
|
},
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def _finalizeResult(connectionId: str, result: ClickupBootstrapResult, startMs: float) -> Dict[str, Any]:
|
||||||
|
durationMs = int((time.time() - startMs) * 1000)
|
||||||
|
logger.info(
|
||||||
|
"ingestion.connection.bootstrap.done part=clickup connectionId=%s indexed=%d skippedDup=%d skippedPolicy=%d failed=%d workspaces=%d lists=%d durationMs=%d",
|
||||||
|
connectionId,
|
||||||
|
result.indexed, result.skippedDuplicate, result.skippedPolicy,
|
||||||
|
result.failed, result.workspaces, result.lists, durationMs,
|
||||||
|
extra={
|
||||||
|
"event": "ingestion.connection.bootstrap.done",
|
||||||
|
"part": "clickup",
|
||||||
|
"connectionId": connectionId,
|
||||||
|
"indexed": result.indexed,
|
||||||
|
"skippedDup": result.skippedDuplicate,
|
||||||
|
"skippedPolicy": result.skippedPolicy,
|
||||||
|
"failed": result.failed,
|
||||||
|
"workspaces": result.workspaces,
|
||||||
|
"lists": result.lists,
|
||||||
|
"durationMs": durationMs,
|
||||||
|
},
|
||||||
|
)
|
||||||
|
return {
|
||||||
|
"connectionId": result.connectionId,
|
||||||
|
"indexed": result.indexed,
|
||||||
|
"skippedDuplicate": result.skippedDuplicate,
|
||||||
|
"skippedPolicy": result.skippedPolicy,
|
||||||
|
"failed": result.failed,
|
||||||
|
"workspaces": result.workspaces,
|
||||||
|
"lists": result.lists,
|
||||||
|
"durationMs": durationMs,
|
||||||
|
"errors": result.errors[:20],
|
||||||
|
}
|
||||||
|
|
@ -0,0 +1,429 @@
|
||||||
|
# Copyright (c) 2025 Patrick Motsch
|
||||||
|
# All rights reserved.
|
||||||
|
"""Google Drive bootstrap for the unified knowledge ingestion lane.
|
||||||
|
|
||||||
|
Mirrors the SharePoint pilot (see subConnectorSyncSharepoint.py). Walks the
|
||||||
|
user's *My Drive* tree from the virtual `root` folder, downloads each
|
||||||
|
file-like item via `DriveAdapter.download` (which handles native Google docs
|
||||||
|
via export), runs the standard extraction pipeline and routes results through
|
||||||
|
`KnowledgeService.requestIngestion` with `sourceKind="gdrive_item"` and
|
||||||
|
`contentVersion = modifiedTime` (monotonic per-revision).
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import hashlib
|
||||||
|
import logging
|
||||||
|
import time
|
||||||
|
from dataclasses import dataclass, field
|
||||||
|
from datetime import datetime, timedelta, timezone
|
||||||
|
from typing import Any, Callable, Dict, List, Optional
|
||||||
|
|
||||||
|
from modules.datamodels.datamodelExtraction import ExtractionOptions
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
MAX_ITEMS_DEFAULT = 500
|
||||||
|
MAX_BYTES_DEFAULT = 200 * 1024 * 1024
|
||||||
|
MAX_FILE_SIZE_DEFAULT = 25 * 1024 * 1024
|
||||||
|
SKIP_MIME_PREFIXES_DEFAULT = ("video/", "audio/")
|
||||||
|
MAX_DEPTH_DEFAULT = 4
|
||||||
|
MAX_AGE_DAYS_DEFAULT = 365
|
||||||
|
|
||||||
|
# Google Drive uses virtual mime-types for folders and non-downloadable assets.
|
||||||
|
FOLDER_MIME = "application/vnd.google-apps.folder"
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
|
||||||
|
class GdriveBootstrapLimits:
|
||||||
|
maxItems: int = MAX_ITEMS_DEFAULT
|
||||||
|
maxBytes: int = MAX_BYTES_DEFAULT
|
||||||
|
maxFileSize: int = MAX_FILE_SIZE_DEFAULT
|
||||||
|
skipMimePrefixes: tuple = SKIP_MIME_PREFIXES_DEFAULT
|
||||||
|
maxDepth: int = MAX_DEPTH_DEFAULT
|
||||||
|
# Only ingest files modified within the last N days. None disables filter.
|
||||||
|
maxAgeDays: Optional[int] = MAX_AGE_DAYS_DEFAULT
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
|
||||||
|
class GdriveBootstrapResult:
|
||||||
|
connectionId: str
|
||||||
|
indexed: int = 0
|
||||||
|
skippedDuplicate: int = 0
|
||||||
|
skippedPolicy: int = 0
|
||||||
|
failed: int = 0
|
||||||
|
bytesProcessed: int = 0
|
||||||
|
errors: List[str] = field(default_factory=list)
|
||||||
|
|
||||||
|
|
||||||
|
def _syntheticFileId(connectionId: str, externalItemId: str) -> str:
|
||||||
|
token = hashlib.sha256(f"{connectionId}:{externalItemId}".encode("utf-8")).hexdigest()[:16]
|
||||||
|
return f"gd:{connectionId[:8]}:{token}"
|
||||||
|
|
||||||
|
|
||||||
|
def _toContentObjects(extracted, fileName: str) -> List[Dict[str, Any]]:
|
||||||
|
parts = getattr(extracted, "parts", None) or []
|
||||||
|
out: List[Dict[str, Any]] = []
|
||||||
|
for part in parts:
|
||||||
|
data = getattr(part, "data", None) or ""
|
||||||
|
if not data or not str(data).strip():
|
||||||
|
continue
|
||||||
|
typeGroup = getattr(part, "typeGroup", "text") or "text"
|
||||||
|
contentType = "text"
|
||||||
|
if typeGroup == "image":
|
||||||
|
contentType = "image"
|
||||||
|
elif typeGroup in ("binary", "container"):
|
||||||
|
contentType = "other"
|
||||||
|
out.append({
|
||||||
|
"contentObjectId": getattr(part, "id", ""),
|
||||||
|
"contentType": contentType,
|
||||||
|
"data": data,
|
||||||
|
"contextRef": {
|
||||||
|
"containerPath": fileName,
|
||||||
|
"location": getattr(part, "label", None) or "file",
|
||||||
|
**(getattr(part, "metadata", None) or {}),
|
||||||
|
},
|
||||||
|
})
|
||||||
|
return out
|
||||||
|
|
||||||
|
|
||||||
|
def _isRecent(modifiedIso: Optional[str], maxAgeDays: Optional[int]) -> bool:
|
||||||
|
if not maxAgeDays:
|
||||||
|
return True
|
||||||
|
if not modifiedIso:
|
||||||
|
# No timestamp -> be permissive (Drive native docs sometimes omit it on export).
|
||||||
|
return True
|
||||||
|
try:
|
||||||
|
# Google returns RFC 3339 with `Z` or offset; python 3.11+ parses both.
|
||||||
|
ts = datetime.fromisoformat(modifiedIso.replace("Z", "+00:00"))
|
||||||
|
except Exception:
|
||||||
|
return True
|
||||||
|
cutoff = datetime.now(timezone.utc) - timedelta(days=maxAgeDays)
|
||||||
|
if ts.tzinfo is None:
|
||||||
|
ts = ts.replace(tzinfo=timezone.utc)
|
||||||
|
return ts >= cutoff
|
||||||
|
|
||||||
|
|
||||||
|
async def bootstrapGdrive(
|
||||||
|
connectionId: str,
|
||||||
|
*,
|
||||||
|
progressCb: Optional[Callable[[int, Optional[str]], None]] = None,
|
||||||
|
adapter: Any = None,
|
||||||
|
connection: Any = None,
|
||||||
|
knowledgeService: Any = None,
|
||||||
|
limits: Optional[GdriveBootstrapLimits] = None,
|
||||||
|
runExtractionFn: Optional[Callable[..., Any]] = None,
|
||||||
|
) -> Dict[str, Any]:
|
||||||
|
"""Walk My Drive starting from the virtual root folder."""
|
||||||
|
limits = limits or GdriveBootstrapLimits()
|
||||||
|
startMs = time.time()
|
||||||
|
result = GdriveBootstrapResult(connectionId=connectionId)
|
||||||
|
|
||||||
|
logger.info(
|
||||||
|
"ingestion.connection.bootstrap.started part=gdrive connectionId=%s",
|
||||||
|
connectionId,
|
||||||
|
extra={
|
||||||
|
"event": "ingestion.connection.bootstrap.started",
|
||||||
|
"part": "gdrive",
|
||||||
|
"connectionId": connectionId,
|
||||||
|
},
|
||||||
|
)
|
||||||
|
|
||||||
|
if adapter is None or knowledgeService is None or connection is None:
|
||||||
|
adapter, connection, knowledgeService = await _resolveDependencies(connectionId)
|
||||||
|
if runExtractionFn is None:
|
||||||
|
from modules.serviceCenter.services.serviceExtraction.subPipeline import runExtraction
|
||||||
|
from modules.serviceCenter.services.serviceExtraction.subRegistry import (
|
||||||
|
ExtractorRegistry, ChunkerRegistry,
|
||||||
|
)
|
||||||
|
extractorRegistry = ExtractorRegistry()
|
||||||
|
chunkerRegistry = ChunkerRegistry()
|
||||||
|
|
||||||
|
def runExtractionFn(bytesData, name, mime, options): # type: ignore[no-redef]
|
||||||
|
return runExtraction(extractorRegistry, chunkerRegistry, bytesData, name, mime, options)
|
||||||
|
|
||||||
|
mandateId = str(getattr(connection, "mandateId", "") or "") if connection is not None else ""
|
||||||
|
userId = str(getattr(connection, "userId", "") or "") if connection is not None else ""
|
||||||
|
|
||||||
|
try:
|
||||||
|
await _walkFolder(
|
||||||
|
adapter=adapter,
|
||||||
|
knowledgeService=knowledgeService,
|
||||||
|
runExtractionFn=runExtractionFn,
|
||||||
|
connectionId=connectionId,
|
||||||
|
mandateId=mandateId,
|
||||||
|
userId=userId,
|
||||||
|
folderPath="/", # DriveAdapter.browse maps "" / "/" -> "root"
|
||||||
|
depth=0,
|
||||||
|
limits=limits,
|
||||||
|
result=result,
|
||||||
|
progressCb=progressCb,
|
||||||
|
)
|
||||||
|
except Exception as exc:
|
||||||
|
logger.error("gdrive walk failed for %s: %s", connectionId, exc, exc_info=True)
|
||||||
|
result.errors.append(f"walk: {exc}")
|
||||||
|
|
||||||
|
return _finalizeResult(connectionId, result, startMs)
|
||||||
|
|
||||||
|
|
||||||
|
async def _resolveDependencies(connectionId: str):
    """Resolve the (adapter, connection, knowledgeService) triple for a Drive bootstrap.

    Imports are deliberately function-local to avoid import cycles at module
    load time.

    Raises:
        ValueError: if the connection id is unknown, or no usable OAuth access
            token can be obtained for it.
    """
    from modules.interfaces.interfaceDbApp import getRootInterface
    from modules.auth import TokenManager
    from modules.connectors.providerGoogle.connectorGoogle import GoogleConnector
    from modules.serviceCenter import getService
    from modules.serviceCenter.context import ServiceCenterContext
    from modules.security.rootAccess import getRootUser

    rootInterface = getRootInterface()
    connection = rootInterface.getUserConnectionById(connectionId)
    if connection is None:
        raise ValueError(f"UserConnection not found: {connectionId}")

    token = TokenManager().getFreshToken(connectionId)
    if not token or not token.tokenAccess:
        raise ValueError(f"No valid token for connection {connectionId}")

    provider = GoogleConnector(connection, token.tokenAccess)
    adapter = provider.getServiceAdapter("drive")

    # The bootstrap runs as a background job, so the service context is the
    # root user scoped to the connection's mandate.
    rootUser = getRootUser()
    ctx = ServiceCenterContext(
        user=rootUser,
        mandate_id=str(getattr(connection, "mandateId", "") or ""),
    )
    knowledgeService = getService("knowledge", ctx)
    return adapter, connection, knowledgeService
|
||||||
|
|
||||||
|
|
||||||
|
async def _walkFolder(
    *,
    adapter,
    knowledgeService,
    runExtractionFn,
    connectionId: str,
    mandateId: str,
    userId: str,
    folderPath: str,
    depth: int,
    limits: GdriveBootstrapLimits,
    result: GdriveBootstrapResult,
    progressCb: Optional[Callable[[int, Optional[str]], None]],
) -> None:
    """Depth-first walk of one Drive folder: ingest files, recurse into subfolders.

    Mutates *result* counters in place. A failed `browse` is recorded in
    `result.errors` and the folder is skipped; it does not abort the walk.
    """
    # Hard stop on runaway nesting.
    if depth > limits.maxDepth:
        return
    try:
        entries = await adapter.browse(folderPath)
    except Exception as exc:
        logger.warning("gdrive browse %s failed: %s", folderPath, exc)
        result.errors.append(f"browse({folderPath}): {exc}")
        return

    for entry in entries:
        # Global budgets: stop as soon as either the item count or the byte
        # budget is exhausted (duplicates count toward the item budget).
        if result.indexed + result.skippedDuplicate >= limits.maxItems:
            return
        if result.bytesProcessed >= limits.maxBytes:
            return

        entryPath = getattr(entry, "path", "") or ""
        metadata = getattr(entry, "metadata", {}) or {}
        mimeType = getattr(entry, "mimeType", None) or metadata.get("mimeType")

        # Folders are recursed into, never ingested themselves.
        if getattr(entry, "isFolder", False) or mimeType == FOLDER_MIME:
            await _walkFolder(
                adapter=adapter,
                knowledgeService=knowledgeService,
                runExtractionFn=runExtractionFn,
                connectionId=connectionId,
                mandateId=mandateId,
                userId=userId,
                folderPath=entryPath,
                depth=depth + 1,
                limits=limits,
                result=result,
                progressCb=progressCb,
            )
            continue

        # Policy filters: blocked mime prefixes, oversize files, stale files.
        effectiveMime = mimeType or "application/octet-stream"
        if any(effectiveMime.startswith(prefix) for prefix in limits.skipMimePrefixes):
            result.skippedPolicy += 1
            continue
        size = int(getattr(entry, "size", 0) or 0)
        if size and size > limits.maxFileSize:
            result.skippedPolicy += 1
            continue
        modifiedTime = metadata.get("modifiedTime")
        if not _isRecent(modifiedTime, limits.maxAgeDays):
            result.skippedPolicy += 1
            continue

        # Drive's file id is the stable external id; the path is only a
        # fallback. modifiedTime doubles as the content revision.
        externalItemId = metadata.get("id") or entryPath
        revision = modifiedTime

        await _ingestOne(
            adapter=adapter,
            knowledgeService=knowledgeService,
            runExtractionFn=runExtractionFn,
            connectionId=connectionId,
            mandateId=mandateId,
            userId=userId,
            entry=entry,
            entryPath=entryPath,
            mimeType=effectiveMime,
            externalItemId=externalItemId,
            revision=revision,
            limits=limits,
            result=result,
            progressCb=progressCb,
        )
|
||||||
|
|
||||||
|
|
||||||
|
async def _ingestOne(
    *,
    adapter,
    knowledgeService,
    runExtractionFn,
    connectionId: str,
    mandateId: str,
    userId: str,
    entry,
    entryPath: str,
    mimeType: str,
    externalItemId: str,
    revision: Optional[str],
    limits: GdriveBootstrapLimits,
    result: GdriveBootstrapResult,
    progressCb: Optional[Callable[[int, Optional[str]], None]],
) -> None:
    """Download one Drive file, run extraction, and submit an ingestion job.

    All failures are absorbed into *result* counters/errors; this function
    never raises, so one bad file cannot stop the folder walk.
    """
    from modules.serviceCenter.services.serviceKnowledge.mainServiceKnowledge import IngestionJob

    syntheticFileId = _syntheticFileId(connectionId, externalItemId)
    fileName = getattr(entry, "name", "") or externalItemId

    try:
        downloaded = await adapter.download(entryPath)
    except Exception as exc:
        logger.warning("gdrive download %s failed: %s", entryPath, exc)
        result.failed += 1
        result.errors.append(f"download({entryPath}): {exc}")
        return

    # Adapter.download returns raw bytes today; guard DownloadResult shape too.
    fileBytes: bytes
    if isinstance(downloaded, (bytes, bytearray)):
        fileBytes = bytes(downloaded)
    else:
        fileBytes = bytes(getattr(downloaded, "data", b"") or b"")
        if getattr(downloaded, "mimeType", None):
            mimeType = downloaded.mimeType  # export may have changed the type
    if not fileBytes:
        result.failed += 1
        return
    if len(fileBytes) > limits.maxFileSize:
        result.skippedPolicy += 1
        return

    # Byte budget is charged for everything that reaches extraction.
    result.bytesProcessed += len(fileBytes)

    try:
        extracted = runExtractionFn(
            fileBytes, fileName, mimeType,
            ExtractionOptions(mergeStrategy=None),
        )
    except Exception as exc:
        logger.warning("gdrive extraction %s failed: %s", entryPath, exc)
        result.failed += 1
        result.errors.append(f"extract({entryPath}): {exc}")
        return

    contentObjects = _toContentObjects(extracted, fileName)
    if not contentObjects:
        # Extraction yielded nothing usable — counted as policy skip, not failure.
        result.skippedPolicy += 1
        return

    try:
        handle = await knowledgeService.requestIngestion(
            IngestionJob(
                sourceKind="gdrive_item",
                sourceId=syntheticFileId,
                fileName=fileName,
                mimeType=mimeType,
                userId=userId,
                mandateId=mandateId,
                contentObjects=contentObjects,
                # revision (Drive modifiedTime) makes reruns idempotent.
                contentVersion=revision,
                provenance={
                    "connectionId": connectionId,
                    "authority": "google",
                    "service": "drive",
                    "externalItemId": externalItemId,
                    "entryPath": entryPath,
                    "tier": "body",
                },
            )
        )
    except Exception as exc:
        logger.error("gdrive ingestion %s failed: %s", entryPath, exc, exc_info=True)
        result.failed += 1
        result.errors.append(f"ingest({entryPath}): {exc}")
        return

    if handle.status == "duplicate":
        result.skippedDuplicate += 1
    elif handle.status == "indexed":
        result.indexed += 1
    else:
        result.failed += 1

    # Progress + structured log every 50 processed items; progress callback
    # errors are deliberately swallowed (best-effort UI signal only).
    if progressCb is not None and (result.indexed + result.skippedDuplicate) % 50 == 0:
        processed = result.indexed + result.skippedDuplicate
        try:
            progressCb(
                min(90, 10 + int(80 * processed / max(1, limits.maxItems))),
                f"gdrive processed={processed}",
            )
        except Exception:
            pass
        logger.info(
            "ingestion.connection.bootstrap.progress part=gdrive processed=%d skippedDup=%d failed=%d",
            processed, result.skippedDuplicate, result.failed,
            extra={
                "event": "ingestion.connection.bootstrap.progress",
                "part": "gdrive",
                "connectionId": connectionId,
                "processed": processed,
                "skippedDup": result.skippedDuplicate,
                "failed": result.failed,
            },
        )
|
||||||
|
|
||||||
|
|
||||||
|
def _finalizeResult(connectionId: str, result: GdriveBootstrapResult, startMs: float) -> Dict[str, Any]:
    """Emit the structured `bootstrap.done` log and build the job summary dict.

    Only the first 20 error strings are returned to keep the payload bounded.
    """
    durationMs = int((time.time() - startMs) * 1000)
    logger.info(
        "ingestion.connection.bootstrap.done part=gdrive connectionId=%s indexed=%d skippedDup=%d skippedPolicy=%d failed=%d bytes=%d durationMs=%d",
        connectionId,
        result.indexed, result.skippedDuplicate, result.skippedPolicy,
        result.failed, result.bytesProcessed, durationMs,
        extra={
            "event": "ingestion.connection.bootstrap.done",
            "part": "gdrive",
            "connectionId": connectionId,
            "indexed": result.indexed,
            "skippedDup": result.skippedDuplicate,
            "skippedPolicy": result.skippedPolicy,
            "failed": result.failed,
            "bytes": result.bytesProcessed,
            "durationMs": durationMs,
        },
    )
    return {
        "connectionId": result.connectionId,
        "indexed": result.indexed,
        "skippedDuplicate": result.skippedDuplicate,
        "skippedPolicy": result.skippedPolicy,
        "failed": result.failed,
        "bytesProcessed": result.bytesProcessed,
        "durationMs": durationMs,
        "errors": result.errors[:20],
    }
|
||||||
|
|
@ -0,0 +1,578 @@
|
||||||
|
# Copyright (c) 2025 Patrick Motsch
|
||||||
|
# All rights reserved.
|
||||||
|
"""Gmail bootstrap for the unified knowledge ingestion lane.
|
||||||
|
|
||||||
|
Mirrors the Outlook pilot (see subConnectorSyncOutlook.py) but talks to Google
|
||||||
|
Mail's REST API. Messages become `sourceKind="gmail_message"` virtual documents
|
||||||
|
with header / snippet / cleaned body content-objects; attachments are optional
|
||||||
|
child jobs with `sourceKind="gmail_attachment"`.
|
||||||
|
|
||||||
|
Idempotency: Gmail's stable `historyId` (or `internalDate` as fallback) is
|
||||||
|
passed as `contentVersion`, so rerunning the bootstrap yields
|
||||||
|
`ingestion.skipped.duplicate` for unchanged messages.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import asyncio
|
||||||
|
import base64
|
||||||
|
import hashlib
|
||||||
|
import logging
|
||||||
|
import time
|
||||||
|
from dataclasses import dataclass, field
|
||||||
|
from datetime import datetime, timedelta, timezone
|
||||||
|
from typing import Any, Callable, Dict, List, Optional
|
||||||
|
|
||||||
|
from modules.serviceCenter.services.serviceKnowledge.subTextClean import cleanEmailBody
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
MAX_MESSAGES_DEFAULT = 500
|
||||||
|
MAX_BODY_CHARS_DEFAULT = 8000
|
||||||
|
MAX_ATTACHMENT_BYTES_DEFAULT = 10 * 1024 * 1024
|
||||||
|
DEFAULT_LABELS = ("INBOX", "SENT")
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class GmailBootstrapLimits:
    """Tunable caps for one Gmail bootstrap run."""

    # Overall cap on messages processed (indexed + duplicates) across all labels.
    maxMessages: int = MAX_MESSAGES_DEFAULT
    # Gmail label ids to enumerate, in order.
    labels: tuple = DEFAULT_LABELS
    # Cleaned body is truncated to this many characters.
    maxBodyChars: int = MAX_BODY_CHARS_DEFAULT
    # When True, file attachments become child ingestion jobs.
    includeAttachments: bool = False
    # Attachments larger than this (per Gmail-reported size) are skipped.
    maxAttachmentBytes: int = MAX_ATTACHMENT_BYTES_DEFAULT
    # Only fetch messages newer than N days. None disables filter.
    maxAgeDays: Optional[int] = 90
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class GmailBootstrapResult:
    """Mutable counters accumulated while a Gmail bootstrap runs."""

    connectionId: str
    # Messages newly indexed by the knowledge service.
    indexed: int = 0
    # Messages the knowledge service reported as unchanged duplicates.
    skippedDuplicate: int = 0
    # Messages/attachments dropped by local policy (size, age, empty content).
    skippedPolicy: int = 0
    # Hard failures (API errors, extraction errors, ingestion errors).
    failed: int = 0
    # Attachment child jobs successfully submitted.
    attachmentsIndexed: int = 0
    # Human-readable error strings; truncated to 20 in the final summary.
    errors: List[str] = field(default_factory=list)
|
||||||
|
|
||||||
|
|
||||||
|
def _syntheticMessageId(connectionId: str, messageId: str) -> str:
|
||||||
|
token = hashlib.sha256(f"{connectionId}:{messageId}".encode("utf-8")).hexdigest()[:16]
|
||||||
|
return f"gm:{connectionId[:8]}:{token}"
|
||||||
|
|
||||||
|
|
||||||
|
def _syntheticAttachmentId(connectionId: str, messageId: str, attachmentId: str) -> str:
|
||||||
|
token = hashlib.sha256(
|
||||||
|
f"{connectionId}:{messageId}:{attachmentId}".encode("utf-8")
|
||||||
|
).hexdigest()[:16]
|
||||||
|
return f"ga:{connectionId[:8]}:{token}"
|
||||||
|
|
||||||
|
|
||||||
|
def _decodeBase64Url(data: str) -> bytes:
|
||||||
|
if not data:
|
||||||
|
return b""
|
||||||
|
# Gmail uses URL-safe base64 without padding.
|
||||||
|
padding = 4 - (len(data) % 4)
|
||||||
|
if padding != 4:
|
||||||
|
data = data + ("=" * padding)
|
||||||
|
try:
|
||||||
|
return base64.urlsafe_b64decode(data)
|
||||||
|
except Exception:
|
||||||
|
return b""
|
||||||
|
|
||||||
|
|
||||||
|
def _walkPayloadForBody(payload: Dict[str, Any]) -> Dict[str, str]:
    """Return {"text": ..., "html": ...} by walking MIME parts.

    Gmail `payload` is a tree of parts. We prefer `text/plain` for the cleaned
    body, but capture `text/html` as a fallback so `cleanEmailBody` can strip
    markup if plain is missing. The first occurrence of each type wins.
    """
    collected: Dict[str, str] = {"text": "", "html": ""}

    def _visit(node: Dict[str, Any]) -> None:
        mime = (node.get("mimeType") or "").lower()
        encoded = (node.get("body") or {}).get("data") or ""
        if encoded and mime.startswith("text/"):
            if mime == "text/plain":
                slot = "text"
            elif mime == "text/html":
                slot = "html"
            else:
                slot = ""
            if slot and not collected[slot]:
                collected[slot] = _decodeBase64Url(encoded).decode("utf-8", errors="replace")
        for child in node.get("parts") or []:
            _visit(child)

    _visit(payload or {})
    return collected
|
||||||
|
|
||||||
|
|
||||||
|
def _headerMap(payload: Dict[str, Any]) -> Dict[str, str]:
|
||||||
|
return {
|
||||||
|
(h.get("name") or "").lower(): (h.get("value") or "")
|
||||||
|
for h in (payload.get("headers") or [])
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def _buildContentObjects(message: Dict[str, Any], maxBodyChars: int) -> List[Dict[str, Any]]:
    """Assemble the header / snippet / cleaned-body content-objects for one message.

    The header object is always present; snippet and body are only emitted
    when non-empty.
    """
    payload = message.get("payload") or {}
    headers = _headerMap(payload)

    bodies = _walkPayloadForBody(payload)
    rawBody = bodies["text"] or bodies["html"]
    cleanedBody = cleanEmailBody(rawBody, maxChars=maxBodyChars) if rawBody else ""

    subject = headers.get("subject") or "(no subject)"
    fromAddr = headers.get("from") or ""
    toAddr = headers.get("to") or ""
    ccAddr = headers.get("cc") or ""
    date = headers.get("date") or ""
    snippet = message.get("snippet") or ""

    # RFC-822-style header block; Cc line only when present.
    headerText = f"Subject: {subject}\nFrom: {fromAddr}\nTo: {toAddr}\n"
    if ccAddr:
        headerText += f"Cc: {ccAddr}\n"
    headerText += f"Date: {date}"

    def _obj(objId: str, data: str) -> Dict[str, Any]:
        return {
            "contentObjectId": objId,
            "contentType": "text",
            "data": data,
            "contextRef": {"part": objId},
        }

    objects: List[Dict[str, Any]] = [_obj("header", headerText)]
    if snippet:
        objects.append(_obj("snippet", snippet))
    if cleanedBody:
        objects.append(_obj("body", cleanedBody))
    return objects
|
||||||
|
|
||||||
|
|
||||||
|
async def bootstrapGmail(
    connectionId: str,
    *,
    progressCb: Optional[Callable[[int, Optional[str]], None]] = None,
    adapter: Any = None,
    connection: Any = None,
    knowledgeService: Any = None,
    limits: Optional[GmailBootstrapLimits] = None,
    googleGetFn: Optional[Callable[..., Any]] = None,
) -> Dict[str, Any]:
    """Enumerate Gmail labels (INBOX + SENT default) and ingest messages.

    All collaborators (adapter, knowledgeService, googleGetFn, ...) are
    injectable for tests; anything missing is resolved from the live
    connection. Returns the summary dict built by `_finalizeResult`.
    """
    limits = limits or GmailBootstrapLimits()
    startMs = time.time()
    result = GmailBootstrapResult(connectionId=connectionId)

    logger.info(
        "ingestion.connection.bootstrap.started part=gmail connectionId=%s",
        connectionId,
        extra={
            "event": "ingestion.connection.bootstrap.started",
            "part": "gmail",
            "connectionId": connectionId,
        },
    )

    # Resolve live dependencies only when the caller did not inject them.
    if adapter is None or knowledgeService is None or connection is None:
        adapter, connection, knowledgeService = await _resolveDependencies(connectionId)

    if googleGetFn is None:
        from modules.connectors.providerGoogle.connectorGoogle import _googleGet as _defaultGet

        # NOTE(review): reaches into the adapter's private `_token` attribute;
        # confirm the gmail adapter guarantees this field exists.
        token = getattr(adapter, "_token", "")

        async def googleGetFn(url: str) -> Dict[str, Any]:  # type: ignore[no-redef]
            return await _defaultGet(token, url)

    mandateId = str(getattr(connection, "mandateId", "") or "") if connection is not None else ""
    userId = str(getattr(connection, "userId", "") or "") if connection is not None else ""

    # Labels share one global message budget; a failing label is logged and
    # skipped so the remaining labels still run.
    for labelId in limits.labels:
        if result.indexed + result.skippedDuplicate >= limits.maxMessages:
            break
        try:
            await _ingestLabel(
                googleGetFn=googleGetFn,
                knowledgeService=knowledgeService,
                connectionId=connectionId,
                mandateId=mandateId,
                userId=userId,
                labelId=labelId,
                limits=limits,
                result=result,
                progressCb=progressCb,
            )
        except Exception as exc:
            logger.error("gmail ingestion label %s failed: %s", labelId, exc, exc_info=True)
            result.errors.append(f"label({labelId}): {exc}")

    return _finalizeResult(connectionId, result, startMs)
|
||||||
|
|
||||||
|
|
||||||
|
async def _resolveDependencies(connectionId: str):
    """Resolve the (adapter, connection, knowledgeService) triple for a Gmail bootstrap.

    Mirrors the Drive resolver but requests the "gmail" service adapter.
    Imports are function-local to avoid import cycles at module load time.

    Raises:
        ValueError: if the connection id is unknown, or no usable OAuth access
            token can be obtained for it.
    """
    from modules.interfaces.interfaceDbApp import getRootInterface
    from modules.auth import TokenManager
    from modules.connectors.providerGoogle.connectorGoogle import GoogleConnector
    from modules.serviceCenter import getService
    from modules.serviceCenter.context import ServiceCenterContext
    from modules.security.rootAccess import getRootUser

    rootInterface = getRootInterface()
    connection = rootInterface.getUserConnectionById(connectionId)
    if connection is None:
        raise ValueError(f"UserConnection not found: {connectionId}")

    token = TokenManager().getFreshToken(connectionId)
    if not token or not token.tokenAccess:
        raise ValueError(f"No valid token for connection {connectionId}")

    provider = GoogleConnector(connection, token.tokenAccess)
    adapter = provider.getServiceAdapter("gmail")

    # Background job: runs as root user scoped to the connection's mandate.
    rootUser = getRootUser()
    ctx = ServiceCenterContext(
        user=rootUser,
        mandate_id=str(getattr(connection, "mandateId", "") or ""),
    )
    knowledgeService = getService("knowledge", ctx)
    return adapter, connection, knowledgeService
|
||||||
|
|
||||||
|
|
||||||
|
async def _ingestLabel(
    *,
    googleGetFn,
    knowledgeService,
    connectionId: str,
    mandateId: str,
    userId: str,
    labelId: str,
    limits: GmailBootstrapLimits,
    result: GmailBootstrapResult,
    progressCb: Optional[Callable[[int, Optional[str]], None]],
) -> None:
    """Page through one Gmail label and ingest each message until budget is spent.

    List errors abort only this label (recorded in result.errors); per-message
    detail errors just increment `failed` and continue.
    """
    remaining = limits.maxMessages - (result.indexed + result.skippedDuplicate)
    if remaining <= 0:
        return

    # Gmail caps maxResults at 100 per page.
    pageSize = min(100, remaining)
    query = ""
    if limits.maxAgeDays:
        cutoff = datetime.now(timezone.utc) - timedelta(days=limits.maxAgeDays)
        # Gmail uses YYYY/MM/DD.
        query = f"after:{cutoff.strftime('%Y/%m/%d')}"

    baseUrl = (
        "https://gmail.googleapis.com/gmail/v1/users/me/messages"
        f"?labelIds={labelId}&maxResults={pageSize}"
    )
    if query:
        # NOTE(review): `q` is appended without URL-encoding; safe for the
        # `after:YYYY/MM/DD` form used here, but verify before adding queries
        # containing spaces or special characters.
        baseUrl = f"{baseUrl}&q={query}"

    nextPageToken: Optional[str] = None
    while (result.indexed + result.skippedDuplicate) < limits.maxMessages:
        url = baseUrl if not nextPageToken else f"{baseUrl}&pageToken={nextPageToken}"
        page = await googleGetFn(url)
        if not isinstance(page, dict) or "error" in page:
            err = (page or {}).get("error") if isinstance(page, dict) else "unknown"
            logger.warning("gmail list page error for label %s: %s", labelId, err)
            result.errors.append(f"list({labelId}): {err}")
            return

        messageStubs = page.get("messages") or []
        for stub in messageStubs:
            if result.indexed + result.skippedDuplicate >= limits.maxMessages:
                break
            msgId = stub.get("id")
            if not msgId:
                continue
            # The list endpoint returns only ids; fetch the full message.
            detailUrl = (
                f"https://gmail.googleapis.com/gmail/v1/users/me/messages/{msgId}?format=full"
            )
            detail = await googleGetFn(detailUrl)
            if not isinstance(detail, dict) or "error" in detail:
                result.failed += 1
                continue
            await _ingestMessage(
                googleGetFn=googleGetFn,
                knowledgeService=knowledgeService,
                connectionId=connectionId,
                mandateId=mandateId,
                userId=userId,
                labelId=labelId,
                message=detail,
                limits=limits,
                result=result,
                progressCb=progressCb,
            )

        nextPageToken = page.get("nextPageToken")
        if not nextPageToken:
            break
|
||||||
|
|
||||||
|
|
||||||
|
async def _ingestMessage(
    *,
    googleGetFn,
    knowledgeService,
    connectionId: str,
    mandateId: str,
    userId: str,
    labelId: str,
    message: Dict[str, Any],
    limits: GmailBootstrapLimits,
    result: GmailBootstrapResult,
    progressCb: Optional[Callable[[int, Optional[str]], None]],
) -> None:
    """Turn one full Gmail message into an ingestion job (plus optional attachments).

    Uses `historyId` (fallback `internalDate`) as contentVersion so reruns are
    deduplicated. Never raises; all failures land in *result*.
    """
    from modules.serviceCenter.services.serviceKnowledge.mainServiceKnowledge import IngestionJob

    messageId = message.get("id")
    if not messageId:
        result.skippedPolicy += 1
        return
    revision = message.get("historyId") or message.get("internalDate")
    headers = _headerMap(message.get("payload") or {})
    subject = headers.get("subject") or "(no subject)"
    syntheticId = _syntheticMessageId(connectionId, messageId)
    # NOTE(review): `subject` is always truthy here (defaults to "(no subject)"
    # above), so the else branch never fires — confirm intent.
    fileName = f"{subject[:80].strip()}.eml" if subject else f"{messageId}.eml"

    contentObjects = _buildContentObjects(message, limits.maxBodyChars)
    try:
        handle = await knowledgeService.requestIngestion(
            IngestionJob(
                sourceKind="gmail_message",
                sourceId=syntheticId,
                fileName=fileName,
                mimeType="message/rfc822",
                userId=userId,
                mandateId=mandateId,
                contentObjects=contentObjects,
                contentVersion=str(revision) if revision else None,
                provenance={
                    "connectionId": connectionId,
                    "authority": "google",
                    "service": "gmail",
                    "externalItemId": messageId,
                    "label": labelId,
                    "threadId": message.get("threadId"),
                    "tier": "body",
                },
            )
        )
    except Exception as exc:
        logger.error("gmail ingestion %s failed: %s", messageId, exc, exc_info=True)
        result.failed += 1
        result.errors.append(f"ingest({messageId}): {exc}")
        return

    if handle.status == "duplicate":
        result.skippedDuplicate += 1
    elif handle.status == "indexed":
        result.indexed += 1
    else:
        result.failed += 1

    # Attachments are best-effort child jobs; their failure never fails the
    # parent message.
    if limits.includeAttachments:
        try:
            await _ingestAttachments(
                googleGetFn=googleGetFn,
                knowledgeService=knowledgeService,
                connectionId=connectionId,
                mandateId=mandateId,
                userId=userId,
                message=message,
                parentSyntheticId=syntheticId,
                limits=limits,
                result=result,
            )
        except Exception as exc:
            logger.warning("gmail attachments %s failed: %s", messageId, exc)
            result.errors.append(f"attachments({messageId}): {exc}")

    # Progress + structured log every 50 processed messages; the progress
    # callback is best-effort only.
    if progressCb is not None and (result.indexed + result.skippedDuplicate) % 50 == 0:
        processed = result.indexed + result.skippedDuplicate
        try:
            progressCb(
                min(90, 10 + int(80 * processed / max(1, limits.maxMessages))),
                f"gmail processed={processed}",
            )
        except Exception:
            pass
        logger.info(
            "ingestion.connection.bootstrap.progress part=gmail processed=%d skippedDup=%d failed=%d",
            processed, result.skippedDuplicate, result.failed,
            extra={
                "event": "ingestion.connection.bootstrap.progress",
                "part": "gmail",
                "connectionId": connectionId,
                "processed": processed,
                "skippedDup": result.skippedDuplicate,
                "failed": result.failed,
            },
        )

    # Yield to the event loop between messages so long runs stay cooperative.
    await asyncio.sleep(0)
|
||||||
|
|
||||||
|
|
||||||
|
async def _ingestAttachments(
    *,
    googleGetFn,
    knowledgeService,
    connectionId: str,
    mandateId: str,
    userId: str,
    message: Dict[str, Any],
    parentSyntheticId: str,
    limits: GmailBootstrapLimits,
    result: GmailBootstrapResult,
) -> None:
    """Child ingestion jobs for file attachments. Skips inline images (cid: refs)."""
    from modules.serviceCenter.services.serviceKnowledge.mainServiceKnowledge import IngestionJob
    from modules.datamodels.datamodelExtraction import ExtractionOptions
    from modules.serviceCenter.services.serviceExtraction.subPipeline import runExtraction
    from modules.serviceCenter.services.serviceExtraction.subRegistry import (
        ExtractorRegistry, ChunkerRegistry,
    )

    messageId = message.get("id") or ""

    def _collectAttachmentStubs(part: Dict[str, Any], acc: List[Dict[str, Any]]) -> None:
        # Only parts that carry both a filename and an attachmentId are real
        # file attachments (inline images typically have no filename).
        filename = part.get("filename") or ""
        body = part.get("body") or {}
        attId = body.get("attachmentId")
        if filename and attId:
            acc.append({
                "filename": filename,
                "mimeType": part.get("mimeType") or "application/octet-stream",
                "attachmentId": attId,
                "size": int(body.get("size") or 0),
            })
        for sub in part.get("parts") or []:
            _collectAttachmentStubs(sub, acc)

    stubs: List[Dict[str, Any]] = []
    _collectAttachmentStubs(message.get("payload") or {}, stubs)
    if not stubs:
        return

    extractorRegistry = ExtractorRegistry()
    chunkerRegistry = ChunkerRegistry()

    for stub in stubs:
        # Size policy applies before any network fetch.
        if stub["size"] and stub["size"] > limits.maxAttachmentBytes:
            result.skippedPolicy += 1
            continue
        attUrl = (
            f"https://gmail.googleapis.com/gmail/v1/users/me/messages/{messageId}"
            f"/attachments/{stub['attachmentId']}"
        )
        detail = await googleGetFn(attUrl)
        if not isinstance(detail, dict) or "error" in detail:
            result.failed += 1
            continue
        rawBytes = _decodeBase64Url(detail.get("data") or "")
        if not rawBytes:
            continue
        fileName = stub["filename"]
        mimeType = stub["mimeType"]
        syntheticId = _syntheticAttachmentId(connectionId, messageId, stub["attachmentId"])

        try:
            extracted = runExtraction(
                extractorRegistry, chunkerRegistry,
                rawBytes, fileName, mimeType,
                ExtractionOptions(mergeStrategy=None),
            )
        except Exception as exc:
            logger.warning("gmail attachment extract %s failed: %s", stub["attachmentId"], exc)
            result.failed += 1
            continue

        # Map extraction parts onto content-objects; empty/whitespace parts
        # are dropped.
        contentObjects: List[Dict[str, Any]] = []
        for part in getattr(extracted, "parts", None) or []:
            data = getattr(part, "data", None) or ""
            if not data or not str(data).strip():
                continue
            typeGroup = getattr(part, "typeGroup", "text") or "text"
            contentType = "text"
            if typeGroup == "image":
                contentType = "image"
            elif typeGroup in ("binary", "container"):
                contentType = "other"
            contentObjects.append({
                "contentObjectId": getattr(part, "id", ""),
                "contentType": contentType,
                "data": data,
                "contextRef": {
                    "containerPath": fileName,
                    "location": getattr(part, "label", None) or "attachment",
                    **(getattr(part, "metadata", None) or {}),
                },
            })
        if not contentObjects:
            result.skippedPolicy += 1
            continue

        try:
            await knowledgeService.requestIngestion(
                IngestionJob(
                    sourceKind="gmail_attachment",
                    sourceId=syntheticId,
                    fileName=fileName,
                    mimeType=mimeType,
                    userId=userId,
                    mandateId=mandateId,
                    contentObjects=contentObjects,
                    provenance={
                        "connectionId": connectionId,
                        "authority": "google",
                        "service": "gmail",
                        "parentId": parentSyntheticId,
                        "externalItemId": stub["attachmentId"],
                        "parentMessageId": messageId,
                    },
                )
            )
            result.attachmentsIndexed += 1
        except Exception as exc:
            logger.warning("gmail attachment ingest %s failed: %s", stub["attachmentId"], exc)
            result.failed += 1
|
||||||
|
|
||||||
|
|
||||||
|
def _finalizeResult(connectionId: str, result: GmailBootstrapResult, startMs: float) -> Dict[str, Any]:
    """Emit the structured `bootstrap.done` log and build the job summary dict.

    Only the first 20 error strings are returned to keep the payload bounded.
    """
    durationMs = int((time.time() - startMs) * 1000)
    logger.info(
        "ingestion.connection.bootstrap.done part=gmail connectionId=%s indexed=%d skippedDup=%d skippedPolicy=%d attachments=%d failed=%d durationMs=%d",
        connectionId,
        result.indexed, result.skippedDuplicate, result.skippedPolicy,
        result.attachmentsIndexed, result.failed, durationMs,
        extra={
            "event": "ingestion.connection.bootstrap.done",
            "part": "gmail",
            "connectionId": connectionId,
            "indexed": result.indexed,
            "skippedDup": result.skippedDuplicate,
            "skippedPolicy": result.skippedPolicy,
            "attachmentsIndexed": result.attachmentsIndexed,
            "failed": result.failed,
            "durationMs": durationMs,
        },
    )
    return {
        "connectionId": result.connectionId,
        "indexed": result.indexed,
        "skippedDuplicate": result.skippedDuplicate,
        "skippedPolicy": result.skippedPolicy,
        "attachmentsIndexed": result.attachmentsIndexed,
        "failed": result.failed,
        "durationMs": durationMs,
        "errors": result.errors[:20],
    }
|
||||||
203
tests/unit/services/test_bootstrap_clickup.py
Normal file
203
tests/unit/services/test_bootstrap_clickup.py
Normal file
|
|
@ -0,0 +1,203 @@
|
||||||
|
#!/usr/bin/env python3
|
||||||
|
# Copyright (c) 2025 Patrick Motsch
|
||||||
|
# All rights reserved.
|
||||||
|
"""Bootstrap ClickUp tests with a fake service + knowledge service.
|
||||||
|
|
||||||
|
Verifies:
|
||||||
|
- Teams → spaces → lists (folderless + folder-based) → tasks traversal.
|
||||||
|
- Each task produces a `requestIngestion` call with `sourceKind="clickup_task"`
|
||||||
|
and header + description content-objects.
|
||||||
|
- `date_updated` is forwarded as contentVersion → idempotency.
|
||||||
|
- Recency filter drops tasks older than `maxAgeDays`.
|
||||||
|
- maxWorkspaces / maxListsPerWorkspace / maxTasks caps are respected.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import asyncio
|
||||||
|
import os
|
||||||
|
import sys
|
||||||
|
import time
|
||||||
|
from types import SimpleNamespace
|
||||||
|
|
||||||
|
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "../../.."))
|
||||||
|
|
||||||
|
from modules.serviceCenter.services.serviceKnowledge.subConnectorSyncClickup import (
|
||||||
|
bootstrapClickup,
|
||||||
|
ClickupBootstrapLimits,
|
||||||
|
_syntheticTaskId,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def _nowMs(offsetDays: int = 0) -> str:
|
||||||
|
return str(int((time.time() + offsetDays * 86400) * 1000))
|
||||||
|
|
||||||
|
|
||||||
|
class _FakeClickupService:
|
||||||
|
"""Records API calls; serves a canned 1-team / 1-space / 1-list / 2-task layout."""
|
||||||
|
|
||||||
|
def __init__(self, taskCount=2, oldTask=False):
|
||||||
|
self._taskCount = taskCount
|
||||||
|
self._oldTask = oldTask # when True, the second task is 400 days old
|
||||||
|
self.calls = []
|
||||||
|
|
||||||
|
async def getAuthorizedTeams(self):
|
||||||
|
self.calls.append(("getAuthorizedTeams",))
|
||||||
|
return {"teams": [{"id": "team-1", "name": "Acme"}]}
|
||||||
|
|
||||||
|
async def getSpaces(self, team_id: str):
|
||||||
|
self.calls.append(("getSpaces", team_id))
|
||||||
|
return {"spaces": [{"id": "space-1", "name": "Engineering"}]}
|
||||||
|
|
||||||
|
async def getFolderlessLists(self, space_id: str):
|
||||||
|
self.calls.append(("getFolderlessLists", space_id))
|
||||||
|
return {"lists": [{"id": "list-1", "name": "Sprint 1"}]}
|
||||||
|
|
||||||
|
async def getFolders(self, space_id: str):
|
||||||
|
self.calls.append(("getFolders", space_id))
|
||||||
|
return {"folders": [{"id": "folder-1", "name": "Subproject"}]}
|
||||||
|
|
||||||
|
async def getListsInFolder(self, folder_id: str):
|
||||||
|
self.calls.append(("getListsInFolder", folder_id))
|
||||||
|
return {"lists": [{"id": "list-2", "name": "Sub-tasks"}]}
|
||||||
|
|
||||||
|
async def getTasksInList(self, list_id: str, *, page=0, include_closed=False, subtasks=True):
|
||||||
|
self.calls.append(("getTasksInList", list_id, page, include_closed))
|
||||||
|
if page > 0:
|
||||||
|
return {"tasks": []}
|
||||||
|
tasks = []
|
||||||
|
for i in range(self._taskCount):
|
||||||
|
tid = f"{list_id}-task-{i}"
|
||||||
|
offsetDays = -400 if (self._oldTask and i == 1) else 0
|
||||||
|
tasks.append({
|
||||||
|
"id": tid,
|
||||||
|
"name": f"Task {i} of {list_id}",
|
||||||
|
"description": f"Plain description for task {i}",
|
||||||
|
"text_content": f"Rich content for task {i}",
|
||||||
|
"status": {"status": "open" if i == 0 else "closed"},
|
||||||
|
"assignees": [{"username": "alice"}],
|
||||||
|
"tags": [{"name": "urgent"}],
|
||||||
|
"date_updated": _nowMs(offsetDays),
|
||||||
|
"date_created": _nowMs(-1),
|
||||||
|
"url": f"https://app.clickup.com/t/{tid}",
|
||||||
|
})
|
||||||
|
return {"tasks": tasks}
|
||||||
|
|
||||||
|
|
||||||
|
class _FakeKnowledgeService:
|
||||||
|
def __init__(self, duplicateIds=None):
|
||||||
|
self.calls = []
|
||||||
|
self._duplicates = duplicateIds or set()
|
||||||
|
|
||||||
|
async def requestIngestion(self, job):
|
||||||
|
self.calls.append(job)
|
||||||
|
status = "duplicate" if job.sourceId in self._duplicates else "indexed"
|
||||||
|
return SimpleNamespace(
|
||||||
|
jobId=job.sourceId, status=status, contentHash="h",
|
||||||
|
fileId=job.sourceId, index=None, error=None,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def _adapter(svc):
|
||||||
|
return SimpleNamespace(_svc=svc)
|
||||||
|
|
||||||
|
|
||||||
|
def test_bootstrap_walks_team_space_lists_and_tasks():
|
||||||
|
svc = _FakeClickupService(taskCount=2)
|
||||||
|
knowledge = _FakeKnowledgeService()
|
||||||
|
connection = SimpleNamespace(mandateId="m1", userId="u1")
|
||||||
|
|
||||||
|
async def _run():
|
||||||
|
return await bootstrapClickup(
|
||||||
|
connectionId="c1",
|
||||||
|
adapter=_adapter(svc),
|
||||||
|
connection=connection,
|
||||||
|
knowledgeService=knowledge,
|
||||||
|
limits=ClickupBootstrapLimits(maxAgeDays=None),
|
||||||
|
)
|
||||||
|
|
||||||
|
result = asyncio.run(_run())
|
||||||
|
# 2 lists (folderless list-1 + folder's list-2) × 2 tasks each = 4 tasks
|
||||||
|
assert result["indexed"] == 4
|
||||||
|
assert result["workspaces"] == 1
|
||||||
|
assert result["lists"] == 2
|
||||||
|
sourceIds = {c.sourceId for c in knowledge.calls}
|
||||||
|
assert len(sourceIds) == 4
|
||||||
|
for job in knowledge.calls:
|
||||||
|
assert job.sourceKind == "clickup_task"
|
||||||
|
assert job.mimeType == "application/vnd.clickup.task+json"
|
||||||
|
assert job.mandateId == "m1"
|
||||||
|
assert job.provenance["connectionId"] == "c1"
|
||||||
|
assert job.provenance["authority"] == "clickup"
|
||||||
|
assert job.provenance["teamId"] == "team-1"
|
||||||
|
assert job.contentVersion # numeric millisecond string
|
||||||
|
# At least the header content-object is present.
|
||||||
|
ids = [co["contentObjectId"] for co in job.contentObjects]
|
||||||
|
assert "header" in ids
|
||||||
|
|
||||||
|
|
||||||
|
def test_bootstrap_reports_duplicates_on_second_run():
|
||||||
|
svc = _FakeClickupService(taskCount=1)
|
||||||
|
duplicates = {
|
||||||
|
_syntheticTaskId("c1", "list-1-task-0"),
|
||||||
|
_syntheticTaskId("c1", "list-2-task-0"),
|
||||||
|
}
|
||||||
|
knowledge = _FakeKnowledgeService(duplicateIds=duplicates)
|
||||||
|
connection = SimpleNamespace(mandateId="m1", userId="u1")
|
||||||
|
|
||||||
|
async def _run():
|
||||||
|
return await bootstrapClickup(
|
||||||
|
connectionId="c1",
|
||||||
|
adapter=_adapter(svc),
|
||||||
|
connection=connection,
|
||||||
|
knowledgeService=knowledge,
|
||||||
|
limits=ClickupBootstrapLimits(maxAgeDays=None),
|
||||||
|
)
|
||||||
|
|
||||||
|
result = asyncio.run(_run())
|
||||||
|
assert result["indexed"] == 0
|
||||||
|
assert result["skippedDuplicate"] == 2
|
||||||
|
|
||||||
|
|
||||||
|
def test_bootstrap_skips_tasks_older_than_maxAgeDays():
|
||||||
|
svc = _FakeClickupService(taskCount=2, oldTask=True)
|
||||||
|
knowledge = _FakeKnowledgeService()
|
||||||
|
connection = SimpleNamespace(mandateId="m1", userId="u1")
|
||||||
|
|
||||||
|
async def _run():
|
||||||
|
return await bootstrapClickup(
|
||||||
|
connectionId="c1",
|
||||||
|
adapter=_adapter(svc),
|
||||||
|
connection=connection,
|
||||||
|
knowledgeService=knowledge,
|
||||||
|
limits=ClickupBootstrapLimits(maxAgeDays=180),
|
||||||
|
)
|
||||||
|
|
||||||
|
result = asyncio.run(_run())
|
||||||
|
# 2 lists × (1 recent + 1 skipped old) = 2 indexed + 2 skippedPolicy
|
||||||
|
assert result["indexed"] == 2
|
||||||
|
assert result["skippedPolicy"] == 2
|
||||||
|
|
||||||
|
|
||||||
|
def test_bootstrap_maxTasks_caps_ingestion():
|
||||||
|
svc = _FakeClickupService(taskCount=2)
|
||||||
|
knowledge = _FakeKnowledgeService()
|
||||||
|
connection = SimpleNamespace(mandateId="m1", userId="u1")
|
||||||
|
|
||||||
|
async def _run():
|
||||||
|
return await bootstrapClickup(
|
||||||
|
connectionId="c1",
|
||||||
|
adapter=_adapter(svc),
|
||||||
|
connection=connection,
|
||||||
|
knowledgeService=knowledge,
|
||||||
|
limits=ClickupBootstrapLimits(maxAgeDays=None, maxTasks=3),
|
||||||
|
)
|
||||||
|
|
||||||
|
result = asyncio.run(_run())
|
||||||
|
assert result["indexed"] == 3
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
test_bootstrap_walks_team_space_lists_and_tasks()
|
||||||
|
test_bootstrap_reports_duplicates_on_second_run()
|
||||||
|
test_bootstrap_skips_tasks_older_than_maxAgeDays()
|
||||||
|
test_bootstrap_maxTasks_caps_ingestion()
|
||||||
|
print("OK — bootstrapClickup tests passed")
|
||||||
225
tests/unit/services/test_bootstrap_gdrive.py
Normal file
225
tests/unit/services/test_bootstrap_gdrive.py
Normal file
|
|
@ -0,0 +1,225 @@
|
||||||
|
#!/usr/bin/env python3
|
||||||
|
# Copyright (c) 2025 Patrick Motsch
|
||||||
|
# All rights reserved.
|
||||||
|
"""Bootstrap Google Drive tests with a fake adapter + knowledge service.
|
||||||
|
|
||||||
|
Verifies:
|
||||||
|
- Drive walk traverses root → subfolders, respecting `maxDepth`.
|
||||||
|
- Every file triggers `requestIngestion` with `sourceKind="gdrive_item"`.
|
||||||
|
- Duplicate runs (same modifiedTime revision) report `skippedDuplicate`.
|
||||||
|
- Provenance carries `authority="google"` and the Drive file id.
|
||||||
|
- Recency filter skips files older than `maxAgeDays`.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import asyncio
|
||||||
|
import os
|
||||||
|
import sys
|
||||||
|
from dataclasses import dataclass
|
||||||
|
from datetime import datetime, timedelta, timezone
|
||||||
|
from types import SimpleNamespace
|
||||||
|
from typing import Any, Dict, List, Optional
|
||||||
|
|
||||||
|
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "../../.."))
|
||||||
|
|
||||||
|
from modules.serviceCenter.services.serviceKnowledge.subConnectorSyncGdrive import (
|
||||||
|
bootstrapGdrive,
|
||||||
|
GdriveBootstrapLimits,
|
||||||
|
_syntheticFileId,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
|
||||||
|
class _ExtEntry:
|
||||||
|
name: str
|
||||||
|
path: str
|
||||||
|
isFolder: bool = False
|
||||||
|
size: Optional[int] = None
|
||||||
|
mimeType: Optional[str] = None
|
||||||
|
metadata: Dict[str, Any] = None
|
||||||
|
|
||||||
|
|
||||||
|
def _today_iso(offsetDays: int = 0) -> str:
|
||||||
|
return (datetime.now(timezone.utc) + timedelta(days=offsetDays)).strftime("%Y-%m-%dT%H:%M:%SZ")
|
||||||
|
|
||||||
|
|
||||||
|
class _FakeDriveAdapter:
|
||||||
|
"""Minimal DriveAdapter stand-in.
|
||||||
|
|
||||||
|
Layout:
|
||||||
|
"/" (root) → 2 files + 1 folder (sub)
|
||||||
|
"/sub_id" → 1 file
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(self, recent_only: bool = True):
|
||||||
|
self.downloaded: List[str] = []
|
||||||
|
self._recent = _today_iso(0)
|
||||||
|
self._old = _today_iso(-400)
|
||||||
|
self._recent_only = recent_only
|
||||||
|
|
||||||
|
async def browse(self, path: str, filter=None, limit=None):
|
||||||
|
if path in ("/", "", "root"):
|
||||||
|
return [
|
||||||
|
_ExtEntry(
|
||||||
|
name="f1.txt", path="/f1", size=20,
|
||||||
|
mimeType="text/plain",
|
||||||
|
metadata={"id": "f1", "modifiedTime": self._recent},
|
||||||
|
),
|
||||||
|
_ExtEntry(
|
||||||
|
name="f2.txt", path="/f2", size=20,
|
||||||
|
mimeType="text/plain",
|
||||||
|
metadata={"id": "f2", "modifiedTime": self._recent if self._recent_only else self._old},
|
||||||
|
),
|
||||||
|
_ExtEntry(
|
||||||
|
name="Subfolder", path="/sub_id", isFolder=True,
|
||||||
|
mimeType="application/vnd.google-apps.folder",
|
||||||
|
metadata={"id": "sub_id", "modifiedTime": self._recent},
|
||||||
|
),
|
||||||
|
]
|
||||||
|
if path == "/sub_id":
|
||||||
|
return [
|
||||||
|
_ExtEntry(
|
||||||
|
name="f3.txt", path="/f3", size=20,
|
||||||
|
mimeType="text/plain",
|
||||||
|
metadata={"id": "f3", "modifiedTime": self._recent},
|
||||||
|
),
|
||||||
|
]
|
||||||
|
return []
|
||||||
|
|
||||||
|
async def download(self, path: str) -> bytes:
|
||||||
|
self.downloaded.append(path)
|
||||||
|
return path.encode("utf-8")
|
||||||
|
|
||||||
|
|
||||||
|
class _FakeKnowledgeService:
|
||||||
|
def __init__(self, duplicateIds=None):
|
||||||
|
self.calls: List[SimpleNamespace] = []
|
||||||
|
self._duplicateIds = duplicateIds or set()
|
||||||
|
|
||||||
|
async def requestIngestion(self, job):
|
||||||
|
self.calls.append(job)
|
||||||
|
status = "duplicate" if job.sourceId in self._duplicateIds else "indexed"
|
||||||
|
return SimpleNamespace(
|
||||||
|
jobId=f"{job.sourceKind}:{job.sourceId}",
|
||||||
|
status=status, contentHash="h",
|
||||||
|
fileId=job.sourceId, index=None, error=None,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def _fakeRunExtraction(data, name, mime, options):
|
||||||
|
return SimpleNamespace(
|
||||||
|
parts=[
|
||||||
|
SimpleNamespace(
|
||||||
|
id="p1",
|
||||||
|
data=data.decode("utf-8") if isinstance(data, bytes) else str(data),
|
||||||
|
typeGroup="text",
|
||||||
|
label="page:1",
|
||||||
|
metadata={"pageIndex": 0},
|
||||||
|
)
|
||||||
|
]
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def test_bootstrap_walks_drive_and_subfolders():
|
||||||
|
adapter = _FakeDriveAdapter()
|
||||||
|
knowledge = _FakeKnowledgeService()
|
||||||
|
connection = SimpleNamespace(mandateId="m1", userId="u1")
|
||||||
|
|
||||||
|
async def _run():
|
||||||
|
return await bootstrapGdrive(
|
||||||
|
connectionId="c1",
|
||||||
|
adapter=adapter,
|
||||||
|
connection=connection,
|
||||||
|
knowledgeService=knowledge,
|
||||||
|
runExtractionFn=_fakeRunExtraction,
|
||||||
|
limits=GdriveBootstrapLimits(maxAgeDays=None),
|
||||||
|
)
|
||||||
|
|
||||||
|
result = asyncio.run(_run())
|
||||||
|
assert len(knowledge.calls) == 3
|
||||||
|
sourceIds = {c.sourceId for c in knowledge.calls}
|
||||||
|
assert sourceIds == {
|
||||||
|
_syntheticFileId("c1", "f1"),
|
||||||
|
_syntheticFileId("c1", "f2"),
|
||||||
|
_syntheticFileId("c1", "f3"),
|
||||||
|
}
|
||||||
|
assert result["indexed"] == 3
|
||||||
|
assert result["skippedDuplicate"] == 0
|
||||||
|
assert adapter.downloaded == ["/f1", "/f2", "/f3"]
|
||||||
|
|
||||||
|
|
||||||
|
def test_bootstrap_reports_duplicates_on_second_run():
|
||||||
|
adapter = _FakeDriveAdapter()
|
||||||
|
duplicateIds = {
|
||||||
|
_syntheticFileId("c1", "f1"),
|
||||||
|
_syntheticFileId("c1", "f2"),
|
||||||
|
_syntheticFileId("c1", "f3"),
|
||||||
|
}
|
||||||
|
knowledge = _FakeKnowledgeService(duplicateIds=duplicateIds)
|
||||||
|
connection = SimpleNamespace(mandateId="m1", userId="u1")
|
||||||
|
|
||||||
|
async def _run():
|
||||||
|
return await bootstrapGdrive(
|
||||||
|
connectionId="c1",
|
||||||
|
adapter=adapter,
|
||||||
|
connection=connection,
|
||||||
|
knowledgeService=knowledge,
|
||||||
|
runExtractionFn=_fakeRunExtraction,
|
||||||
|
limits=GdriveBootstrapLimits(maxAgeDays=None),
|
||||||
|
)
|
||||||
|
|
||||||
|
result = asyncio.run(_run())
|
||||||
|
assert result["indexed"] == 0
|
||||||
|
assert result["skippedDuplicate"] == 3
|
||||||
|
|
||||||
|
|
||||||
|
def test_bootstrap_skips_files_older_than_maxAgeDays():
|
||||||
|
adapter = _FakeDriveAdapter(recent_only=False) # f2 is 400 days old
|
||||||
|
knowledge = _FakeKnowledgeService()
|
||||||
|
connection = SimpleNamespace(mandateId="m1", userId="u1")
|
||||||
|
|
||||||
|
async def _run():
|
||||||
|
return await bootstrapGdrive(
|
||||||
|
connectionId="c1",
|
||||||
|
adapter=adapter,
|
||||||
|
connection=connection,
|
||||||
|
knowledgeService=knowledge,
|
||||||
|
runExtractionFn=_fakeRunExtraction,
|
||||||
|
limits=GdriveBootstrapLimits(maxAgeDays=180),
|
||||||
|
)
|
||||||
|
|
||||||
|
result = asyncio.run(_run())
|
||||||
|
assert result["indexed"] == 2 # f1, f3
|
||||||
|
assert result["skippedPolicy"] == 1 # f2 filtered out
|
||||||
|
|
||||||
|
|
||||||
|
def test_bootstrap_passes_connection_provenance():
|
||||||
|
adapter = _FakeDriveAdapter()
|
||||||
|
knowledge = _FakeKnowledgeService()
|
||||||
|
connection = SimpleNamespace(mandateId="m1", userId="u1")
|
||||||
|
|
||||||
|
async def _run():
|
||||||
|
return await bootstrapGdrive(
|
||||||
|
connectionId="c1",
|
||||||
|
adapter=adapter,
|
||||||
|
connection=connection,
|
||||||
|
knowledgeService=knowledge,
|
||||||
|
runExtractionFn=_fakeRunExtraction,
|
||||||
|
limits=GdriveBootstrapLimits(maxAgeDays=None),
|
||||||
|
)
|
||||||
|
|
||||||
|
asyncio.run(_run())
|
||||||
|
for job in knowledge.calls:
|
||||||
|
assert job.sourceKind == "gdrive_item"
|
||||||
|
assert job.mandateId == "m1"
|
||||||
|
assert job.provenance["connectionId"] == "c1"
|
||||||
|
assert job.provenance["authority"] == "google"
|
||||||
|
assert job.provenance["service"] == "drive"
|
||||||
|
assert job.contentVersion # modifiedTime ISO string
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
test_bootstrap_walks_drive_and_subfolders()
|
||||||
|
test_bootstrap_reports_duplicates_on_second_run()
|
||||||
|
test_bootstrap_skips_files_older_than_maxAgeDays()
|
||||||
|
test_bootstrap_passes_connection_provenance()
|
||||||
|
print("OK — bootstrapGdrive tests passed")
|
||||||
240
tests/unit/services/test_bootstrap_gmail.py
Normal file
240
tests/unit/services/test_bootstrap_gmail.py
Normal file
|
|
@ -0,0 +1,240 @@
|
||||||
|
#!/usr/bin/env python3
|
||||||
|
# Copyright (c) 2025 Patrick Motsch
|
||||||
|
# All rights reserved.
|
||||||
|
"""Bootstrap Gmail tests with a fake googleGet + knowledge service.
|
||||||
|
|
||||||
|
Verifies:
|
||||||
|
- Default labels (INBOX + SENT) are traversed.
|
||||||
|
- Each message produces a requestIngestion call with sourceKind=gmail_message
|
||||||
|
and structured contentObjects (header / snippet / body).
|
||||||
|
- Pagination via `nextPageToken` is followed.
|
||||||
|
- historyId is forwarded as contentVersion → idempotency.
|
||||||
|
- MIME body extraction walks nested parts (multipart/alternative).
|
||||||
|
"""
|
||||||
|
|
||||||
|
import asyncio
|
||||||
|
import base64
|
||||||
|
import os
|
||||||
|
import sys
|
||||||
|
from types import SimpleNamespace
|
||||||
|
|
||||||
|
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "../../.."))
|
||||||
|
|
||||||
|
from modules.serviceCenter.services.serviceKnowledge.subConnectorSyncGmail import (
|
||||||
|
bootstrapGmail,
|
||||||
|
GmailBootstrapLimits,
|
||||||
|
_syntheticMessageId,
|
||||||
|
_buildContentObjects,
|
||||||
|
_walkPayloadForBody,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def _b64url(text: str) -> str:
|
||||||
|
return base64.urlsafe_b64encode(text.encode("utf-8")).decode("ascii").rstrip("=")
|
||||||
|
|
||||||
|
|
||||||
|
def _msg(mid: str, subject: str = "Hi", body: str = "Hello world", historyId: str = "h1"):
|
||||||
|
return {
|
||||||
|
"id": mid,
|
||||||
|
"threadId": f"thread-{mid}",
|
||||||
|
"historyId": historyId,
|
||||||
|
"internalDate": "1700000000000",
|
||||||
|
"snippet": body[:120],
|
||||||
|
"payload": {
|
||||||
|
"headers": [
|
||||||
|
{"name": "Subject", "value": subject},
|
||||||
|
{"name": "From", "value": "Alice <a@x.com>"},
|
||||||
|
{"name": "To", "value": "Bob <b@x.com>"},
|
||||||
|
{"name": "Date", "value": "Tue, 01 Jan 2025 10:00:00 +0000"},
|
||||||
|
],
|
||||||
|
"mimeType": "text/plain",
|
||||||
|
"body": {"data": _b64url(body), "size": len(body)},
|
||||||
|
"parts": [],
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
class _FakeGoogleGet:
|
||||||
|
"""Records URLs + returns the wired-up page or message response."""
|
||||||
|
|
||||||
|
def __init__(self, messages_by_label, paginated_label=None, page2=None):
|
||||||
|
self._messages = messages_by_label
|
||||||
|
self._paginated = paginated_label
|
||||||
|
self._page2 = page2 or []
|
||||||
|
self._served_first_page = set()
|
||||||
|
self.requested = []
|
||||||
|
|
||||||
|
async def __call__(self, url: str):
|
||||||
|
self.requested.append(url)
|
||||||
|
# List page: contains `/users/me/messages?labelIds=...`
|
||||||
|
if "/users/me/messages?" in url:
|
||||||
|
for label, msgs in self._messages.items():
|
||||||
|
if f"labelIds={label}" in url:
|
||||||
|
if (
|
||||||
|
label == self._paginated
|
||||||
|
and label not in self._served_first_page
|
||||||
|
):
|
||||||
|
self._served_first_page.add(label)
|
||||||
|
return {
|
||||||
|
"messages": [{"id": m["id"]} for m in msgs],
|
||||||
|
"nextPageToken": "token-2",
|
||||||
|
}
|
||||||
|
if label == self._paginated and "pageToken=token-2" in url:
|
||||||
|
return {
|
||||||
|
"messages": [{"id": m["id"]} for m in self._page2],
|
||||||
|
}
|
||||||
|
return {"messages": [{"id": m["id"]} for m in msgs]}
|
||||||
|
return {"messages": []}
|
||||||
|
# Detail fetch: /users/me/messages/{id}?format=full
|
||||||
|
if "/users/me/messages/" in url and "format=full" in url:
|
||||||
|
msgId = url.split("/users/me/messages/")[-1].split("?")[0]
|
||||||
|
for msgs in self._messages.values():
|
||||||
|
for m in msgs:
|
||||||
|
if m["id"] == msgId:
|
||||||
|
return m
|
||||||
|
for m in self._page2:
|
||||||
|
if m["id"] == msgId:
|
||||||
|
return m
|
||||||
|
return {"error": "not found"}
|
||||||
|
|
||||||
|
|
||||||
|
class _FakeKnowledgeService:
|
||||||
|
def __init__(self, duplicateIds=None):
|
||||||
|
self.calls = []
|
||||||
|
self._duplicates = duplicateIds or set()
|
||||||
|
|
||||||
|
async def requestIngestion(self, job):
|
||||||
|
self.calls.append(job)
|
||||||
|
status = "duplicate" if job.sourceId in self._duplicates else "indexed"
|
||||||
|
return SimpleNamespace(
|
||||||
|
jobId=job.sourceId, status=status, contentHash="h",
|
||||||
|
fileId=job.sourceId, index=None, error=None,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def test_buildContentObjects_emits_header_snippet_body():
|
||||||
|
parts = _buildContentObjects(_msg("m1", body="Hello\nWorld"), maxBodyChars=8000)
|
||||||
|
ids = [p["contentObjectId"] for p in parts]
|
||||||
|
assert ids == ["header", "snippet", "body"]
|
||||||
|
header = parts[0]["data"]
|
||||||
|
assert "Subject: Hi" in header
|
||||||
|
assert "From: Alice <a@x.com>" in header
|
||||||
|
assert "To: Bob <b@x.com>" in header
|
||||||
|
|
||||||
|
|
||||||
|
def test_walkPayloadForBody_prefers_plain_over_html():
|
||||||
|
payload = {
|
||||||
|
"mimeType": "multipart/alternative",
|
||||||
|
"parts": [
|
||||||
|
{"mimeType": "text/plain", "body": {"data": _b64url("plain body")}},
|
||||||
|
{"mimeType": "text/html", "body": {"data": _b64url("<p>html body</p>")}},
|
||||||
|
],
|
||||||
|
}
|
||||||
|
bodies = _walkPayloadForBody(payload)
|
||||||
|
assert bodies["text"] == "plain body"
|
||||||
|
assert bodies["html"] == "<p>html body</p>"
|
||||||
|
|
||||||
|
|
||||||
|
def test_walkPayloadForBody_falls_back_to_html():
|
||||||
|
payload = {
|
||||||
|
"mimeType": "multipart/alternative",
|
||||||
|
"parts": [
|
||||||
|
{"mimeType": "text/html", "body": {"data": _b64url("<p>only html</p>")}},
|
||||||
|
],
|
||||||
|
}
|
||||||
|
bodies = _walkPayloadForBody(payload)
|
||||||
|
assert bodies["text"] == ""
|
||||||
|
assert "only html" in bodies["html"]
|
||||||
|
|
||||||
|
|
||||||
|
def test_bootstrap_gmail_indexes_messages_from_inbox_and_sent():
|
||||||
|
fake_get = _FakeGoogleGet({
|
||||||
|
"INBOX": [_msg("m1"), _msg("m2")],
|
||||||
|
"SENT": [_msg("m3")],
|
||||||
|
})
|
||||||
|
knowledge = _FakeKnowledgeService()
|
||||||
|
connection = SimpleNamespace(mandateId="m1", userId="u1")
|
||||||
|
|
||||||
|
async def _run():
|
||||||
|
return await bootstrapGmail(
|
||||||
|
connectionId="c1",
|
||||||
|
adapter=SimpleNamespace(_token="t"),
|
||||||
|
connection=connection,
|
||||||
|
knowledgeService=knowledge,
|
||||||
|
limits=GmailBootstrapLimits(maxAgeDays=None),
|
||||||
|
googleGetFn=fake_get,
|
||||||
|
)
|
||||||
|
|
||||||
|
result = asyncio.run(_run())
|
||||||
|
assert result["indexed"] == 3
|
||||||
|
sourceIds = {c.sourceId for c in knowledge.calls}
|
||||||
|
assert sourceIds == {
|
||||||
|
_syntheticMessageId("c1", "m1"),
|
||||||
|
_syntheticMessageId("c1", "m2"),
|
||||||
|
_syntheticMessageId("c1", "m3"),
|
||||||
|
}
|
||||||
|
for job in knowledge.calls:
|
||||||
|
assert job.sourceKind == "gmail_message"
|
||||||
|
assert job.mimeType == "message/rfc822"
|
||||||
|
assert job.provenance["connectionId"] == "c1"
|
||||||
|
assert job.provenance["authority"] == "google"
|
||||||
|
assert job.provenance["service"] == "gmail"
|
||||||
|
assert job.contentVersion == "h1"
|
||||||
|
assert any(co["contentObjectId"] == "header" for co in job.contentObjects)
|
||||||
|
|
||||||
|
|
||||||
|
def test_bootstrap_gmail_follows_pagination():
|
||||||
|
fake_get = _FakeGoogleGet(
|
||||||
|
messages_by_label={"INBOX": [_msg("m1")], "SENT": []},
|
||||||
|
paginated_label="INBOX",
|
||||||
|
page2=[_msg("m2"), _msg("m3")],
|
||||||
|
)
|
||||||
|
knowledge = _FakeKnowledgeService()
|
||||||
|
connection = SimpleNamespace(mandateId="m1", userId="u1")
|
||||||
|
|
||||||
|
async def _run():
|
||||||
|
return await bootstrapGmail(
|
||||||
|
connectionId="c1",
|
||||||
|
adapter=SimpleNamespace(_token="t"),
|
||||||
|
connection=connection,
|
||||||
|
knowledgeService=knowledge,
|
||||||
|
limits=GmailBootstrapLimits(maxAgeDays=None),
|
||||||
|
googleGetFn=fake_get,
|
||||||
|
)
|
||||||
|
|
||||||
|
result = asyncio.run(_run())
|
||||||
|
assert result["indexed"] == 3
|
||||||
|
|
||||||
|
|
||||||
|
def test_bootstrap_gmail_reports_duplicates():
|
||||||
|
fake_get = _FakeGoogleGet({"INBOX": [_msg("m1"), _msg("m2")], "SENT": []})
|
||||||
|
duplicates = {
|
||||||
|
_syntheticMessageId("c1", "m1"),
|
||||||
|
_syntheticMessageId("c1", "m2"),
|
||||||
|
}
|
||||||
|
knowledge = _FakeKnowledgeService(duplicateIds=duplicates)
|
||||||
|
connection = SimpleNamespace(mandateId="m1", userId="u1")
|
||||||
|
|
||||||
|
async def _run():
|
||||||
|
return await bootstrapGmail(
|
||||||
|
connectionId="c1",
|
||||||
|
adapter=SimpleNamespace(_token="t"),
|
||||||
|
connection=connection,
|
||||||
|
knowledgeService=knowledge,
|
||||||
|
limits=GmailBootstrapLimits(maxAgeDays=None),
|
||||||
|
googleGetFn=fake_get,
|
||||||
|
)
|
||||||
|
|
||||||
|
result = asyncio.run(_run())
|
||||||
|
assert result["indexed"] == 0
|
||||||
|
assert result["skippedDuplicate"] == 2
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
test_buildContentObjects_emits_header_snippet_body()
|
||||||
|
test_walkPayloadForBody_prefers_plain_over_html()
|
||||||
|
test_walkPayloadForBody_falls_back_to_html()
|
||||||
|
test_bootstrap_gmail_indexes_messages_from_inbox_and_sent()
|
||||||
|
test_bootstrap_gmail_follows_pagination()
|
||||||
|
test_bootstrap_gmail_reports_duplicates()
|
||||||
|
print("OK — bootstrapGmail tests passed")
|
||||||
|
|
@ -99,17 +99,18 @@ def test_onConnectionRevoked_ignores_missing_id(monkeypatch):
|
||||||
assert seen == []
|
assert seen == []
|
||||||
|
|
||||||
|
|
||||||
def test_bootstrap_job_skips_non_pilot_authority(monkeypatch):
|
def test_bootstrap_job_skips_unsupported_authority(monkeypatch):
|
||||||
async def _run():
|
async def _run():
|
||||||
result = await consumer._bootstrapJobHandler(
|
result = await consumer._bootstrapJobHandler(
|
||||||
{"payload": {"connectionId": "c1", "authority": "google"}},
|
{"payload": {"connectionId": "c1", "authority": "slack"}},
|
||||||
lambda *_: None,
|
lambda *_: None,
|
||||||
)
|
)
|
||||||
return result
|
return result
|
||||||
|
|
||||||
result = asyncio.run(_run())
|
result = asyncio.run(_run())
|
||||||
assert result["skipped"] is True
|
assert result["skipped"] is True
|
||||||
assert result["authority"] == "google"
|
assert result["authority"] == "slack"
|
||||||
|
assert result["reason"] == "unsupported_authority"
|
||||||
|
|
||||||
|
|
||||||
def test_bootstrap_job_dispatches_msft_parts(monkeypatch):
|
def test_bootstrap_job_dispatches_msft_parts(monkeypatch):
|
||||||
|
|
@ -123,8 +124,6 @@ def test_bootstrap_job_dispatches_msft_parts(monkeypatch):
|
||||||
calls["ol"] += 1
|
calls["ol"] += 1
|
||||||
return {"indexed": 2}
|
return {"indexed": 2}
|
||||||
|
|
||||||
# subConnectorSync* are lazy-imported inside the handler; install fake
|
|
||||||
# modules before invoking.
|
|
||||||
fakeSharepoint = types.ModuleType("subConnectorSyncSharepoint")
|
fakeSharepoint = types.ModuleType("subConnectorSyncSharepoint")
|
||||||
fakeSharepoint.bootstrapSharepoint = _fakeSp
|
fakeSharepoint.bootstrapSharepoint = _fakeSp
|
||||||
fakeOutlook = types.ModuleType("subConnectorSyncOutlook")
|
fakeOutlook = types.ModuleType("subConnectorSyncOutlook")
|
||||||
|
|
@ -152,6 +151,70 @@ def test_bootstrap_job_dispatches_msft_parts(monkeypatch):
|
||||||
assert result["outlook"] == {"indexed": 2}
|
assert result["outlook"] == {"indexed": 2}
|
||||||
|
|
||||||
|
|
||||||
|
def test_bootstrap_job_dispatches_google_parts(monkeypatch):
|
||||||
|
calls = {"gd": 0, "gm": 0}
|
||||||
|
|
||||||
|
async def _fakeGd(connectionId, progressCb=None):
|
||||||
|
calls["gd"] += 1
|
||||||
|
return {"indexed": 7}
|
||||||
|
|
||||||
|
async def _fakeGm(connectionId, progressCb=None):
|
||||||
|
calls["gm"] += 1
|
||||||
|
return {"indexed": 11}
|
||||||
|
|
||||||
|
fakeGdrive = types.ModuleType("subConnectorSyncGdrive")
|
||||||
|
fakeGdrive.bootstrapGdrive = _fakeGd
|
||||||
|
fakeGmail = types.ModuleType("subConnectorSyncGmail")
|
||||||
|
fakeGmail.bootstrapGmail = _fakeGm
|
||||||
|
monkeypatch.setitem(
|
||||||
|
sys.modules,
|
||||||
|
"modules.serviceCenter.services.serviceKnowledge.subConnectorSyncGdrive",
|
||||||
|
fakeGdrive,
|
||||||
|
)
|
||||||
|
monkeypatch.setitem(
|
||||||
|
sys.modules,
|
||||||
|
"modules.serviceCenter.services.serviceKnowledge.subConnectorSyncGmail",
|
||||||
|
fakeGmail,
|
||||||
|
)
|
||||||
|
|
||||||
|
async def _run():
|
||||||
|
return await consumer._bootstrapJobHandler(
|
||||||
|
{"payload": {"connectionId": "c1", "authority": "google"}},
|
||||||
|
lambda *_: None,
|
||||||
|
)
|
||||||
|
|
||||||
|
result = asyncio.run(_run())
|
||||||
|
assert calls == {"gd": 1, "gm": 1}
|
||||||
|
assert result["drive"] == {"indexed": 7}
|
||||||
|
assert result["gmail"] == {"indexed": 11}
|
||||||
|
|
||||||
|
|
||||||
|
def test_bootstrap_job_dispatches_clickup_part(monkeypatch):
|
||||||
|
calls = {"cu": 0}
|
||||||
|
|
||||||
|
async def _fakeCu(connectionId, progressCb=None):
|
||||||
|
calls["cu"] += 1
|
||||||
|
return {"indexed": 4}
|
||||||
|
|
||||||
|
fakeClickup = types.ModuleType("subConnectorSyncClickup")
|
||||||
|
fakeClickup.bootstrapClickup = _fakeCu
|
||||||
|
monkeypatch.setitem(
|
||||||
|
sys.modules,
|
||||||
|
"modules.serviceCenter.services.serviceKnowledge.subConnectorSyncClickup",
|
||||||
|
fakeClickup,
|
||||||
|
)
|
||||||
|
|
||||||
|
async def _run():
|
||||||
|
return await consumer._bootstrapJobHandler(
|
||||||
|
{"payload": {"connectionId": "c1", "authority": "clickup"}},
|
||||||
|
lambda *_: None,
|
||||||
|
)
|
||||||
|
|
||||||
|
result = asyncio.run(_run())
|
||||||
|
assert calls == {"cu": 1}
|
||||||
|
assert result["clickup"] == {"indexed": 4}
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
# Usable without pytest fixtures for a quick smoke run.
|
# Usable without pytest fixtures for a quick smoke run.
|
||||||
class _MP:
|
class _MP:
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue