# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
"""ClickUp bootstrap for the unified knowledge ingestion lane.

ClickUp tasks are ingested as *virtual documents* — we never download file
bytes. Each task becomes a `sourceKind="clickup_task"` IngestionJob whose
`contentObjects` carry a summary header (name + status + metadata) and the
task description / text content so retrieval finds them without a live API
call.

Hierarchy traversal: workspace (team) → spaces → folders / folderless lists
→ tasks. We cap the fan-out with `maxWorkspaces` / `maxListsPerWorkspace` /
`maxTasks` and skip tasks older than `maxAgeDays` (default 180 d).

Idempotency: `date_updated` from the ClickUp task payload is a millisecond
timestamp and strictly monotonic per revision — used as `contentVersion`.
"""

from __future__ import annotations

import hashlib
import logging
import time
from dataclasses import dataclass, field
from datetime import datetime, timedelta, timezone
from typing import Any, Callable, Dict, List, Optional

logger = logging.getLogger(__name__)

MAX_TASKS_DEFAULT = 500
MAX_WORKSPACES_DEFAULT = 3
MAX_LISTS_PER_WORKSPACE_DEFAULT = 20
MAX_DESCRIPTION_CHARS_DEFAULT = 8000
MAX_AGE_DAYS_DEFAULT = 180


@dataclass
class ClickupBootstrapLimits:
    """Fan-out caps and content policy for one bootstrap run."""

    # Stop once indexed + duplicate tasks reach this count.
    maxTasks: int = MAX_TASKS_DEFAULT
    # Only the first N authorized teams are walked.
    maxWorkspaces: int = MAX_WORKSPACES_DEFAULT
    # At most N lists are collected per team (folderless + in folders).
    maxListsPerWorkspace: int = MAX_LISTS_PER_WORKSPACE_DEFAULT
    # Description and text_content are each truncated to this many chars.
    maxDescriptionChars: int = MAX_DESCRIPTION_CHARS_DEFAULT
    # Only ingest tasks updated within the last N days. None disables filter.
    maxAgeDays: Optional[int] = MAX_AGE_DAYS_DEFAULT
    # Include closed/archived tasks if they still meet the recency filter.
    # ClickUp `closed` tasks often carry the most useful RAG context
    # ("why was this shipped the way it was?").
    includeClosed: bool = True
    # Pass-through to IngestionJob.neutralize
    neutralize: bool = False
    # Content scope: "titles" | "title_description" | "with_comments"
    clickupScope: str = "title_description"


@dataclass
class ClickupBootstrapResult:
    """Mutable counters accumulated while walking one connection."""

    connectionId: str
    indexed: int = 0
    skippedDuplicate: int = 0
    skippedPolicy: int = 0
    failed: int = 0
    workspaces: int = 0
    lists: int = 0
    errors: List[str] = field(default_factory=list)


def _syntheticTaskId(connectionId: str, taskId: str) -> str:
    """Return a stable source id: `cu:<first 8 chars of connection>:<16 hex>`.

    The hex token is the truncated SHA-256 of `connectionId:taskId`, so the
    same task on the same connection always maps to the same id.
    """
    token = hashlib.sha256(f"{connectionId}:{taskId}".encode("utf-8")).hexdigest()[:16]
    return f"cu:{connectionId[:8]}:{token}"


def _truncate(value: Any, limit: int) -> str:
    """Stringify and strip *value*; cut to *limit* chars with a marker.

    Falsy values (None, "", 0) yield "". Text longer than *limit* is cut,
    right-stripped and suffixed with "\\n[truncated]".
    """
    text = str(value or "").strip()
    if not text:
        return ""
    if len(text) <= limit:
        return text
    return text[:limit].rstrip() + "\n[truncated]"


def _isRecent(dateUpdatedMs: Any, maxAgeDays: Optional[int]) -> bool:
    """Return True when the task passes the recency filter.

    The filter is disabled (always True) when *maxAgeDays* is None, zero or
    negative — the same convention `bootstrapClickup` applies to stored
    preferences. Missing or unparseable timestamps also pass, so a task is
    never dropped merely because its metadata is incomplete.
    """
    if maxAgeDays is None or maxAgeDays <= 0:
        return True
    if not dateUpdatedMs:
        return True
    try:
        ts = datetime.fromtimestamp(int(dateUpdatedMs) / 1000.0, tz=timezone.utc)
    except (TypeError, ValueError, OverflowError, OSError):
        return True
    cutoff = datetime.now(timezone.utc) - timedelta(days=maxAgeDays)
    return ts >= cutoff


def _namedLine(label: str, info: Any) -> str:
    """Return "<label>: <name>" when *info* is a dict with a non-empty name."""
    name = info.get("name") if isinstance(info, dict) else None
    return f"{label}: {name}" if name else ""


def _buildContentObjects(task: Dict[str, Any], limits: ClickupBootstrapLimits) -> List[Dict[str, Any]]:
    """Header (name/status/metadata) + optional description + text_content.

    `limits.clickupScope` controls how much is embedded:
      - "titles": task name + status metadata only
      - "title_description": header + description / text_content (default)
      - "with_comments": header + description + text_content (comments
        themselves are not yet fetched in v1)

    Returns a list of content-object dicts; the first is always the header.
    Header lines whose value is missing are omitted rather than emitted empty.
    """
    name = task.get("name") or f"Task {task.get('id', '')}"
    status = ((task.get("status") or {}).get("status")) or ""
    assignees = ", ".join(
        filter(None, [
            (a.get("username") or a.get("email") or "")
            for a in (task.get("assignees") or [])
            if isinstance(a, dict)
        ])
    )
    tags = ", ".join(
        filter(None, [
            t.get("name", "")
            for t in (task.get("tags") or [])
            if isinstance(t, dict)
        ])
    )

    dueMs = task.get("due_date")
    dueIso = ""
    if dueMs:
        try:
            dueIso = datetime.fromtimestamp(int(dueMs) / 1000.0, tz=timezone.utc).strftime("%Y-%m-%d")
        except (TypeError, ValueError, OverflowError, OSError):
            dueIso = ""

    headerLines = [
        f"Task: {name}",
        f"Status: {status}" if status else "",
        # A list/folder/space without a usable name contributes no line;
        # this avoids "List: None" when the injected list name is missing.
        _namedLine("List", task.get("list")),
        _namedLine("Folder", task.get("folder")),
        _namedLine("Space", task.get("space")),
        f"Assignees: {assignees}" if assignees else "",
        f"Tags: {tags}" if tags else "",
        f"Due: {dueIso}" if dueIso else "",
        f"Url: {task.get('url', '')}" if task.get("url") else "",
    ]
    header = "\n".join(line for line in headerLines if line)

    parts: List[Dict[str, Any]] = [{
        "contentObjectId": "header",
        "contentType": "text",
        "data": header,
        "contextRef": {"part": "header"},
    }]

    scope = getattr(limits, "clickupScope", "title_description")
    if scope in ("title_description", "with_comments"):
        description = _truncate(task.get("description"), limits.maxDescriptionChars)
        if description:
            parts.append({
                "contentObjectId": "description",
                "contentType": "text",
                "data": description,
                "contextRef": {"part": "description"},
            })
        # text_content is ClickUp's rendered-markdown version; include if it
        # adds something beyond the plain description (common for bullet
        # lists, checklists).
        textContent = _truncate(task.get("text_content"), limits.maxDescriptionChars)
        if textContent and textContent != description:
            parts.append({
                "contentObjectId": "text_content",
                "contentType": "text",
                "data": textContent,
                "contextRef": {"part": "text_content"},
            })
    return parts


async def bootstrapClickup(
    connectionId: str,
    *,
    progressCb: Optional[Callable[[int, Optional[str]], None]] = None,
    adapter: Any = None,
    connection: Any = None,
    knowledgeService: Any = None,
    limits: Optional[ClickupBootstrapLimits] = None,
) -> Dict[str, Any]:
    """Walk workspaces → lists → tasks and ingest each task as a virtual doc.

    Args:
        connectionId: Id of the user connection to bootstrap.
        progressCb: Optional callback receiving (percent, message).
        adapter / connection / knowledgeService: Injected dependencies; if any
            of the three is None, all three are resolved from the connection.
        limits: Explicit limits. When None, they are derived from the stored
            connection preferences.

    Returns:
        Summary dict of counters, duration and up to 20 error strings.

    Raises:
        ValueError: From dependency resolution when the connection or its
            token cannot be found.
    """
    if limits is None:
        # Preferences are only needed to derive limits, so skip the lookup
        # entirely when the caller supplies them explicitly.
        from modules.serviceCenter.services.serviceKnowledge.subConnectorPrefs import loadConnectionPrefs

        prefs = loadConnectionPrefs(connectionId)
        prefMaxAge = prefs.maxAgeDays
        limits = ClickupBootstrapLimits(
            maxAgeDays=prefMaxAge if prefMaxAge and prefMaxAge > 0 else None,
            neutralize=prefs.neutralizeBeforeEmbed,
            clickupScope=prefs.clickupScope,
        )

    # NOTE: despite the name this is a time.time() value in seconds;
    # _finalizeResult converts the elapsed span to milliseconds.
    startMs = time.time()
    result = ClickupBootstrapResult(connectionId=connectionId)
    logger.info(
        "ingestion.connection.bootstrap.started part=clickup connectionId=%s",
        connectionId,
        extra={
            "event": "ingestion.connection.bootstrap.started",
            "part": "clickup",
            "connectionId": connectionId,
        },
    )

    if adapter is None or knowledgeService is None or connection is None:
        adapter, connection, knowledgeService = await _resolveDependencies(connectionId)

    mandateId = str(getattr(connection, "mandateId", "") or "") if connection is not None else ""
    userId = str(getattr(connection, "userId", "") or "") if connection is not None else ""

    svc = getattr(adapter, "_svc", None)
    if svc is None:
        result.errors.append("adapter missing _svc instance")
        return _finalizeResult(connectionId, result, startMs)

    try:
        teamsResp = await svc.getAuthorizedTeams()
    except Exception as exc:
        logger.error("clickup team discovery failed for %s: %s", connectionId, exc, exc_info=True)
        result.errors.append(f"teams: {exc}")
        return _finalizeResult(connectionId, result, startMs)

    teams = (teamsResp or {}).get("teams") or []
    for team in teams[: limits.maxWorkspaces]:
        if result.indexed + result.skippedDuplicate >= limits.maxTasks:
            break
        teamId = str(team.get("id", "") or "")
        if not teamId:
            continue
        result.workspaces += 1
        try:
            await _walkTeam(
                svc=svc,
                knowledgeService=knowledgeService,
                connectionId=connectionId,
                mandateId=mandateId,
                userId=userId,
                team=team,
                limits=limits,
                result=result,
                progressCb=progressCb,
            )
        except Exception as exc:
            # One failing team must not abort the remaining teams.
            logger.error("clickup team %s walk failed: %s", teamId, exc, exc_info=True)
            result.errors.append(f"team({teamId}): {exc}")

    return _finalizeResult(connectionId, result, startMs)


async def _resolveDependencies(connectionId: str):
    """Build (adapter, connection, knowledgeService) for *connectionId*.

    Raises:
        ValueError: When the connection does not exist or no usable access
            token is available for it.
    """
    from modules.interfaces.interfaceDbApp import getRootInterface
    from modules.auth import TokenManager
    from modules.connectors.providerClickup.connectorClickup import ClickupConnector
    from modules.serviceCenter import getService
    from modules.serviceCenter.context import ServiceCenterContext
    from modules.security.rootAccess import getRootUser

    rootInterface = getRootInterface()
    connection = rootInterface.getUserConnectionById(connectionId)
    if connection is None:
        raise ValueError(f"UserConnection not found: {connectionId}")

    token = TokenManager().getFreshToken(connectionId)
    if not token or not token.tokenAccess:
        raise ValueError(f"No valid token for connection {connectionId}")

    provider = ClickupConnector(connection, token.tokenAccess)
    adapter = provider.getServiceAdapter("clickup")

    rootUser = getRootUser()
    ctx = ServiceCenterContext(
        user=rootUser,
        mandate_id=str(getattr(connection, "mandateId", "") or ""),
    )
    knowledgeService = getService("knowledge", ctx)
    return adapter, connection, knowledgeService


async def _walkTeam(
    *,
    svc,
    knowledgeService,
    connectionId: str,
    mandateId: str,
    userId: str,
    team: Dict[str, Any],
    limits: ClickupBootstrapLimits,
    result: ClickupBootstrapResult,
    progressCb: Optional[Callable[[int, Optional[str]], None]],
) -> None:
    """Collect up to `maxListsPerWorkspace` lists of one team, then walk them.

    Each collected list dict is annotated with `_space` (and `_folder` for
    lists inside folders) so tasks can later be labelled without extra calls.
    """
    teamId = str(team.get("id", "") or "")
    spacesResp = await svc.getSpaces(teamId)
    spaces = (spacesResp or {}).get("spaces") or []

    listsCollected: List[Dict[str, Any]] = []
    for space in spaces:
        if len(listsCollected) >= limits.maxListsPerWorkspace:
            break
        spaceId = str(space.get("id", "") or "")
        if not spaceId:
            continue

        # Folderless lists directly under the space
        folderless = await svc.getFolderlessLists(spaceId)
        for lst in (folderless or {}).get("lists") or []:
            if len(listsCollected) >= limits.maxListsPerWorkspace:
                break
            listsCollected.append({**lst, "_space": space})

        # Lists inside folders
        foldersResp = await svc.getFolders(spaceId)
        for folder in (foldersResp or {}).get("folders") or []:
            if len(listsCollected) >= limits.maxListsPerWorkspace:
                break
            folderId = str(folder.get("id", "") or "")
            if not folderId:
                continue
            folderLists = await svc.getListsInFolder(folderId)
            for lst in (folderLists or {}).get("lists") or []:
                if len(listsCollected) >= limits.maxListsPerWorkspace:
                    break
                listsCollected.append({**lst, "_space": space, "_folder": folder})

    for lst in listsCollected:
        if result.indexed + result.skippedDuplicate >= limits.maxTasks:
            return
        result.lists += 1
        await _walkList(
            svc=svc,
            knowledgeService=knowledgeService,
            connectionId=connectionId,
            mandateId=mandateId,
            userId=userId,
            teamId=teamId,
            lst=lst,
            limits=limits,
            result=result,
            progressCb=progressCb,
        )


async def _walkList(
    *,
    svc,
    knowledgeService,
    connectionId: str,
    mandateId: str,
    userId: str,
    teamId: str,
    lst: Dict[str, Any],
    limits: ClickupBootstrapLimits,
    result: ClickupBootstrapResult,
    progressCb: Optional[Callable[[int, Optional[str]], None]],
) -> None:
    """Page through one list's tasks and ingest those passing the age filter.

    Stops on an error response, an empty page, a short page (< 100 tasks) or
    once the `maxTasks` budget is used up.
    """
    listId = str(lst.get("id", "") or "")
    if not listId:
        return

    page = 0
    while result.indexed + result.skippedDuplicate < limits.maxTasks:
        resp = await svc.getTasksInList(
            listId,
            page=page,
            include_closed=limits.includeClosed,
            subtasks=True,
        )
        if isinstance(resp, dict) and resp.get("error"):
            logger.warning("clickup tasks list=%s page=%d error: %s", listId, page, resp.get("error"))
            result.errors.append(f"list({listId}): {resp.get('error')}")
            return

        tasks = (resp or {}).get("tasks") or []
        if not tasks:
            return

        for task in tasks:
            if result.indexed + result.skippedDuplicate >= limits.maxTasks:
                return
            if not _isRecent(task.get("date_updated"), limits.maxAgeDays):
                result.skippedPolicy += 1
                continue
            # Inject the list/folder/space metadata we already loaded.
            task["list"] = task.get("list") or {"id": listId, "name": lst.get("name")}
            task["folder"] = task.get("folder") or lst.get("_folder") or {}
            task["space"] = task.get("space") or lst.get("_space") or {}
            await _ingestTask(
                knowledgeService=knowledgeService,
                connectionId=connectionId,
                mandateId=mandateId,
                userId=userId,
                teamId=teamId,
                task=task,
                limits=limits,
                result=result,
                progressCb=progressCb,
            )

        if len(tasks) < 100:
            # ClickUp page-size hint: fewer than 100 => last page
            return
        page += 1


async def _ingestTask(
    *,
    knowledgeService,
    connectionId: str,
    mandateId: str,
    userId: str,
    teamId: str,
    task: Dict[str, Any],
    limits: ClickupBootstrapLimits,
    result: ClickupBootstrapResult,
    progressCb: Optional[Callable[[int, Optional[str]], None]],
) -> None:
    """Submit one task as an IngestionJob and update the counters in *result*.

    Tasks without an id count as `skippedPolicy`. Exceptions from the
    knowledge service are logged and counted as `failed`, never propagated.
    Progress is reported every 50 processed (indexed + duplicate) tasks.
    """
    from modules.serviceCenter.services.serviceKnowledge.mainServiceKnowledge import IngestionJob

    taskId = str(task.get("id", "") or "")
    if not taskId:
        result.skippedPolicy += 1
        return

    revision = str(task.get("date_updated") or task.get("date_created") or "")
    name = task.get("name") or f"Task {taskId}"
    syntheticId = _syntheticTaskId(connectionId, taskId)
    fileName = f"{name[:80].strip() or taskId}.task.json"
    contentObjects = _buildContentObjects(task, limits)

    try:
        handle = await knowledgeService.requestIngestion(
            IngestionJob(
                sourceKind="clickup_task",
                sourceId=syntheticId,
                fileName=fileName,
                mimeType="application/vnd.clickup.task+json",
                userId=userId,
                mandateId=mandateId,
                contentObjects=contentObjects,
                contentVersion=revision or None,
                neutralize=limits.neutralize,
                provenance={
                    "connectionId": connectionId,
                    "authority": "clickup",
                    "service": "clickup",
                    "externalItemId": taskId,
                    "teamId": teamId,
                    "listId": ((task.get("list") or {}).get("id")),
                    "spaceId": ((task.get("space") or {}).get("id")),
                    "url": task.get("url"),
                    "status": ((task.get("status") or {}).get("status")),
                    "tier": limits.clickupScope,
                },
            )
        )
    except Exception as exc:
        logger.error("clickup ingestion %s failed: %s", taskId, exc, exc_info=True)
        result.failed += 1
        result.errors.append(f"ingest({taskId}): {exc}")
        return

    if handle.status == "duplicate":
        result.skippedDuplicate += 1
    elif handle.status == "indexed":
        result.indexed += 1
    else:
        # Nothing was processed, so the progress counter did not move;
        # returning here avoids re-reporting the same milestone (or
        # "processed=0") after a failed task.
        result.failed += 1
        return

    if progressCb is None:
        return
    processed = result.indexed + result.skippedDuplicate
    if processed % 50 != 0:
        return

    try:
        progressCb(
            min(90, 10 + int(80 * processed / max(1, limits.maxTasks))),
            f"clickup processed={processed}",
        )
    except Exception as exc:
        # Progress reporting is best-effort and must never break ingestion.
        logger.debug("clickup progress callback failed: %s", exc)
    logger.info(
        "ingestion.connection.bootstrap.progress part=clickup processed=%d skippedDup=%d failed=%d",
        processed,
        result.skippedDuplicate,
        result.failed,
        extra={
            "event": "ingestion.connection.bootstrap.progress",
            "part": "clickup",
            "connectionId": connectionId,
            "processed": processed,
            "skippedDup": result.skippedDuplicate,
            "failed": result.failed,
        },
    )


def _finalizeResult(connectionId: str, result: ClickupBootstrapResult, startMs: float) -> Dict[str, Any]:
    """Log the "done" event and convert *result* into the summary dict.

    *startMs* is the `time.time()` value (seconds) captured at start; the
    elapsed time is reported in milliseconds. At most 20 errors are returned.
    """
    durationMs = int((time.time() - startMs) * 1000)
    logger.info(
        "ingestion.connection.bootstrap.done part=clickup connectionId=%s "
        "indexed=%d skippedDup=%d skippedPolicy=%d failed=%d "
        "workspaces=%d lists=%d durationMs=%d",
        connectionId,
        result.indexed,
        result.skippedDuplicate,
        result.skippedPolicy,
        result.failed,
        result.workspaces,
        result.lists,
        durationMs,
        extra={
            "event": "ingestion.connection.bootstrap.done",
            "part": "clickup",
            "connectionId": connectionId,
            "indexed": result.indexed,
            "skippedDup": result.skippedDuplicate,
            "skippedPolicy": result.skippedPolicy,
            "failed": result.failed,
            "workspaces": result.workspaces,
            "lists": result.lists,
            "durationMs": durationMs,
        },
    )
    return {
        "connectionId": result.connectionId,
        "indexed": result.indexed,
        "skippedDuplicate": result.skippedDuplicate,
        "skippedPolicy": result.skippedPolicy,
        "failed": result.failed,
        "workspaces": result.workspaces,
        "lists": result.lists,
        "durationMs": durationMs,
        "errors": result.errors[:20],
    }