# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
"""Centralized RAG bootstrap limits + DataSource-scoped resolution.

The original walkers (SharePoint, kDrive, gDrive, ClickUp) each carried their
own module-level `MAX_*_DEFAULT` constants and silently stopped indexing once
they were exceeded. That made it impossible for a user with a 500 MB folder
to override the 200 MB cap without a code change.

This module is the single source of truth for two things:

1. The canonical default budget per source kind (`FILES_LIMITS_DEFAULT`,
   `CLICKUP_LIMITS_DEFAULT`). Walkers fall back to these when a DataSource
   has no `settings.ragLimits` yet.
2. The pure read/lazy-fill helpers that walkers and the API use to merge a
   DataSource's stored settings with the defaults.

No override layers, no resolver chain: what is in
`DataSource.settings.ragLimits` is what the walker uses.

Lazy fill: the first time a DataSource is processed, the defaults are written
to its `settings.ragLimits` so the UI shows real values immediately, even if
the user has never opened the settings modal.
"""

from __future__ import annotations

import logging
from typing import Any, Dict, Optional

logger = logging.getLogger(__name__)

# Default budget for the file-based walkers (SharePoint, kDrive, gDrive).
FILES_LIMITS_DEFAULT: Dict[str, int] = {
    "maxItems": 500,
    "maxBytes": 200 * 1024 * 1024,
    "maxFileSize": 25 * 1024 * 1024,
    "maxDepth": 4,
}

# Default budget for the ClickUp walker.
CLICKUP_LIMITS_DEFAULT: Dict[str, int] = {
    "maxTasks": 500,
    "maxWorkspaces": 3,
    "maxListsPerWorkspace": 20,
}

# Walker kind -> default budget. The keys of each budget double as the
# whitelist of limit names accepted from `settings.ragLimits`.
_LIMITS_BY_KIND: Dict[str, Dict[str, int]] = {
    "files": FILES_LIMITS_DEFAULT,
    "clickup": CLICKUP_LIMITS_DEFAULT,
}


def getDefaults(kind: str) -> Dict[str, int]:
    """Return a fresh copy of the default budget for the given walker kind.

    `kind` is either "files" (Sharepoint, kDrive, gDrive) or "clickup".
    Returning a copy lets callers mutate the result safely.

    Raises:
        ValueError: if `kind` is not a known walker kind.
    """
    defaults = _LIMITS_BY_KIND.get(kind)
    if defaults is None:
        raise ValueError(f"Unknown RAG limit kind: {kind!r}")
    return dict(defaults)


def getStoredOverrides(dataSource: Optional[Dict[str, Any]], kind: str) -> Dict[str, int]:
    """Return ONLY the limits explicitly set on `dataSource.settings.ragLimits`.

    Missing keys are NOT filled with defaults — that is the caller's job (so
    a programmatically supplied `limits=` from a Caller still wins when the
    DataSource has no override). Pure read, no DB writes.

    Only keys that exist in the default budget for `kind` are returned; an
    unknown `kind` therefore yields an empty dict. Values are coerced with
    `int()`; `None` values are skipped silently, and values that cannot be
    converted are skipped with a warning.

    Returns:
        A new dict with the valid stored overrides, or `{}` when
        `dataSource`, its `settings` or its `ragLimits` is missing or not a
        dict.
    """
    if not isinstance(dataSource, dict):
        return {}
    settings = dataSource.get("settings") or {}
    if not isinstance(settings, dict):
        return {}
    stored = settings.get("ragLimits")
    if not isinstance(stored, dict):
        return {}

    allowed = set(_LIMITS_BY_KIND.get(kind, {}).keys())
    out: Dict[str, int] = {}
    for key, raw in stored.items():
        if key not in allowed or raw is None:
            continue
        try:
            out[key] = int(raw)
        except (TypeError, ValueError, OverflowError):
            # OverflowError: int(float("inf")) — a stored Infinity must be
            # skipped like any other non-int value instead of crashing the
            # whole resolution.
            logger.warning(
                "Ignoring non-int ragLimits[%s]=%r on DataSource %s",
                key,
                raw,
                dataSource.get("id"),
            )
    return out


def getRagLimits(dataSource: Optional[Dict[str, Any]], kind: str) -> Dict[str, int]:
    """Effective RAG limits for the API/cost-estimate use-case.

    Stored overrides win over `getDefaults(kind)`. Walkers should NOT use this
    function — they should pass their own caller-limits as the fallback so
    that a runtime-supplied `limits=` parameter is honoured (see
    `getStoredOverrides`).

    Raises:
        ValueError: if `kind` is not a known walker kind.
    """
    base = getDefaults(kind)
    base.update(getStoredOverrides(dataSource, kind))
    return base