# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
"""Centralized RAG bootstrap limits + DataSource-scoped resolution.
|
|
|
|
The original walkers (SharePoint, kDrive, gDrive, ClickUp) each carried their
|
|
own module-level `MAX_*_DEFAULT` constants and silently stopped indexing once
|
|
they were exceeded. That made it impossible for a user with a 500 MB folder to
|
|
override the 200 MB cap without a code change.
|
|
|
|
This module is the single source of truth for two things:
|
|
|
|
1. The canonical default budget per source kind (`FILES_LIMITS_DEFAULT`,
|
|
`CLICKUP_LIMITS_DEFAULT`). Walkers fall back to these when a DataSource has
|
|
no `settings.ragLimits` yet.
|
|
|
|
2. The pure read/lazy-fill helpers that walkers and the API use to merge a
|
|
DataSource's stored settings with the defaults. No override layers, no
|
|
resolver chain: what is in `DataSource.settings.ragLimits` is what the
|
|
walker uses.
|
|
|
|
Lazy fill: the first time a DataSource is processed, the defaults are written
|
|
to its `settings.ragLimits` so the UI shows real values immediately, even if
|
|
the user has never opened the settings modal.
|
|
"""
|
|
|
|
from __future__ import annotations

import logging
from typing import Any, Dict, Optional


logger = logging.getLogger(__name__)


# Default budget for the file-based walkers (SharePoint, kDrive, gDrive).
# Byte values are spelled out as products so the unit stays readable.
FILES_LIMITS_DEFAULT: Dict[str, int] = {
    "maxItems": 500,
    "maxBytes": 200 * 1024 * 1024,  # 200 MiB
    "maxFileSize": 25 * 1024 * 1024,  # 25 MiB
    "maxDepth": 4,
}


# Default budget for the ClickUp walker.
CLICKUP_LIMITS_DEFAULT: Dict[str, int] = {
    "maxTasks": 500,
    "maxWorkspaces": 3,
    "maxListsPerWorkspace": 20,
}


# Walker kind -> default budget. The values are the very same dict objects as
# the constants above (not copies), so readers must copy before mutating.
_LIMITS_BY_KIND: Dict[str, Dict[str, int]] = {
    "files": FILES_LIMITS_DEFAULT,
    "clickup": CLICKUP_LIMITS_DEFAULT,
}


def getDefaults(kind: str) -> Dict[str, int]:
    """Return a fresh copy of the default budget for the given walker kind.

    `kind` is either "files" (SharePoint, kDrive, gDrive) or "clickup".
    Returning a copy lets callers mutate the result safely.

    Args:
        kind: Key into `_LIMITS_BY_KIND`.

    Returns:
        A new shallow-copied dict of the defaults registered for `kind`.

    Raises:
        ValueError: If `kind` is not a registered walker kind.
    """
    defaults = _LIMITS_BY_KIND.get(kind)
    if defaults is None:
        raise ValueError(f"Unknown RAG limit kind: {kind!r}")
    # Copy so a caller's mutation never leaks into the module-level defaults.
    return dict(defaults)


def getStoredOverrides(dataSource: Optional[Dict[str, Any]], kind: str) -> Dict[str, int]:
    """Return ONLY the limits explicitly set on `dataSource.settings.ragLimits`.

    Missing keys are NOT filled with defaults — that is the caller's job (so
    a programmatically supplied `limits=` from a caller still wins when the
    DataSource has no override). Pure read, no DB writes.

    Args:
        dataSource: DataSource record as a dict, or None. Anything that is not
            a dict yields an empty result.
        kind: Walker kind used to look up the allowed keys in
            `_LIMITS_BY_KIND`. An unknown kind allows no keys, so the result
            is empty (no exception is raised here).

    Returns:
        A new dict holding only the known keys whose stored value is not None
        and can be converted with `int()`. Values that cannot be converted are
        logged at warning level and skipped.
    """
    if not isinstance(dataSource, dict):
        return {}
    # `or {}` also covers an explicit `"settings": None`.
    settings = dataSource.get("settings") or {}
    if not isinstance(settings, dict):
        return {}
    stored = settings.get("ragLimits")
    if not isinstance(stored, dict):
        return {}

    allowed = set(_LIMITS_BY_KIND.get(kind, {}).keys())
    out: Dict[str, int] = {}
    for key, raw in stored.items():
        if key not in allowed or raw is None:
            continue
        try:
            out[key] = int(raw)
        except (TypeError, ValueError, OverflowError):
            # OverflowError: int(float("inf")) raises it rather than
            # ValueError; treat it like any other unusable stored value
            # instead of letting it propagate to the caller.
            logger.warning(
                "Ignoring non-int ragLimits[%s]=%r on DataSource %s",
                key, raw, dataSource.get("id"),
            )
    return out


def getRagLimits(dataSource: Optional[Dict[str, Any]], kind: str) -> Dict[str, int]:
    """Effective RAG limits for the API/cost-estimate use-case.

    Stored overrides win over `getDefaults(kind)`. Walkers should NOT use this
    function — they should pass their own caller-limits as the fallback so that
    a runtime-supplied `limits=` parameter is honoured (see `getStoredOverrides`).

    Args:
        dataSource: DataSource record as a dict, or None.
        kind: Walker kind, forwarded to `getDefaults` and `getStoredOverrides`.

    Returns:
        A new dict: the defaults for `kind`, with every stored override laid
        on top.

    Raises:
        ValueError: Propagated from `getDefaults` when `kind` is unknown.
    """
    # getDefaults returns a copy, so updating it in place is safe.
    base = getDefaults(kind)
    base.update(getStoredOverrides(dataSource, kind))
    return base