platform-core/modules/serviceCenter/services/serviceKnowledge/ragLimits.py
ValueOn AG bc7c6fe27c
Some checks failed
Deploy Plattform-Core (Int) / test (push) Failing after 13s
Deploy Plattform-Core (Int) / deploy (push) Has been skipped
elimination of technical issues (imports)
2026-06-06 00:32:45 +02:00

107 lines
3.6 KiB
Python

# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
"""Centralized RAG bootstrap limits + DataSource-scoped resolution.

The original walkers (SharePoint, kDrive, gDrive, ClickUp) each carried their
own module-level `MAX_*_DEFAULT` constants and silently stopped indexing once
they were exceeded. That made it impossible for a user with a 500 MB folder to
override the 200 MB cap without a code change.

This module is the single source of truth for two things:

1. The canonical default budget per source kind (`FILES_LIMITS_DEFAULT`,
   `CLICKUP_LIMITS_DEFAULT`). Walkers fall back to these when a DataSource has
   no `settings.ragLimits` yet.
2. The pure read/lazy-fill helpers that walkers and the API use to merge a
   DataSource's stored settings with the defaults. No override layers, no
   resolver chain: what is in `DataSource.settings.ragLimits` is what the
   walker uses.

Lazy fill: the first time a DataSource is processed, the defaults are written
to its `settings.ragLimits` so the UI shows real values immediately, even if
the user has never opened the settings modal.
"""
from __future__ import annotations
import logging
from typing import Any, Dict, Optional
logger = logging.getLogger(__name__)
# Default budget for the "files" walker kind.
FILES_LIMITS_DEFAULT: Dict[str, int] = {
    "maxItems": 500,
    "maxBytes": 200 * 1024 ** 2,  # 200 MiB
    "maxFileSize": 25 * 1024 ** 2,  # 25 MiB
    "maxDepth": 4,
}

# Default budget for the "clickup" walker kind.
CLICKUP_LIMITS_DEFAULT: Dict[str, int] = {
    "maxTasks": 500,
    "maxWorkspaces": 3,
    "maxListsPerWorkspace": 20,
}

# Lookup table from walker kind to its default budget. The values are the
# very same dict objects as the constants above (not copies), so readers must
# copy before mutating.
_LIMITS_BY_KIND: Dict[str, Dict[str, int]] = {
    "files": FILES_LIMITS_DEFAULT,
    "clickup": CLICKUP_LIMITS_DEFAULT,
}
def getDefaults(kind: str) -> Dict[str, int]:
    """Hand out an independent copy of the default budget for a walker kind.

    `kind` must be one of the keys registered in `_LIMITS_BY_KIND`
    ("files" or "clickup"); anything else raises `ValueError`.
    Because the result is a copy, callers may modify it without touching the
    module-level defaults.
    """
    if kind not in _LIMITS_BY_KIND:
        raise ValueError(f"Unknown RAG limit kind: {kind!r}")
    return _LIMITS_BY_KIND[kind].copy()
def getStoredOverrides(dataSource: Optional[Dict[str, Any]], kind: str) -> Dict[str, int]:
    """Return ONLY the limits explicitly set on `dataSource.settings.ragLimits`.

    Missing keys are NOT filled with defaults — that is the caller's job (so
    a programmatically supplied `limits=` from a caller still wins when the
    DataSource has no override). Pure read: the DataSource dict is only
    inspected, never modified.

    `dataSource` is expected to be a dict; anything else (including None), a
    non-dict `settings`, or a non-dict `settings["ragLimits"]` yields `{}`.
    `kind` selects which keys are accepted: only keys present in the default
    budget registered for that kind are kept, so an unknown kind yields `{}`.

    Returns a new dict mapping each accepted key to `int(value)`. Entries
    whose value is None are skipped silently; entries whose value cannot be
    converted to an int are skipped with a warning.
    """
    if not isinstance(dataSource, dict):
        return {}
    settings = dataSource.get("settings") or {}
    if not isinstance(settings, dict):
        return {}
    stored = settings.get("ragLimits")
    if not isinstance(stored, dict):
        return {}

    # Membership is tested directly against the defaults dict for this kind;
    # an unknown kind maps to an empty dict and therefore accepts nothing.
    allowed = _LIMITS_BY_KIND.get(kind, {})
    out: Dict[str, int] = {}
    for key, raw in stored.items():
        if key not in allowed or raw is None:
            continue
        try:
            out[key] = int(raw)
        except (TypeError, ValueError, OverflowError):
            # OverflowError covers int(float("inf")): an infinite stored value
            # must be ignored like any other non-int instead of crashing the
            # whole lookup.
            logger.warning(
                "Ignoring non-int ragLimits[%s]=%r on DataSource %s",
                key, raw, dataSource.get("id"),
            )
    return out
def getRagLimits(dataSource: Optional[Dict[str, Any]], kind: str) -> Dict[str, int]:
    """Compute the effective RAG limits for the API/cost-estimate use-case.

    The result starts from `getDefaults(kind)` and every value returned by
    `getStoredOverrides(dataSource, kind)` replaces the matching default.
    Walkers should NOT call this function: they are expected to use their own
    caller-limits as the fallback so that a runtime-supplied `limits=`
    parameter is honoured (see `getStoredOverrides`).
    """
    # Defaults are evaluated first, so an unknown kind raises before the
    # DataSource is inspected; later entries win on key collisions.
    return {**getDefaults(kind), **getStoredOverrides(dataSource, kind)}