# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
"""Indicative cost estimation for a RAG bootstrap run.

This is **not** a billing-grade forecast: it gives the user a
back-of-the-envelope CHF figure for the worst-case full sync, so they can
sanity-check before raising `maxBytes`/`maxItems`. The output always carries
the underlying assumptions (`basis`) so the user can judge plausibility.

Heuristic:

    files:    estimatedTokens = ceil(maxBytes * EXTRACTABLE_FRACTION
                                     / BYTES_PER_TOKEN_TEXT_FACTOR)
    clickup:  estimatedTokens = maxTasks * maxWorkspaces * DEFAULT_TOKENS_PER_ITEM

    estimatedChf = estimatedTokens / 1_000_000 * EMBEDDING_CHF_PER_MTOKEN

Defaults match OpenAI `text-embedding-3-small` published pricing (2026-Q2);
the project convention treats provider list prices as CHF directly (see
`calculatepriceCHF` in `aicorePluginOpenai.py`), so no FX conversion applies.
"""

from __future__ import annotations

import math
from typing import Any, Dict

CHARS_PER_TOKEN = 4
EMBEDDING_CHF_PER_MTOKEN = 0.02
DEFAULT_TOKENS_PER_ITEM = 1500
BYTES_PER_TOKEN_TEXT_FACTOR = 4
EXTRACTABLE_FRACTION = 0.4


def estimateBootstrapCost(limits: Dict[str, int], kind: str = "files") -> Dict[str, Any]:
    """Return an indicative cost estimate dict for a DataSource bootstrap.

    Args:
        limits: Configured sync limits. ``kind == "files"`` reads ``maxBytes``;
            ``kind == "clickup"`` reads ``maxTasks`` and ``maxWorkspaces``.
            Missing or falsy values count as 0 (``maxWorkspaces``: 1).
            Negative values are clamped to 0 so the estimate is never
            negative.
        kind: ``"files"`` or ``"clickup"``. Any other value yields a zero
            estimate with an explanatory ``formula`` entry.

    Returns:
        A dict of the following shape::

            {
                "estimatedTokens": int,
                "estimatedChf": float,          # rounded to 4 decimals
                "basis": {
                    "kind": <kind as passed in>,
                    "limits": {...},            # shallow copy of ``limits``
                    "assumptions": {
                        "embeddingChfPerMToken": 0.02,
                        "charsPerToken": 4,
                        "extractableFraction": 0.4,  # only for kind="files"
                        "tokensPerItem": 1500,       # only for kind="clickup"
                        "formula": str,
                    },
                    "notes": "Indicative only. ...",
                },
            }
    """
    # NOTE: "charsPerToken" is reported from CHARS_PER_TOKEN, while the files
    # branch divides by BYTES_PER_TOKEN_TEXT_FACTOR. Both are 4 today; keep
    # them in sync if either is changed.
    assumptions: Dict[str, Any] = {
        "embeddingChfPerMToken": EMBEDDING_CHF_PER_MTOKEN,
        "charsPerToken": CHARS_PER_TOKEN,
    }

    if kind == "files":
        # Clamp to zero: a negative byte budget must not produce a negative cost.
        maxBytes = max(0, int(limits.get("maxBytes") or 0))
        extractableBytes = maxBytes * EXTRACTABLE_FRACTION
        estimatedTokens = int(math.ceil(extractableBytes / BYTES_PER_TOKEN_TEXT_FACTOR))
        assumptions["extractableFraction"] = EXTRACTABLE_FRACTION
        # Derived from the constants so the reported formula cannot drift
        # away from the numbers actually used above.
        assumptions["formula"] = (
            f"ceil(maxBytes * {EXTRACTABLE_FRACTION} / {BYTES_PER_TOKEN_TEXT_FACTOR})"
        )
    elif kind == "clickup":
        # Clamp to zero for the same reason as maxBytes; the workspace count
        # is floored at 1 so a missing/zero value does not zero the estimate.
        maxTasks = max(0, int(limits.get("maxTasks") or 0))
        maxWorkspaces = max(1, int(limits.get("maxWorkspaces") or 1))
        estimatedTokens = maxTasks * maxWorkspaces * DEFAULT_TOKENS_PER_ITEM
        assumptions["tokensPerItem"] = DEFAULT_TOKENS_PER_ITEM
        assumptions["formula"] = f"maxTasks * maxWorkspaces * {DEFAULT_TOKENS_PER_ITEM}"
    else:
        estimatedTokens = 0
        assumptions["formula"] = "unknown kind, returning zero"

    estimatedChf = round(estimatedTokens / 1_000_000 * EMBEDDING_CHF_PER_MTOKEN, 4)

    return {
        "estimatedTokens": estimatedTokens,
        "estimatedChf": estimatedChf,
        "basis": {
            "kind": kind,
            # Copy so callers mutating the result cannot alter their input.
            "limits": dict(limits),
            "assumptions": assumptions,
            "notes": (
                "Indicative only. Actual cost depends on file types, extractable text "
                "ratio, dedup hit-rate, retries, and current embedding model pricing."
            ),
        },
    }