88 lines
3.1 KiB
Python
88 lines
3.1 KiB
Python
# Copyright (c) 2026 PowerOn AG
|
|
# All rights reserved.
|
|
"""Indicative cost estimation for a RAG bootstrap run.
|
|
|
|
This is **not** a billing-grade forecast: it gives the user a back-of-the-envelope
|
|
CHF figure for the worst-case full sync, so they can sanity-check before raising
|
|
`maxBytes`/`maxItems`. The output always carries the underlying assumptions
|
|
(`basis`) so the user can judge plausibility.
|
|
|
|
Heuristic:
|
|
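estimatedTokens = ceil(maxBytes * EXTRACTABLE_FRACTION / BYTES_PER_TOKEN_TEXT_FACTOR)   (kind "files")
estimatedTokens = maxTasks * maxWorkspaces * DEFAULT_TOKENS_PER_ITEM                    (kind "clickup")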
|
|
estimatedChf = estimatedTokens / 1_000_000 * EMBEDDING_CHF_PER_MTOKEN
|
|
|
|
Defaults match OpenAI `text-embedding-3-small` published pricing (2026-Q2);
|
|
the project convention treats provider list prices as CHF directly (see
|
|
`calculatepriceCHF` in `aicorePluginOpenai.py`), so no FX conversion applies.
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import math
|
|
from typing import Any, Dict
|
|
|
|
|
|
# Characters-per-token figure reported to the caller as the "charsPerToken"
# assumption. The files computation below divides by
# BYTES_PER_TOKEN_TEXT_FACTOR, so keep the two values in step.
CHARS_PER_TOKEN = 4
# Embedding list price per one million tokens, taken as CHF without FX.
EMBEDDING_CHF_PER_MTOKEN = 0.02
# Flat token budget assumed per item for item-count sources (ClickUp tasks).
DEFAULT_TOKENS_PER_ITEM = 1500
# Divisor that converts extractable bytes into tokens for file sources.
BYTES_PER_TOKEN_TEXT_FACTOR = 4
# Share of the raw byte budget assumed to be extractable text.
EXTRACTABLE_FRACTION = 0.4


def estimateBootstrapCost(limits: Dict[str, int], kind: str = "files") -> Dict[str, Any]:
    """Return an indicative cost estimate dict for a DataSource bootstrap.

    Args:
        limits: Configured sync limits. ``maxBytes`` is read for
            ``kind="files"``; ``maxTasks`` and ``maxWorkspaces`` are read for
            ``kind="clickup"``. Missing, ``None`` or zero values count as 0
            (``maxWorkspaces`` counts as 1). Negative values are clamped the
            same way so the estimate can never be negative. ``None`` is
            accepted and treated as an empty mapping.
        kind: ``"files"`` or ``"clickup"``. Any other value yields a zero
            estimate and says so in ``basis.assumptions.formula``.

    Returned shape::

        {
            "estimatedTokens": int,
            "estimatedChf": float,   # rounded to 4 decimals
            "basis": {
                "kind": "files"|"clickup",
                "limits": {...},     # shallow copy of the input
                "assumptions": {
                    "embeddingChfPerMToken": 0.02,
                    "charsPerToken": 4,
                    "extractableFraction": 0.4,   # only for kind="files"
                    "tokensPerItem": 1500,        # only for kind="clickup"
                    "formula": "..."
                },
                "notes": "non-binding, depends on real file content..."
            }
        }
    """
    # Tolerate a missing limits mapping instead of failing on ``.get``.
    limits = limits or {}

    assumptions: Dict[str, Any] = {
        "embeddingChfPerMToken": EMBEDDING_CHF_PER_MTOKEN,
        "charsPerToken": CHARS_PER_TOKEN,
    }

    if kind == "files":
        # Clamp at 0: a negative byte budget must not produce a negative cost.
        maxBytes = max(0, int(limits.get("maxBytes") or 0))
        extractableBytes = maxBytes * EXTRACTABLE_FRACTION
        estimatedTokens = int(math.ceil(extractableBytes / BYTES_PER_TOKEN_TEXT_FACTOR))
        assumptions["extractableFraction"] = EXTRACTABLE_FRACTION
        # Built from the constants so the reported formula cannot go stale.
        assumptions["formula"] = (
            f"ceil(maxBytes * {EXTRACTABLE_FRACTION} / {BYTES_PER_TOKEN_TEXT_FACTOR})"
        )
    elif kind == "clickup":
        # Same clamping rule as maxBytes; workspaces default to at least one.
        maxTasks = max(0, int(limits.get("maxTasks") or 0))
        maxWorkspaces = max(1, int(limits.get("maxWorkspaces") or 1))
        estimatedTokens = maxTasks * maxWorkspaces * DEFAULT_TOKENS_PER_ITEM
        assumptions["tokensPerItem"] = DEFAULT_TOKENS_PER_ITEM
        assumptions["formula"] = f"maxTasks * maxWorkspaces * {DEFAULT_TOKENS_PER_ITEM}"
    else:
        # Unknown source kinds are reported as a zero estimate, not an error.
        estimatedTokens = 0
        assumptions["formula"] = "unknown kind, returning zero"

    estimatedChf = round(estimatedTokens / 1_000_000 * EMBEDDING_CHF_PER_MTOKEN, 4)

    return {
        "estimatedTokens": estimatedTokens,
        "estimatedChf": estimatedChf,
        "basis": {
            "kind": kind,
            # Copy so later mutation of the caller's dict does not alter the report.
            "limits": dict(limits),
            "assumptions": assumptions,
            "notes": (
                "Indicative only. Actual cost depends on file types, extractable text "
                "ratio, dedup hit-rate, retries, and current embedding model pricing."
            ),
        },
    }
|