# platform-core/modules/serviceCenter/services/serviceKnowledge/costEstimate.py

# Copyright (c) 2026 PowerOn AG
# All rights reserved.
"""Indicative cost estimation for a RAG bootstrap run.

This is **not** a billing-grade forecast: it gives the user a back-of-the-envelope
CHF figure for the worst-case full sync, so they can sanity-check before raising
`maxBytes`/`maxItems`. The output always carries the underlying assumptions
(`basis`) so the user can judge plausibility.

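Heuristic:
    kind == "files":
        estimatedTokens = ceil(maxBytes * EXTRACTABLE_FRACTION / BYTES_PER_TOKEN_TEXT_FACTOR)
    kind == "clickup":
        estimatedTokens = maxTasks * maxWorkspaces * DEFAULT_TOKENS_PER_ITEM
    estimatedChf = estimatedTokens / 1_000_000 * EMBEDDING_CHF_PER_MTOKEN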

Defaults match OpenAI `text-embedding-3-small` published pricing (2026-Q2);
the project convention treats provider list prices as CHF directly (see
`calculatepriceCHF` in `aicorePluginOpenai.py`), so no FX conversion applies.
"""

from __future__ import annotations

import math
from typing import Any, Dict


# Heuristic assumptions behind the estimate. The values that drive a formula
# are echoed back to the caller under ``basis.assumptions``.

# Average characters per token; only reported back, not used in the arithmetic.
CHARS_PER_TOKEN = 4
# Embedding list price per one million tokens, taken as CHF without FX conversion.
EMBEDDING_CHF_PER_MTOKEN = 0.02
# Assumed token count of a single item for item-count based sources (clickup).
DEFAULT_TOKENS_PER_ITEM = 1500
# Bytes of extracted text assumed to yield one token (drives the "files" formula).
BYTES_PER_TOKEN_TEXT_FACTOR = 4
# Share of the raw byte budget assumed to be extractable text.
EXTRACTABLE_FRACTION = 0.4


def _nonNegativeInt(value: Any) -> int:
    """Coerce a limit value to ``int``; None/empty counts as 0, negatives clamp to 0."""
    return max(0, int(value or 0))


def estimateBootstrapCost(limits: Dict[str, int], kind: str = "files") -> Dict[str, Any]:
    """Return an indicative cost estimate dict for a DataSource bootstrap.

    Args:
        limits: Configured sync limits. ``kind="files"`` reads ``maxBytes``;
            ``kind="clickup"`` reads ``maxTasks`` and ``maxWorkspaces``.
            Missing, ``None`` or zero values count as 0 (``maxWorkspaces``
            is at least 1). Negative values are clamped to 0 so the estimate
            can never be negative. ``None`` is tolerated and treated as ``{}``.
        kind: Source kind. Any value other than ``"files"`` or ``"clickup"``
            yields a zero estimate instead of raising.

    Returns:
        A dict of the following shape::

            {
              "estimatedTokens": int,
              "estimatedChf": float,    # rounded to 4 decimals
              "basis": {
                "kind": "files"|"clickup",
                "limits": {...},        # shallow copy of the input limits
                "assumptions": {
                  "embeddingChfPerMToken": 0.02,
                  "charsPerToken": 4,
                  "extractableFraction": 0.4,   # only for kind="files"
                  "tokensPerItem": 1500,        # only for kind="clickup"
                  "formula": "..."
                },
                "notes": "non-binding, depends on real file content..."
              }
            }

    Raises:
        ValueError, TypeError: If a limit value that is read cannot be
            converted with ``int()``.
    """
    limits = limits or {}

    assumptions: Dict[str, Any] = {
        "embeddingChfPerMToken": EMBEDDING_CHF_PER_MTOKEN,
        "charsPerToken": CHARS_PER_TOKEN,
    }

    if kind == "files":
        maxBytes = _nonNegativeInt(limits.get("maxBytes"))
        extractableBytes = maxBytes * EXTRACTABLE_FRACTION
        estimatedTokens = int(math.ceil(extractableBytes / BYTES_PER_TOKEN_TEXT_FACTOR))
        assumptions["extractableFraction"] = EXTRACTABLE_FRACTION
        # Built from the constants so the reported formula cannot drift from
        # the arithmetic above when an assumption is tuned.
        assumptions["formula"] = (
            f"ceil(maxBytes * {EXTRACTABLE_FRACTION} / {BYTES_PER_TOKEN_TEXT_FACTOR})"
        )
    elif kind == "clickup":
        maxTasks = _nonNegativeInt(limits.get("maxTasks"))
        # An unset or non-positive workspace count still means one workspace.
        maxWorkspaces = max(1, _nonNegativeInt(limits.get("maxWorkspaces")))
        estimatedTokens = maxTasks * maxWorkspaces * DEFAULT_TOKENS_PER_ITEM
        assumptions["tokensPerItem"] = DEFAULT_TOKENS_PER_ITEM
        assumptions["formula"] = f"maxTasks * maxWorkspaces * {DEFAULT_TOKENS_PER_ITEM}"
    else:
        # Deliberately lenient: an unknown kind reports zero rather than failing.
        estimatedTokens = 0
        assumptions["formula"] = "unknown kind, returning zero"

    # Very small estimates round to 0.0 at four decimals.
    estimatedChf = round(estimatedTokens / 1_000_000 * EMBEDDING_CHF_PER_MTOKEN, 4)

    return {
        "estimatedTokens": estimatedTokens,
        "estimatedChf": estimatedChf,
        "basis": {
            "kind": kind,
            "limits": dict(limits),
            "assumptions": assumptions,
            "notes": (
                "Indicative only. Actual cost depends on file types, extractable text "
                "ratio, dedup hit-rate, retries, and current embedding model pricing."
            ),
        },
    }