service-llm-private/routeOpenAi.py
ValueOn AG 33fe5d55bd
All checks were successful
Deploy LLM Service / deploy (push) Successful in 22s
feat: add /v1/embeddings endpoint for mxbai-embed-large 1024 dim
Co-authored-by: Cursor <cursoragent@cursor.com>
2026-06-10 22:23:21 +02:00

219 lines
7.5 KiB
Python

# Copyright (c) 2026 PowerOn AG
# All rights reserved.
"""OpenAI-compatible routes (/v1/models, /v1/chat/completions, /v1/embeddings)."""
import time
import uuid
import logging
from typing import List, Optional
import httpx
from fastapi import APIRouter, HTTPException, Depends, Header
from config import (
CONFIG, MODEL_MAPPING,
rateLimiter,
_isVisionModel, _getInternalModelName, _messagesToPrompt,
_verifyCursorApiKey,
OpenAiChatCompletionRequest, OpenAiChatCompletionResponse,
OpenAiChatCompletionChoice, OpenAiChatCompletionUsage,
OpenAiChatMessage, OpenAiModelInfo, OpenAiModelsResponse,
OpenAiEmbeddingRequest, OpenAiEmbeddingResponse,
OpenAiEmbeddingData, OpenAiEmbeddingUsage,
)
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
# Router that the OpenAI-compatible endpoints in this module register on;
# every route attached to it carries the "OpenAI Compatible" tag.
router = APIRouter(tags=["OpenAI Compatible"])
@router.get("/v1/models", response_model=OpenAiModelsResponse)
async def _listOpenAiModels(cursorApiKey: str = Depends(_verifyCursorApiKey)):
    """List the externally visible model names in OpenAI ``/v1/models`` format.

    One entry is produced per key of ``MODEL_MAPPING``. All entries share the
    same ``created`` timestamp, taken once when the request is handled.

    Args:
        cursorApiKey: Value produced by the ``_verifyCursorApiKey`` dependency;
            not used beyond running that dependency.

    Returns:
        OpenAiModelsResponse wrapping the list of model entries.
    """
    now = int(time.time())
    models = [
        OpenAiModelInfo(id=publicName, created=now)
        for publicName in MODEL_MAPPING.keys()
    ]
    return OpenAiModelsResponse(data=models)
@router.post(
    "/v1/chat/completions",
    response_model=OpenAiChatCompletionResponse,
)
async def _openAiChatCompletions(
    request: OpenAiChatCompletionRequest,
    cursorApiKey: str = Depends(_verifyCursorApiKey)
):
    """OpenAI-compatible chat completions endpoint for Cursor.

    Flattens the chat messages into a single prompt, sends it to Ollama's
    ``/api/generate`` in non-streaming mode and wraps the answer in an
    OpenAI-style chat completion response.

    Args:
        request: Parsed chat completion request body.
        cursorApiKey: Value produced by the ``_verifyCursorApiKey`` dependency;
            also used (prefixed with ``cursor:``) as the rate-limit key.

    Returns:
        OpenAiChatCompletionResponse with one assistant choice and token usage
        taken from Ollama's ``prompt_eval_count`` / ``eval_count`` fields.

    Raises:
        HTTPException: 400 for streaming requests, empty prompts or vision
            models; 429 when rate limited; 404 when Ollama does not know the
            model; the upstream status for other non-200 replies; 502 for
            transport failures or a non-JSON upstream body; 503 when Ollama
            is unreachable; 504 on upstream timeout.
    """
    if request.stream:
        raise HTTPException(
            status_code=400,
            detail="Streaming is not supported by this endpoint"
        )

    allowed, info = rateLimiter.isAllowed(f"cursor:{cursorApiKey}")
    if not allowed:
        raise HTTPException(
            status_code=429,
            detail={
                "error": "Rate limit exceeded",
                "message": f"Too many requests. Please retry after {info['retryAfter']} seconds.",
                "retryAfter": info["retryAfter"],
                "limit": info["limit"],
                "remaining": info["remaining"]
            },
            headers={
                "Retry-After": str(int(info["retryAfter"])),
                "X-RateLimit-Limit": str(info["limit"]),
                "X-RateLimit-Remaining": str(info["remaining"]),
                "X-RateLimit-Reset": str(info["resetSeconds"])
            }
        )

    promptText = _messagesToPrompt(request.messages).strip()
    if not promptText:
        raise HTTPException(status_code=400, detail="messages must contain text content")

    internalModelName = _getInternalModelName(request.model)
    if _isVisionModel(internalModelName):
        raise HTTPException(
            status_code=400,
            detail="Vision models are not supported on /v1/chat/completions"
        )

    # Optional sampling parameters are forwarded only when the client set them.
    requestOptions = {
        "num_ctx": 8192
    }
    if request.temperature is not None:
        requestOptions["temperature"] = request.temperature
    if request.maxTokens is not None:
        requestOptions["num_predict"] = request.maxTokens

    requestBody = {
        "model": internalModelName,
        "prompt": promptText,
        "stream": False,
        "options": requestOptions
    }

    # Only the network call lives in the try block. The response body is fully
    # read by a non-streaming post, so it stays usable after the client closes.
    try:
        async with httpx.AsyncClient(timeout=3600.0) as client:
            response = await client.post(
                f"{CONFIG['ollamaUrl']}/api/generate",
                json=requestBody
            )
    except httpx.TimeoutException as exc:
        raise HTTPException(status_code=504, detail="Upstream timeout (Ollama)") from exc
    except httpx.ConnectError as exc:
        raise HTTPException(status_code=503, detail="Cannot connect to Ollama upstream") from exc
    except httpx.HTTPError as exc:
        # Any other transport failure (read/write/protocol error) would
        # otherwise escape as an unhandled exception.
        logger.error("Ollama request failed in chat completions: %s", exc)
        raise HTTPException(status_code=502, detail="Ollama upstream request failed") from exc

    if response.status_code == 404:
        raise HTTPException(
            status_code=404,
            detail=f'Model "{request.model}" not found'
        )
    if response.status_code != 200:
        raise HTTPException(
            status_code=response.status_code,
            detail=f"Ollama API error: {response.status_code} - {response.text[:200]}"
        )

    try:
        responseData = response.json()
    except ValueError as exc:
        logger.error("Ollama returned a non-JSON body in chat completions: %s", exc)
        raise HTTPException(
            status_code=502,
            detail="Invalid JSON response from Ollama upstream"
        ) from exc

    # ``or`` also covers explicit nulls, which ``.get(key, default)`` passes through.
    responseText = (responseData.get("response") or "").strip()
    promptEvalCount = int(responseData.get("prompt_eval_count") or 0)
    evalCount = int(responseData.get("eval_count") or 0)

    return OpenAiChatCompletionResponse(
        id=f"chatcmpl-{uuid.uuid4().hex}",
        created=int(time.time()),
        model=request.model,
        choices=[
            OpenAiChatCompletionChoice(
                index=0,
                message=OpenAiChatMessage(role="assistant", content=responseText)
            )
        ],
        usage=OpenAiChatCompletionUsage(
            promptTokens=promptEvalCount,
            completionTokens=evalCount,
            totalTokens=promptEvalCount + evalCount
        )
    )
@router.post("/v1/embeddings", response_model=OpenAiEmbeddingResponse)
async def _openAiEmbeddings(
    request: OpenAiEmbeddingRequest,
    xApiKey: Optional[str] = Header(None, alias="X-API-Key"),
):
    """OpenAI-compatible embeddings endpoint. Proxies to Ollama /api/embed.

    Args:
        request: Parsed embeddings request; ``input`` may be a single value or
            a list and is always forwarded to Ollama as a list.
        xApiKey: Value of the ``X-API-Key`` header, if sent.

    Returns:
        OpenAiEmbeddingResponse with one entry per returned vector, indexed in
        upstream order, and usage derived from Ollama's ``prompt_eval_count``.

    Raises:
        HTTPException: 401 when an API key is configured and the header is
            missing or does not match; 429 when rate limited; 404 when Ollama
            does not know the model; the upstream status for other non-200
            replies; 502 when the number of returned vectors does not match
            the number of inputs; 503 when Ollama is unreachable; 504 on
            upstream timeout; 500 for any other failure.
    """
    # When a key is configured it is mandatory. Checking only if the header is
    # present would let callers bypass authentication by omitting it.
    expectedApiKey = CONFIG["apiKey"]
    if expectedApiKey and xApiKey != expectedApiKey:
        raise HTTPException(status_code=401, detail="Invalid API key")

    # The rate limiter is keyed by the caller's key, so it applies only when
    # one was sent (i.e. always, once a key is configured).
    if xApiKey:
        allowed, info = rateLimiter.isAllowed(xApiKey)
        if not allowed:
            raise HTTPException(
                status_code=429,
                detail=f"Rate limit exceeded. Retry after {info['retryAfter']} seconds.",
                headers={"Retry-After": str(int(info["retryAfter"]) + 1)},
            )

    internalModelName = _getInternalModelName(request.model)
    texts = request.input if isinstance(request.input, list) else [request.input]
    ollamaPayload = {"model": internalModelName, "input": texts}

    try:
        async with httpx.AsyncClient(timeout=120.0) as client:
            response = await client.post(
                f"{CONFIG['ollamaUrl']}/api/embed",
                json=ollamaPayload,
            )

            if response.status_code == 404:
                raise HTTPException(
                    status_code=404,
                    detail=f'Model "{internalModelName}" not found. Install with: ollama pull {internalModelName}',
                )
            if response.status_code != 200:
                raise HTTPException(
                    status_code=response.status_code,
                    detail=f"Ollama API error: {response.status_code} - {response.text[:200]}",
                )

            responseData = response.json()
            # ``or`` also covers explicit nulls in the upstream payload.
            rawEmbeddings = responseData.get("embeddings") or []
            totalTokens = responseData.get("prompt_eval_count") or 0

            # Clients pair vectors with inputs by index; a short or empty
            # list must not be passed off as a successful result.
            if len(rawEmbeddings) != len(texts):
                raise HTTPException(
                    status_code=502,
                    detail=(
                        f"Ollama returned {len(rawEmbeddings)} embeddings "
                        f"for {len(texts)} inputs"
                    ),
                )

            embeddingData = [
                OpenAiEmbeddingData(embedding=vec, index=i)
                for i, vec in enumerate(rawEmbeddings)
            ]

            return OpenAiEmbeddingResponse(
                data=embeddingData,
                model=request.model,
                usage=OpenAiEmbeddingUsage(
                    promptTokens=totalTokens,
                    totalTokens=totalTokens,
                ),
            )

    except httpx.TimeoutException as e:
        raise HTTPException(status_code=504, detail="Upstream timeout (Ollama)") from e
    except httpx.ConnectError as e:
        raise HTTPException(status_code=503, detail="Cannot connect to Ollama upstream") from e
    except HTTPException:
        # Deliberate HTTP errors raised above must not be rewrapped as 500.
        raise
    except Exception as e:
        # logger.exception keeps the traceback, which a plain error log drops.
        logger.exception("Error in embedding endpoint: %s", e)
        raise HTTPException(status_code=500, detail=f"Embedding error: {str(e)}") from e