All checks were successful
Deploy LLM Service / deploy (push) Successful in 22s
Co-authored-by: Cursor <cursoragent@cursor.com>
219 lines
7.5 KiB
Python
219 lines
7.5 KiB
Python
# Copyright (c) 2026 PowerOn AG
|
|
# All rights reserved.
|
|
"""OpenAI-compatible routes (/v1/models, /v1/chat/completions, /v1/embeddings)."""
|
|
|
|
import time
|
|
import uuid
|
|
import logging
|
|
from typing import List, Optional
|
|
|
|
import httpx
|
|
from fastapi import APIRouter, HTTPException, Depends, Header
|
|
|
|
from config import (
|
|
CONFIG, MODEL_MAPPING,
|
|
rateLimiter,
|
|
_isVisionModel, _getInternalModelName, _messagesToPrompt,
|
|
_verifyCursorApiKey,
|
|
OpenAiChatCompletionRequest, OpenAiChatCompletionResponse,
|
|
OpenAiChatCompletionChoice, OpenAiChatCompletionUsage,
|
|
OpenAiChatMessage, OpenAiModelInfo, OpenAiModelsResponse,
|
|
OpenAiEmbeddingRequest, OpenAiEmbeddingResponse,
|
|
OpenAiEmbeddingData, OpenAiEmbeddingUsage,
|
|
)
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
router = APIRouter(tags=["OpenAI Compatible"])
|
|
|
|
|
|
@router.get("/v1/models", response_model=OpenAiModelsResponse)
async def _listOpenAiModels(cursorApiKey: str = Depends(_verifyCursorApiKey)):
    """List every externally visible model name in OpenAI format.

    One entry is produced per key of ``MODEL_MAPPING``. All entries share the
    same ``created`` timestamp, taken once when the request is handled.

    Args:
        cursorApiKey: Value produced by the ``_verifyCursorApiKey``
            dependency; not used beyond running that dependency.

    Returns:
        OpenAiModelsResponse wrapping the list of model entries.
    """
    # Capture the clock once so every entry reports an identical timestamp.
    timestamp = int(time.time())
    entries = [
        OpenAiModelInfo(id=publicName, created=timestamp)
        for publicName in MODEL_MAPPING
    ]
    return OpenAiModelsResponse(data=entries)
|
|
|
|
|
|
@router.post(
    "/v1/chat/completions",
    response_model=OpenAiChatCompletionResponse,
)
async def _openAiChatCompletions(
    request: OpenAiChatCompletionRequest,
    cursorApiKey: str = Depends(_verifyCursorApiKey)
):
    """OpenAI-compatible chat completions endpoint for Cursor.

    Flattens the chat messages into a single prompt with
    ``_messagesToPrompt``, posts it to Ollama's ``/api/generate`` with
    streaming disabled, and wraps the reply in an OpenAI-style response.

    Args:
        request: Parsed chat completion request body.
        cursorApiKey: Value produced by the ``_verifyCursorApiKey``
            dependency; also used to build the rate-limit key.

    Returns:
        OpenAiChatCompletionResponse with one assistant choice and token
        usage taken from Ollama's ``prompt_eval_count`` / ``eval_count``.

    Raises:
        HTTPException: 400 for streaming requests, empty prompts or vision
            models; 429 when rate limited; 404 when Ollama reports the model
            missing; the upstream status code for other non-200 replies;
            504 on upstream timeout; 503 when Ollama is unreachable; 500 for
            any other unexpected failure.
    """
    if request.stream:
        raise HTTPException(
            status_code=400,
            detail="Streaming is not supported by this endpoint"
        )

    allowed, info = rateLimiter.isAllowed(f"cursor:{cursorApiKey}")
    if not allowed:
        raise HTTPException(
            status_code=429,
            detail={
                "error": "Rate limit exceeded",
                "message": f"Too many requests. Please retry after {info['retryAfter']} seconds.",
                "retryAfter": info["retryAfter"],
                "limit": info["limit"],
                "remaining": info["remaining"]
            },
            headers={
                "Retry-After": str(int(info["retryAfter"])),
                "X-RateLimit-Limit": str(info["limit"]),
                "X-RateLimit-Remaining": str(info["remaining"]),
                "X-RateLimit-Reset": str(info["resetSeconds"])
            }
        )

    promptText = _messagesToPrompt(request.messages).strip()
    if not promptText:
        raise HTTPException(status_code=400, detail="messages must contain text content")

    internalModelName = _getInternalModelName(request.model)
    if _isVisionModel(internalModelName):
        raise HTTPException(
            status_code=400,
            detail="Vision models are not supported on /v1/chat/completions"
        )

    # Context window is fixed; sampling options are forwarded only when the
    # caller supplied them.
    requestOptions = {
        "num_ctx": 8192
    }
    if request.temperature is not None:
        requestOptions["temperature"] = request.temperature
    if request.maxTokens is not None:
        requestOptions["num_predict"] = request.maxTokens

    requestBody = {
        "model": internalModelName,
        "prompt": promptText,
        "stream": False,
        "options": requestOptions
    }

    try:
        async with httpx.AsyncClient(timeout=3600.0) as client:
            response = await client.post(
                f"{CONFIG['ollamaUrl']}/api/generate",
                json=requestBody
            )

        if response.status_code == 404:
            raise HTTPException(
                status_code=404,
                detail=f'Model "{request.model}" not found'
            )
        if response.status_code != 200:
            raise HTTPException(
                status_code=response.status_code,
                detail=f"Ollama API error: {response.status_code} - {response.text[:200]}"
            )

        responseData = response.json()
        # `or` fallbacks also cover keys that are present but null, which
        # would otherwise crash on .strip() / int(None).
        responseText = (responseData.get("response") or "").strip()
        promptEvalCount = int(responseData.get("prompt_eval_count") or 0)
        evalCount = int(responseData.get("eval_count") or 0)

        return OpenAiChatCompletionResponse(
            id=f"chatcmpl-{uuid.uuid4().hex}",
            created=int(time.time()),
            model=request.model,
            choices=[
                OpenAiChatCompletionChoice(
                    index=0,
                    message=OpenAiChatMessage(role="assistant", content=responseText)
                )
            ],
            usage=OpenAiChatCompletionUsage(
                promptTokens=promptEvalCount,
                completionTokens=evalCount,
                totalTokens=promptEvalCount + evalCount
            )
        )

    except httpx.TimeoutException:
        raise HTTPException(status_code=504, detail="Upstream timeout (Ollama)")
    except httpx.ConnectError:
        raise HTTPException(status_code=503, detail="Cannot connect to Ollama upstream")
    except HTTPException:
        # Deliberate HTTP errors raised above must reach the client unchanged.
        raise
    except Exception as e:
        # Same catch-all as the embeddings endpoint: other transport errors or
        # an unparsable upstream body become a logged 500 instead of an
        # unhandled exception.
        logger.exception("Error in chat completions endpoint: %s", e)
        raise HTTPException(status_code=500, detail=f"Chat completion error: {str(e)}") from e
|
|
|
|
|
|
@router.post("/v1/embeddings", response_model=OpenAiEmbeddingResponse)
async def _openAiEmbeddings(
    request: OpenAiEmbeddingRequest,
    xApiKey: Optional[str] = Header(None, alias="X-API-Key"),
):
    """OpenAI-compatible embeddings endpoint. Proxies to Ollama /api/embed.

    Args:
        request: Parsed embedding request; ``input`` may be a single value or
            a list and is always sent to Ollama as a list.
        xApiKey: Optional ``X-API-Key`` header. When present it is checked
            against ``CONFIG["apiKey"]`` (if configured) and used as the
            rate-limit key.

    Returns:
        OpenAiEmbeddingResponse with one entry per embedding returned by
        Ollama, indexed in the order received.

    Raises:
        HTTPException: 401 for a wrong API key; 429 when rate limited; 404
            when Ollama reports the model missing; the upstream status code
            for other non-200 replies; 504 on upstream timeout; 503 when
            Ollama is unreachable; 500 for any other unexpected failure.
    """
    # NOTE(review): requests without an X-API-Key header skip both the key
    # check and rate limiting — confirm that anonymous access is intended.
    if xApiKey:
        if CONFIG["apiKey"] and xApiKey != CONFIG["apiKey"]:
            raise HTTPException(status_code=401, detail="Invalid API key")
        allowed, info = rateLimiter.isAllowed(xApiKey)
        if not allowed:
            raise HTTPException(
                status_code=429,
                detail=f"Rate limit exceeded. Retry after {info['retryAfter']} seconds.",
                headers={"Retry-After": str(int(info["retryAfter"]) + 1)},
            )

    internalModelName = _getInternalModelName(request.model)

    # Normalise a single input to a one-element list for the upstream payload.
    texts = request.input if isinstance(request.input, list) else [request.input]

    ollamaPayload = {"model": internalModelName, "input": texts}

    try:
        async with httpx.AsyncClient(timeout=120.0) as client:
            response = await client.post(
                f"{CONFIG['ollamaUrl']}/api/embed",
                json=ollamaPayload,
            )

        if response.status_code == 404:
            raise HTTPException(
                status_code=404,
                detail=f'Model "{internalModelName}" not found. Install with: ollama pull {internalModelName}',
            )
        if response.status_code != 200:
            raise HTTPException(
                status_code=response.status_code,
                detail=f"Ollama API error: {response.status_code} - {response.text[:200]}",
            )

        responseData = response.json()
        # `or` fallbacks also cover keys that are present but null.
        rawEmbeddings = responseData.get("embeddings") or []
        totalTokens = int(responseData.get("prompt_eval_count") or 0)

        embeddingData = [
            OpenAiEmbeddingData(embedding=vec, index=i)
            for i, vec in enumerate(rawEmbeddings)
        ]

        return OpenAiEmbeddingResponse(
            data=embeddingData,
            model=request.model,
            usage=OpenAiEmbeddingUsage(
                promptTokens=totalTokens,
                totalTokens=totalTokens,
            ),
        )

    except httpx.TimeoutException:
        raise HTTPException(status_code=504, detail="Upstream timeout (Ollama)")
    except httpx.ConnectError:
        raise HTTPException(status_code=503, detail="Cannot connect to Ollama upstream")
    except HTTPException:
        # Deliberate HTTP errors raised above must reach the client unchanged.
        raise
    except Exception as e:
        # logger.exception records the traceback, which logger.error dropped.
        logger.exception("Error in embedding endpoint: %s", e)
        raise HTTPException(status_code=500, detail=f"Embedding error: {str(e)}") from e
|