service-llm-private/routeOpenAi.py
ValueOn AG 1f5d8e923b Refactor: extract routes and config from app.py into separate modules
Move all API routes, OpenAI-compatible routes, web UI routes, shared config, models, rate limiter, and auth logic into dedicated files (config.py, routeApi.py, routeOpenAi.py, routeWeb.py). app.py now serves as a clean entry point.

Made-with: Cursor
2026-03-30 14:49:35 +02:00

145 lines
4.8 KiB
Python

# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
"""OpenAI-compatible routes for Cursor integration (/v1/models, /v1/chat/completions)."""
import time
import uuid
import logging
import httpx
from fastapi import APIRouter, HTTPException, Depends
from config import (
CONFIG, MODEL_MAPPING,
rateLimiter,
_isVisionModel, _getInternalModelName, _messagesToPrompt,
_verifyCursorApiKey,
OpenAiChatCompletionRequest, OpenAiChatCompletionResponse,
OpenAiChatCompletionChoice, OpenAiChatCompletionUsage,
OpenAiChatMessage, OpenAiModelInfo, OpenAiModelsResponse,
)
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
# Router that the endpoints in this module register on (via @router.get /
# @router.post); carries the "OpenAI Compatible" tag.
router = APIRouter(tags=["OpenAI Compatible"])
@router.get("/v1/models", response_model=OpenAiModelsResponse)
async def _listOpenAiModels(cursorApiKey: str = Depends(_verifyCursorApiKey)):
    """List the externally visible model names in OpenAI's models format.

    One ``OpenAiModelInfo`` entry is produced per key of ``MODEL_MAPPING``.
    All entries share a single ``created`` timestamp taken when the request
    is handled. The ``cursorApiKey`` dependency runs ``_verifyCursorApiKey``
    for the request; its value is not used in the body.
    """
    now = int(time.time())
    return OpenAiModelsResponse(
        data=[OpenAiModelInfo(id=name, created=now) for name in MODEL_MAPPING]
    )
@router.post(
    "/v1/chat/completions",
    response_model=OpenAiChatCompletionResponse,
)
async def _openAiChatCompletions(
    request: OpenAiChatCompletionRequest,
    cursorApiKey: str = Depends(_verifyCursorApiKey)
):
    """Serve an OpenAI-style chat completion by calling Ollama's generate API.

    The chat messages are flattened into one prompt with
    ``_messagesToPrompt`` and sent, non-streaming, to
    ``{CONFIG['ollamaUrl']}/api/generate``. The upstream reply is wrapped in
    an ``OpenAiChatCompletionResponse`` holding a single assistant choice and
    token usage taken from ``prompt_eval_count`` / ``eval_count``.

    Args:
        request: Parsed chat-completion request body.
        cursorApiKey: Value supplied by the ``_verifyCursorApiKey``
            dependency; also used as the rate-limit key.

    Returns:
        The completion in OpenAI response shape, echoing ``request.model``.

    Raises:
        HTTPException:
            400 if streaming is requested, the prompt is empty, or the
            resolved model is a vision model;
            429 if the rate limiter rejects the request;
            404 if the upstream does not know the model;
            the upstream status code for any other non-200 reply;
            502 if the upstream request fails at transport level or the
            reply is not a JSON object;
            503 if the upstream cannot be reached;
            504 if the upstream times out.
    """
    if request.stream:
        raise HTTPException(
            status_code=400,
            detail="Streaming is not supported by this endpoint"
        )

    allowed, info = rateLimiter.isAllowed(f"cursor:{cursorApiKey}")
    if not allowed:
        raise HTTPException(
            status_code=429,
            detail={
                "error": "Rate limit exceeded",
                "message": f"Too many requests. Please retry after {info['retryAfter']} seconds.",
                "retryAfter": info["retryAfter"],
                "limit": info["limit"],
                "remaining": info["remaining"]
            },
            headers={
                "Retry-After": str(int(info["retryAfter"])),
                "X-RateLimit-Limit": str(info["limit"]),
                "X-RateLimit-Remaining": str(info["remaining"]),
                "X-RateLimit-Reset": str(info["resetSeconds"])
            }
        )

    promptText = _messagesToPrompt(request.messages).strip()
    if not promptText:
        raise HTTPException(status_code=400, detail="messages must contain text content")

    internalModelName = _getInternalModelName(request.model)
    if _isVisionModel(internalModelName):
        raise HTTPException(
            status_code=400,
            detail="Vision models are not supported on /v1/chat/completions"
        )

    # Generation options forwarded to Ollama; optional request fields are
    # only included when the client actually set them.
    requestOptions = {
        "num_ctx": 8192
    }
    if request.temperature is not None:
        requestOptions["temperature"] = request.temperature
    if request.maxTokens is not None:
        requestOptions["num_predict"] = request.maxTokens

    requestBody = {
        "model": internalModelName,
        "prompt": promptText,
        "stream": False,
        "options": requestOptions
    }

    # Keep the try body limited to the network call: only this can raise the
    # httpx errors handled below. The non-streaming response body is fully
    # read by post(), so it remains usable after the client is closed.
    try:
        async with httpx.AsyncClient(timeout=3600.0) as client:
            response = await client.post(
                f"{CONFIG['ollamaUrl']}/api/generate",
                json=requestBody
            )
    except httpx.TimeoutException as err:
        logger.warning("Ollama request timed out: %s", err)
        raise HTTPException(status_code=504, detail="Upstream timeout (Ollama)") from err
    except httpx.ConnectError as err:
        logger.warning("Cannot connect to Ollama: %s", err)
        raise HTTPException(status_code=503, detail="Cannot connect to Ollama upstream") from err
    except httpx.RequestError as err:
        # Any other transport failure (connection dropped mid-response,
        # protocol error, ...) would otherwise surface as an unhandled 500.
        logger.warning("Ollama request failed: %s", err)
        raise HTTPException(status_code=502, detail="Ollama upstream request failed") from err

    if response.status_code == 404:
        raise HTTPException(
            status_code=404,
            detail=f'Model "{request.model}" not found'
        )
    if response.status_code != 200:
        raise HTTPException(
            status_code=response.status_code,
            detail=f"Ollama API error: {response.status_code} - {response.text[:200]}"
        )

    try:
        responseData = response.json()
    except ValueError as err:
        # json.JSONDecodeError is a ValueError subclass.
        logger.warning("Ollama returned a non-JSON body: %s", err)
        raise HTTPException(status_code=502, detail="Ollama returned an invalid JSON response") from err
    if not isinstance(responseData, dict):
        logger.warning("Ollama returned unexpected JSON type: %s", type(responseData).__name__)
        raise HTTPException(status_code=502, detail="Ollama returned an invalid JSON response")

    # Tolerate missing or null fields instead of crashing on None.
    rawText = responseData.get("response")
    responseText = rawText.strip() if isinstance(rawText, str) else ""
    promptEvalCount = int(responseData.get("prompt_eval_count") or 0)
    evalCount = int(responseData.get("eval_count") or 0)

    return OpenAiChatCompletionResponse(
        id=f"chatcmpl-{uuid.uuid4().hex}",
        created=int(time.time()),
        model=request.model,
        choices=[
            OpenAiChatCompletionChoice(
                index=0,
                message=OpenAiChatMessage(role="assistant", content=responseText)
            )
        ],
        usage=OpenAiChatCompletionUsage(
            promptTokens=promptEvalCount,
            completionTokens=evalCount,
            totalTokens=promptEvalCount + evalCount
        )
    )