# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
"""OpenAI-compatible routes for Cursor integration (/v1/models, /v1/chat/completions)."""

import time
import uuid
import logging

import httpx
from fastapi import APIRouter, HTTPException, Depends

from config import (
    CONFIG,
    MODEL_MAPPING,
    rateLimiter,
    _isVisionModel,
    _getInternalModelName,
    _messagesToPrompt,
    _verifyCursorApiKey,
    OpenAiChatCompletionRequest,
    OpenAiChatCompletionResponse,
    OpenAiChatCompletionChoice,
    OpenAiChatCompletionUsage,
    OpenAiChatMessage,
    OpenAiModelInfo,
    OpenAiModelsResponse,
)

logger = logging.getLogger(__name__)

router = APIRouter(tags=["OpenAI Compatible"])

# Timeout (seconds) applied to the upstream HTTP client for a single generation.
_OLLAMA_TIMEOUT_SECONDS = 3600.0

# Value sent to the upstream as the "num_ctx" option on every request.
_DEFAULT_NUM_CTX = 8192


def _rateLimitException(info: dict) -> HTTPException:
    """Build the 429 error for a request rejected by the rate limiter.

    Args:
        info: Mapping returned by ``rateLimiter.isAllowed``; the keys
            ``retryAfter``, ``limit``, ``remaining`` and ``resetSeconds``
            are read from it.

    Returns:
        An ``HTTPException`` carrying a structured detail body plus
        ``Retry-After`` / ``X-RateLimit-*`` headers.
    """
    return HTTPException(
        status_code=429,
        detail={
            "error": "Rate limit exceeded",
            "message": f"Too many requests. Please retry after {info['retryAfter']} seconds.",
            "retryAfter": info["retryAfter"],
            "limit": info["limit"],
            "remaining": info["remaining"],
        },
        headers={
            "Retry-After": str(int(info["retryAfter"])),
            "X-RateLimit-Limit": str(info["limit"]),
            "X-RateLimit-Remaining": str(info["remaining"]),
            "X-RateLimit-Reset": str(info["resetSeconds"]),
        },
    )


def _buildOllamaOptions(request: OpenAiChatCompletionRequest) -> dict:
    """Translate the optional sampling fields of *request* into upstream options.

    ``temperature`` and ``maxTokens`` are forwarded (as ``temperature`` and
    ``num_predict``) only when the caller supplied them.
    """
    requestOptions = {"num_ctx": _DEFAULT_NUM_CTX}
    if request.temperature is not None:
        requestOptions["temperature"] = request.temperature
    if request.maxTokens is not None:
        requestOptions["num_predict"] = request.maxTokens
    return requestOptions


@router.get("/v1/models", response_model=OpenAiModelsResponse)
async def _listOpenAiModels(cursorApiKey: str = Depends(_verifyCursorApiKey)):
    """OpenAI-compatible models endpoint for Cursor.

    Returns one ``OpenAiModelInfo`` per key of ``MODEL_MAPPING``; every entry
    carries the current time as its ``created`` timestamp.
    """
    createdAt = int(time.time())
    modelData = [
        OpenAiModelInfo(id=externalName, created=createdAt)
        for externalName in MODEL_MAPPING
    ]
    return OpenAiModelsResponse(data=modelData)


@router.post(
    "/v1/chat/completions",
    response_model=OpenAiChatCompletionResponse,
)
async def _openAiChatCompletions(
    request: OpenAiChatCompletionRequest,
    cursorApiKey: str = Depends(_verifyCursorApiKey),
):
    """OpenAI-compatible chat completions endpoint for Cursor.

    The chat messages are flattened into a single prompt and sent as a
    non-streaming request to ``{CONFIG['ollamaUrl']}/api/generate``; the
    upstream answer is wrapped into an OpenAI-style completion response.

    Raises:
        HTTPException: 400 for streaming requests, empty prompts or vision
            models; 429 when the rate limiter rejects the key; 404 when the
            upstream answers 404; the upstream status code for any other
            non-200 answer; 504 on upstream timeout; 503 when the upstream
            is unreachable; 502 for any other transport failure or an
            unparseable upstream body.
    """
    if request.stream:
        raise HTTPException(
            status_code=400,
            detail="Streaming is not supported by this endpoint",
        )

    allowed, info = rateLimiter.isAllowed(f"cursor:{cursorApiKey}")
    if not allowed:
        raise _rateLimitException(info)

    promptText = _messagesToPrompt(request.messages).strip()
    if not promptText:
        raise HTTPException(status_code=400, detail="messages must contain text content")

    internalModelName = _getInternalModelName(request.model)
    if _isVisionModel(internalModelName):
        raise HTTPException(
            status_code=400,
            detail="Vision models are not supported on /v1/chat/completions",
        )

    requestBody = {
        "model": internalModelName,
        "prompt": promptText,
        "stream": False,
        "options": _buildOllamaOptions(request),
    }

    # Only the network call lives inside the try block so that transport
    # failures are the only thing these handlers can intercept.
    try:
        async with httpx.AsyncClient(timeout=_OLLAMA_TIMEOUT_SECONDS) as client:
            response = await client.post(
                f"{CONFIG['ollamaUrl']}/api/generate",
                json=requestBody,
            )
    except httpx.TimeoutException as exc:
        raise HTTPException(status_code=504, detail="Upstream timeout (Ollama)") from exc
    except httpx.ConnectError as exc:
        raise HTTPException(status_code=503, detail="Cannot connect to Ollama upstream") from exc
    except httpx.RequestError as exc:
        # Remaining transport failures (read/write/protocol errors) would
        # otherwise escape as an unhandled 500.
        logger.warning("Ollama request failed: %s", exc)
        raise HTTPException(
            status_code=502,
            detail="Error communicating with Ollama upstream",
        ) from exc

    if response.status_code == 404:
        raise HTTPException(
            status_code=404,
            detail=f'Model "{request.model}" not found',
        )
    if response.status_code != 200:
        raise HTTPException(
            status_code=response.status_code,
            detail=f"Ollama API error: {response.status_code} - {response.text[:200]}",
        )

    try:
        responseData = response.json()
    except ValueError as exc:
        logger.warning("Ollama returned a non-JSON body: %s", exc)
        raise HTTPException(
            status_code=502,
            detail="Invalid response from Ollama upstream",
        ) from exc
    if not isinstance(responseData, dict):
        logger.warning("Ollama returned an unexpected JSON payload type")
        raise HTTPException(
            status_code=502,
            detail="Invalid response from Ollama upstream",
        )

    # `or` fallbacks also cover keys that are present but null.
    responseText = (responseData.get("response") or "").strip()
    promptEvalCount = int(responseData.get("prompt_eval_count") or 0)
    evalCount = int(responseData.get("eval_count") or 0)

    return OpenAiChatCompletionResponse(
        id=f"chatcmpl-{uuid.uuid4().hex}",
        created=int(time.time()),
        model=request.model,
        choices=[
            OpenAiChatCompletionChoice(
                index=0,
                message=OpenAiChatMessage(role="assistant", content=responseText),
            )
        ],
        usage=OpenAiChatCompletionUsage(
            promptTokens=promptEvalCount,
            completionTokens=evalCount,
            totalTokens=promptEvalCount + evalCount,
        ),
    )