# Copyright (c) 2026 PowerOn AG
# All rights reserved.
"""OpenAI-compatible routes (/v1/models, /v1/chat/completions, /v1/embeddings)."""

import time
import uuid
import logging
from typing import List, Optional

import httpx
from fastapi import APIRouter, HTTPException, Depends, Header

from config import (
    CONFIG,
    MODEL_MAPPING,
    rateLimiter,
    _isVisionModel,
    _getInternalModelName,
    _messagesToPrompt,
    _verifyCursorApiKey,
    OpenAiChatCompletionRequest,
    OpenAiChatCompletionResponse,
    OpenAiChatCompletionChoice,
    OpenAiChatCompletionUsage,
    OpenAiChatMessage,
    OpenAiModelInfo,
    OpenAiModelsResponse,
    OpenAiEmbeddingRequest,
    OpenAiEmbeddingResponse,
    OpenAiEmbeddingData,
    OpenAiEmbeddingUsage,
)

logger = logging.getLogger(__name__)

router = APIRouter(tags=["OpenAI Compatible"])


def _retryAfterHeaderValue(info: dict) -> str:
    """Return the ``Retry-After`` header value for a rate-limit ``info`` dict.

    The delay is truncated to whole seconds and one second is added, so the
    advertised wait is never shorter than ``info["retryAfter"]`` (plain
    truncation could advertise ``0`` and invite an immediate retry).
    """
    return str(int(info["retryAfter"]) + 1)


@router.get("/v1/models", response_model=OpenAiModelsResponse)
async def _listOpenAiModels(cursorApiKey: str = Depends(_verifyCursorApiKey)):
    """OpenAI-compatible models endpoint for Cursor.

    Returns one ``OpenAiModelInfo`` entry per key of ``MODEL_MAPPING``; every
    entry carries the same ``created`` timestamp (the time of this request).
    """
    createdAt = int(time.time())
    modelData = [
        OpenAiModelInfo(id=externalName, created=createdAt)
        for externalName in MODEL_MAPPING
    ]
    return OpenAiModelsResponse(data=modelData)


@router.post(
    "/v1/chat/completions",
    response_model=OpenAiChatCompletionResponse,
)
async def _openAiChatCompletions(
    request: OpenAiChatCompletionRequest,
    cursorApiKey: str = Depends(_verifyCursorApiKey)
):
    """OpenAI-compatible chat completions endpoint for Cursor.

    Flattens the chat messages into a single prompt, forwards it to the
    Ollama ``/api/generate`` endpoint (non-streaming) and wraps the answer in
    an OpenAI-style chat completion response.

    Raises:
        HTTPException: 400 for streaming requests, empty prompts or vision
            models; 429 when the per-key rate limit is exceeded; 404 when
            Ollama does not know the model; the upstream status code for
            other non-200 Ollama replies; 504 on upstream timeout; 503 when
            Ollama is unreachable; 500 for any other unexpected failure.
    """
    if request.stream:
        raise HTTPException(
            status_code=400,
            detail="Streaming is not supported by this endpoint"
        )

    # Rate limiting is keyed per Cursor API key.
    allowed, info = rateLimiter.isAllowed(f"cursor:{cursorApiKey}")
    if not allowed:
        raise HTTPException(
            status_code=429,
            detail={
                "error": "Rate limit exceeded",
                "message": f"Too many requests. Please retry after {info['retryAfter']} seconds.",
                "retryAfter": info["retryAfter"],
                "limit": info["limit"],
                "remaining": info["remaining"]
            },
            headers={
                "Retry-After": _retryAfterHeaderValue(info),
                "X-RateLimit-Limit": str(info["limit"]),
                "X-RateLimit-Remaining": str(info["remaining"]),
                "X-RateLimit-Reset": str(info["resetSeconds"])
            }
        )

    promptText = _messagesToPrompt(request.messages).strip()
    if not promptText:
        raise HTTPException(status_code=400, detail="messages must contain text content")

    internalModelName = _getInternalModelName(request.model)
    if _isVisionModel(internalModelName):
        raise HTTPException(
            status_code=400,
            detail="Vision models are not supported on /v1/chat/completions"
        )

    # Fixed context-window option sent to Ollama; the optional sampling
    # settings are only forwarded when the client supplied them.
    requestOptions = {
        "num_ctx": 8192
    }
    if request.temperature is not None:
        requestOptions["temperature"] = request.temperature
    if request.maxTokens is not None:
        requestOptions["num_predict"] = request.maxTokens

    requestBody = {
        "model": internalModelName,
        "prompt": promptText,
        "stream": False,
        "options": requestOptions
    }

    try:
        # Very long timeout — presumably to accommodate slow generations.
        async with httpx.AsyncClient(timeout=3600.0) as client:
            response = await client.post(
                f"{CONFIG['ollamaUrl']}/api/generate",
                json=requestBody
            )

        if response.status_code == 404:
            raise HTTPException(
                status_code=404,
                detail=f'Model "{request.model}" not found'
            )
        if response.status_code != 200:
            raise HTTPException(
                status_code=response.status_code,
                detail=f"Ollama API error: {response.status_code} - {response.text[:200]}"
            )

        responseData = response.json()
        # `or` defaults also cover fields that are present but null.
        responseText = (responseData.get("response") or "").strip()
        promptEvalCount = int(responseData.get("prompt_eval_count") or 0)
        evalCount = int(responseData.get("eval_count") or 0)

        return OpenAiChatCompletionResponse(
            id=f"chatcmpl-{uuid.uuid4().hex}",
            created=int(time.time()),
            model=request.model,
            choices=[
                OpenAiChatCompletionChoice(
                    index=0,
                    message=OpenAiChatMessage(role="assistant", content=responseText)
                )
            ],
            usage=OpenAiChatCompletionUsage(
                promptTokens=promptEvalCount,
                completionTokens=evalCount,
                totalTokens=promptEvalCount + evalCount
            )
        )
    except httpx.TimeoutException as e:
        raise HTTPException(status_code=504, detail="Upstream timeout (Ollama)") from e
    except httpx.ConnectError as e:
        raise HTTPException(status_code=503, detail="Cannot connect to Ollama upstream") from e
    except HTTPException:
        # Deliberate HTTP errors raised above must pass through untouched.
        raise
    except Exception as e:
        # Same fallback as the embeddings endpoint: log with traceback and
        # answer with a descriptive 500 instead of an unlogged generic one.
        logger.exception("Error in chat completions endpoint: %s", e)
        raise HTTPException(status_code=500, detail=f"Chat completion error: {str(e)}") from e


@router.post("/v1/embeddings", response_model=OpenAiEmbeddingResponse)
async def _openAiEmbeddings(
    request: OpenAiEmbeddingRequest,
    xApiKey: Optional[str] = Header(None, alias="X-API-Key"),
):
    """OpenAI-compatible embeddings endpoint. Proxies to Ollama /api/embed.

    A single string input is wrapped into a one-element list before it is
    sent upstream; the returned vectors are indexed in upstream order.

    Raises:
        HTTPException: 401 for a wrong API key; 429 when the rate limit is
            exceeded; 404 when Ollama does not know the model; the upstream
            status code for other non-200 Ollama replies; 504 on upstream
            timeout; 503 when Ollama is unreachable; 500 for any other
            unexpected failure.
    """
    # NOTE(review): a request without an X-API-Key header skips both the key
    # check and rate limiting below — confirm that anonymous access is intended.
    if xApiKey:
        if CONFIG["apiKey"] and xApiKey != CONFIG["apiKey"]:
            raise HTTPException(status_code=401, detail="Invalid API key")
        allowed, info = rateLimiter.isAllowed(xApiKey)
        if not allowed:
            raise HTTPException(
                status_code=429,
                detail=f"Rate limit exceeded. Retry after {info['retryAfter']} seconds.",
                headers={"Retry-After": _retryAfterHeaderValue(info)},
            )

    internalModelName = _getInternalModelName(request.model)
    texts = request.input if isinstance(request.input, list) else [request.input]

    ollamaPayload = {"model": internalModelName, "input": texts}

    try:
        async with httpx.AsyncClient(timeout=120.0) as client:
            response = await client.post(
                f"{CONFIG['ollamaUrl']}/api/embed",
                json=ollamaPayload,
            )

        if response.status_code == 404:
            raise HTTPException(
                status_code=404,
                detail=f'Model "{internalModelName}" not found. Install with: ollama pull {internalModelName}',
            )
        if response.status_code != 200:
            raise HTTPException(
                status_code=response.status_code,
                detail=f"Ollama API error: {response.status_code} - {response.text[:200]}",
            )

        responseData = response.json()
        # `or` defaults also cover fields that are present but null.
        rawEmbeddings = responseData.get("embeddings") or []
        totalTokens = responseData.get("prompt_eval_count") or 0

        embeddingData = [
            OpenAiEmbeddingData(embedding=vec, index=i)
            for i, vec in enumerate(rawEmbeddings)
        ]

        return OpenAiEmbeddingResponse(
            data=embeddingData,
            model=request.model,
            usage=OpenAiEmbeddingUsage(
                promptTokens=totalTokens,
                totalTokens=totalTokens,
            ),
        )
    except httpx.TimeoutException as e:
        raise HTTPException(status_code=504, detail="Upstream timeout (Ollama)") from e
    except httpx.ConnectError as e:
        raise HTTPException(status_code=503, detail="Cannot connect to Ollama upstream") from e
    except HTTPException:
        # Deliberate HTTP errors raised above must pass through untouched.
        raise
    except Exception as e:
        # logger.exception keeps the traceback that logger.error dropped.
        logger.exception("Error in embedding endpoint: %s", e)
        raise HTTPException(status_code=500, detail=f"Embedding error: {str(e)}") from e