feat: add /v1/embeddings endpoint for mxbai-embed-large 1024 dim
All checks were successful
Deploy LLM Service / deploy (push) Successful in 22s

Co-authored-by: Cursor <cursoragent@cursor.com>
This commit is contained in:
ValueOn AG 2026-06-10 22:23:21 +02:00
parent b15d283941
commit 33fe5d55bd
3 changed files with 106 additions and 3 deletions

View file

@ -158,6 +158,7 @@ MODEL_MAPPING = {
"poweron-text-general": "qwen2.5:7b",
"poweron-vision-general": "qwen2.5vl:7b",
"poweron-vision-deep": "granite3.2-vision",
"poweron-embed": "mxbai-embed-large",
}
# Next-gen models (RTX PRO 6000 96 GB VRAM — prepared, activate after migration)
@ -167,7 +168,7 @@ MODEL_MAPPING = {
# "poweron-text-reasoning": "deepseek-r1:70b",
# "poweron-vision-general": "llama4:scout",
# "poweron-vision-deep": "qwen2.5vl:72b",
# "poweron-embed": "nomic-embed-text",
# "poweron-embed": "mxbai-embed-large",
# "poweron-transcribe": "whisper-large-v3-turbo",
# }
@ -264,6 +265,30 @@ class OpenAiChatCompletionResponse(BaseModel):
usage: OpenAiChatCompletionUsage
class OpenAiEmbeddingRequest(BaseModel):
    """Request body for the OpenAI-style embeddings call."""

    # Model identifier supplied by the caller.
    model: str
    # Declared as Any: no shape or type is enforced on this field at
    # validation time, so consumers must check it themselves.
    input: Any
    # Optional requested vector size; absent means "not specified".
    dimensions: Optional[int] = Field(default=None)
class OpenAiEmbeddingData(BaseModel):
    """One embedding vector together with its position in the result list."""

    # Constant type tag; defaults to "embedding".
    object: str = Field(default="embedding")
    # The vector components.
    embedding: List[float]
    # Position of this entry within the response's data list.
    index: int
class OpenAiEmbeddingUsage(BaseModel):
    """Token accounting attached to an embeddings response.

    Both counters default to 0 and are exposed under snake_case aliases
    (``prompt_tokens`` / ``total_tokens``).
    """

    # NOTE(review): because these fields declare aliases, pydantic fills
    # them from the alias names unless population by field name is enabled
    # in the model config — confirm how callers construct this model.
    promptTokens: int = Field(0, alias="prompt_tokens")
    totalTokens: int = Field(0, alias="total_tokens")
class OpenAiEmbeddingResponse(BaseModel):
    """Top-level envelope returned for an embeddings request."""

    # Constant type tag; defaults to "list".
    object: str = Field(default="list")
    # One entry per returned embedding vector.
    data: List[OpenAiEmbeddingData]
    # Model identifier reported back to the caller.
    model: str
    # Token counters for the request.
    usage: OpenAiEmbeddingUsage
# ============================================================================
# Helper Functions
# ============================================================================

View file

@ -62,6 +62,7 @@ Connect: ssh -i "C:\Users\pmots\Downloads\ollama-deploy-key.pem" ubuntu@83.228.2
| `poweron-text-general` | `qwen2.5:7b` | Text-Neutralisierung |
| `poweron-vision-general` | `qwen2.5vl:7b` | Handschrift, Dokumente |
| `poweron-vision-deep` | `granite3.2-vision` | Rechnungen, Belege |
| `poweron-embed` | `mxbai-embed-large` | Embedding (1024 dim, RAG failover) |
### URLs
@ -581,6 +582,9 @@ ollama pull qwen2.5vl:7b
# Vision: Rechnungen, Belege
ollama pull granite3.2-vision
# Embedding: RAG multi-provider failover (1024 dim)
ollama pull mxbai-embed-large
```
### Modelle pruefen

View file

@ -1,13 +1,14 @@
# Copyright (c) 2026 PowerOn AG
# All rights reserved.
"""OpenAI-compatible routes for Cursor integration (/v1/models, /v1/chat/completions)."""
"""OpenAI-compatible routes (/v1/models, /v1/chat/completions, /v1/embeddings)."""
import time
import uuid
import logging
from typing import List, Optional
import httpx
from fastapi import APIRouter, HTTPException, Depends
from fastapi import APIRouter, HTTPException, Depends, Header
from config import (
CONFIG, MODEL_MAPPING,
@ -17,6 +18,8 @@ from config import (
OpenAiChatCompletionRequest, OpenAiChatCompletionResponse,
OpenAiChatCompletionChoice, OpenAiChatCompletionUsage,
OpenAiChatMessage, OpenAiModelInfo, OpenAiModelsResponse,
OpenAiEmbeddingRequest, OpenAiEmbeddingResponse,
OpenAiEmbeddingData, OpenAiEmbeddingUsage,
)
logger = logging.getLogger(__name__)
@ -143,3 +146,74 @@ async def _openAiChatCompletions(
raise HTTPException(status_code=504, detail="Upstream timeout (Ollama)")
except httpx.ConnectError:
raise HTTPException(status_code=503, detail="Cannot connect to Ollama upstream")
@router.post("/v1/embeddings", response_model=OpenAiEmbeddingResponse)
async def _openAiEmbeddings(
    request: OpenAiEmbeddingRequest,
    xApiKey: Optional[str] = Header(None, alias="X-API-Key"),
):
    """OpenAI-compatible embeddings endpoint. Proxies to Ollama /api/embed.

    Args:
        request: Parsed request body. ``input`` must be a single string or a
            list of strings; a single string is wrapped into a one-item list.
        xApiKey: Value of the optional ``X-API-Key`` header.

    Returns:
        An ``OpenAiEmbeddingResponse`` with one entry per input text, indexed
        in the order Ollama returned the vectors. Both usage counters are set
        to Ollama's ``prompt_eval_count`` (0 when absent).

    Raises:
        HTTPException:
            400 if ``input`` is neither a string nor a list of strings;
            401 if a key is supplied and does not match the configured key;
            429 if the rate limiter rejects the supplied key;
            404 if Ollama does not know the model;
            502 if Ollama returns a different number of vectors than inputs;
            503 / 504 on upstream connect failure / timeout;
            500 on any other unexpected error;
            otherwise the upstream status code for non-200 Ollama replies.
    """
    # NOTE(review): key validation and rate limiting only run when the
    # X-API-Key header is present; a request without the header skips both.
    # Confirm this is intended (e.g. access is restricted at another layer).
    if xApiKey:
        if CONFIG["apiKey"] and xApiKey != CONFIG["apiKey"]:
            raise HTTPException(status_code=401, detail="Invalid API key")
        allowed, info = rateLimiter.isAllowed(xApiKey)
        if not allowed:
            raise HTTPException(
                status_code=429,
                detail=f"Rate limit exceeded. Retry after {info['retryAfter']} seconds.",
                headers={"Retry-After": str(int(info["retryAfter"]) + 1)},
            )

    internalModelName = _getInternalModelName(request.model)

    # The request model types `input` as Any, so validate it here instead of
    # forwarding token arrays / nulls to Ollama and relaying an opaque error.
    texts = request.input if isinstance(request.input, list) else [request.input]
    if not all(isinstance(text, str) for text in texts):
        raise HTTPException(
            status_code=400,
            detail="'input' must be a string or a list of strings",
        )

    # NOTE(review): request.dimensions is accepted but not forwarded; the
    # vector size is whatever the upstream model produces.
    ollamaPayload = {"model": internalModelName, "input": texts}

    try:
        async with httpx.AsyncClient(timeout=120.0) as client:
            response = await client.post(
                f"{CONFIG['ollamaUrl']}/api/embed",
                json=ollamaPayload,
            )

        if response.status_code == 404:
            raise HTTPException(
                status_code=404,
                detail=f'Model "{internalModelName}" not found. Install with: ollama pull {internalModelName}',
            )
        if response.status_code != 200:
            raise HTTPException(
                status_code=response.status_code,
                detail=f"Ollama API error: {response.status_code} - {response.text[:200]}",
            )

        responseData = response.json()
        rawEmbeddings = responseData.get("embeddings") or []
        # A missing or short "embeddings" list must not be reported as a
        # successful (but silently truncated) result.
        if len(rawEmbeddings) != len(texts):
            raise HTTPException(
                status_code=502,
                detail=(
                    f"Ollama returned {len(rawEmbeddings)} embeddings "
                    f"for {len(texts)} inputs"
                ),
            )
        totalTokens = responseData.get("prompt_eval_count") or 0

        embeddingData = [
            OpenAiEmbeddingData(embedding=vec, index=i)
            for i, vec in enumerate(rawEmbeddings)
        ]
        return OpenAiEmbeddingResponse(
            data=embeddingData,
            model=request.model,
            # Populate via the declared aliases: pydantic ignores the plain
            # field names on aliased fields unless populate-by-name is
            # enabled, which would leave both counters at their default of 0.
            usage=OpenAiEmbeddingUsage(
                prompt_tokens=totalTokens,
                total_tokens=totalTokens,
            ),
        )
    except httpx.TimeoutException:
        raise HTTPException(status_code=504, detail="Upstream timeout (Ollama)")
    except httpx.ConnectError:
        raise HTTPException(status_code=503, detail="Cannot connect to Ollama upstream")
    except HTTPException:
        # Already a well-formed API error; pass it through untouched.
        raise
    except Exception as e:
        # Boundary handler: record the traceback, then surface a generic 500.
        logger.exception("Error in embedding endpoint: %s", e)
        raise HTTPException(status_code=500, detail=f"Embedding error: {str(e)}") from e