feat: add /v1/embeddings endpoint for mxbai-embed-large 1024 dim
All checks were successful
Deploy LLM Service / deploy (push) Successful in 22s

Co-authored-by: Cursor <cursoragent@cursor.com>
This commit is contained in:
ValueOn AG 2026-06-10 22:23:21 +02:00
parent b15d283941
commit 33fe5d55bd
3 changed files with 106 additions and 3 deletions

View file

@ -158,6 +158,7 @@ MODEL_MAPPING = {
"poweron-text-general": "qwen2.5:7b", "poweron-text-general": "qwen2.5:7b",
"poweron-vision-general": "qwen2.5vl:7b", "poweron-vision-general": "qwen2.5vl:7b",
"poweron-vision-deep": "granite3.2-vision", "poweron-vision-deep": "granite3.2-vision",
"poweron-embed": "mxbai-embed-large",
} }
# Next-gen models (RTX PRO 6000 96 GB VRAM — prepared, activate after migration) # Next-gen models (RTX PRO 6000 96 GB VRAM — prepared, activate after migration)
@ -167,7 +168,7 @@ MODEL_MAPPING = {
# "poweron-text-reasoning": "deepseek-r1:70b", # "poweron-text-reasoning": "deepseek-r1:70b",
# "poweron-vision-general": "llama4:scout", # "poweron-vision-general": "llama4:scout",
# "poweron-vision-deep": "qwen2.5vl:72b", # "poweron-vision-deep": "qwen2.5vl:72b",
# "poweron-embed": "nomic-embed-text", # "poweron-embed": "mxbai-embed-large",
# "poweron-transcribe": "whisper-large-v3-turbo", # "poweron-transcribe": "whisper-large-v3-turbo",
# } # }
@ -264,6 +265,30 @@ class OpenAiChatCompletionResponse(BaseModel):
usage: OpenAiChatCompletionUsage usage: OpenAiChatCompletionUsage
class OpenAiEmbeddingRequest(BaseModel):
    """Request body accepted by the OpenAI-compatible embeddings route."""

    # Model identifier as sent by the client.
    model: str
    # Deliberately untyped: the route handler accepts either a single value
    # or a list and normalises it to a list before calling upstream.
    input: Any
    # Optional requested vector size; defaults to None when omitted.
    dimensions: Optional[int] = None
class OpenAiEmbeddingData(BaseModel):
    """One embedding vector together with its position in the result list."""

    # Constant type tag for this entry.
    object: str = "embedding"
    # The embedding vector itself.
    embedding: List[float]
    # Zero-based position of this vector within the response's data list.
    index: int
class OpenAiEmbeddingUsage(BaseModel):
    """Token accounting attached to an embeddings response.

    Both fields are exposed under snake_case aliases ("prompt_tokens",
    "total_tokens") and default to 0.
    """

    # NOTE(review): no populate-by-name setting is declared on this model, so
    # callers presumably have to pass the alias names when constructing it --
    # confirm against the pydantic version in use.
    promptTokens: int = Field(0, alias="prompt_tokens")
    totalTokens: int = Field(0, alias="total_tokens")
class OpenAiEmbeddingResponse(BaseModel):
    """Response envelope returned by the OpenAI-compatible embeddings route."""

    # Constant type tag for the envelope.
    object: str = "list"
    # One entry per embedding vector.
    data: List[OpenAiEmbeddingData]
    # Model identifier reported back to the client.
    model: str
    # Token usage summary for the request.
    usage: OpenAiEmbeddingUsage
# ============================================================================ # ============================================================================
# Helper Functions # Helper Functions
# ============================================================================ # ============================================================================

View file

@ -62,6 +62,7 @@ Connect: ssh -i "C:\Users\pmots\Downloads\ollama-deploy-key.pem" ubuntu@83.228.2
| `poweron-text-general` | `qwen2.5:7b` | Text-Neutralisierung | | `poweron-text-general` | `qwen2.5:7b` | Text-Neutralisierung |
| `poweron-vision-general` | `qwen2.5vl:7b` | Handschrift, Dokumente | | `poweron-vision-general` | `qwen2.5vl:7b` | Handschrift, Dokumente |
| `poweron-vision-deep` | `granite3.2-vision` | Rechnungen, Belege | | `poweron-vision-deep` | `granite3.2-vision` | Rechnungen, Belege |
| `poweron-embed` | `mxbai-embed-large` | Embedding (1024 dim, RAG failover) |
### URLs ### URLs
@ -581,6 +582,9 @@ ollama pull qwen2.5vl:7b
# Vision: Rechnungen, Belege # Vision: Rechnungen, Belege
ollama pull granite3.2-vision ollama pull granite3.2-vision
# Embedding: RAG multi-provider failover (1024 dim)
ollama pull mxbai-embed-large
``` ```
### Modelle pruefen ### Modelle pruefen

View file

@ -1,13 +1,14 @@
# Copyright (c) 2026 PowerOn AG # Copyright (c) 2026 PowerOn AG
# All rights reserved. # All rights reserved.
"""OpenAI-compatible routes for Cursor integration (/v1/models, /v1/chat/completions).""" """OpenAI-compatible routes (/v1/models, /v1/chat/completions, /v1/embeddings)."""
import time import time
import uuid import uuid
import logging import logging
from typing import List, Optional
import httpx import httpx
from fastapi import APIRouter, HTTPException, Depends from fastapi import APIRouter, HTTPException, Depends, Header
from config import ( from config import (
CONFIG, MODEL_MAPPING, CONFIG, MODEL_MAPPING,
@ -17,6 +18,8 @@ from config import (
OpenAiChatCompletionRequest, OpenAiChatCompletionResponse, OpenAiChatCompletionRequest, OpenAiChatCompletionResponse,
OpenAiChatCompletionChoice, OpenAiChatCompletionUsage, OpenAiChatCompletionChoice, OpenAiChatCompletionUsage,
OpenAiChatMessage, OpenAiModelInfo, OpenAiModelsResponse, OpenAiChatMessage, OpenAiModelInfo, OpenAiModelsResponse,
OpenAiEmbeddingRequest, OpenAiEmbeddingResponse,
OpenAiEmbeddingData, OpenAiEmbeddingUsage,
) )
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
@ -143,3 +146,74 @@ async def _openAiChatCompletions(
raise HTTPException(status_code=504, detail="Upstream timeout (Ollama)") raise HTTPException(status_code=504, detail="Upstream timeout (Ollama)")
except httpx.ConnectError: except httpx.ConnectError:
raise HTTPException(status_code=503, detail="Cannot connect to Ollama upstream") raise HTTPException(status_code=503, detail="Cannot connect to Ollama upstream")
@router.post("/v1/embeddings", response_model=OpenAiEmbeddingResponse)
async def _openAiEmbeddings(
    request: OpenAiEmbeddingRequest,
    xApiKey: Optional[str] = Header(None, alias="X-API-Key"),
):
    """OpenAI-compatible embeddings endpoint. Proxies to Ollama /api/embed.

    The requested model name is translated with ``_getInternalModelName``,
    the input is normalised to a list, and the upstream ``embeddings`` array
    is repackaged as an ``OpenAiEmbeddingResponse``.

    Args:
        request: Parsed request body (model name, input, optional dimensions).
        xApiKey: Value of the ``X-API-Key`` header, if the client sent one.

    Returns:
        OpenAiEmbeddingResponse with one entry per upstream vector, echoing
        the model name the client sent, and usage taken from the upstream
        ``prompt_eval_count``.

    Raises:
        HTTPException: 401 for a mismatching API key, 429 when the rate
            limiter rejects the key, 404 when upstream reports the model as
            missing, the upstream status code for any other non-200 reply,
            504 on upstream timeout, 503 when upstream is unreachable, and
            500 for any other failure.
    """
    # NOTE(review): key validation and rate limiting only run when the header
    # is present; a request without X-API-Key passes through unchecked.
    # Confirm this is intended before exposing the route publicly.
    if xApiKey:
        if CONFIG["apiKey"] and xApiKey != CONFIG["apiKey"]:
            raise HTTPException(status_code=401, detail="Invalid API key")
        allowed, info = rateLimiter.isAllowed(xApiKey)
        if not allowed:
            raise HTTPException(
                status_code=429,
                detail=f"Rate limit exceeded. Retry after {info['retryAfter']} seconds.",
                headers={"Retry-After": str(int(info["retryAfter"]) + 1)},
            )

    internalModelName = _getInternalModelName(request.model)
    # Upstream is always sent a list, so wrap a single input value.
    texts = request.input if isinstance(request.input, list) else [request.input]
    # NOTE(review): request.dimensions is accepted but not forwarded upstream.
    ollamaPayload = {"model": internalModelName, "input": texts}

    try:
        async with httpx.AsyncClient(timeout=120.0) as client:
            response = await client.post(
                f"{CONFIG['ollamaUrl']}/api/embed",
                json=ollamaPayload,
            )

            if response.status_code == 404:
                raise HTTPException(
                    status_code=404,
                    detail=f'Model "{internalModelName}" not found. Install with: ollama pull {internalModelName}',
                )
            if response.status_code != 200:
                raise HTTPException(
                    status_code=response.status_code,
                    detail=f"Ollama API error: {response.status_code} - {response.text[:200]}",
                )

            responseData = response.json()
            rawEmbeddings = responseData.get("embeddings", [])
            # Treat a missing or null token count as 0 so validation of the
            # integer usage fields cannot fail on it.
            totalTokens = responseData.get("prompt_eval_count") or 0

            embeddingData = [
                OpenAiEmbeddingData(embedding=vec, index=i)
                for i, vec in enumerate(rawEmbeddings)
            ]
            # The usage fields declare aliases; pydantic only accepts the
            # alias on input unless populate-by-name is enabled, so passing
            # the attribute names would silently leave both counts at 0.
            usage = OpenAiEmbeddingUsage(
                prompt_tokens=totalTokens,
                total_tokens=totalTokens,
            )
            return OpenAiEmbeddingResponse(
                data=embeddingData,
                model=request.model,
                usage=usage,
            )
    except httpx.TimeoutException:
        raise HTTPException(status_code=504, detail="Upstream timeout (Ollama)")
    except httpx.ConnectError:
        raise HTTPException(status_code=503, detail="Cannot connect to Ollama upstream")
    except HTTPException:
        # Deliberate HTTP errors raised above must not be rewrapped as 500.
        raise
    except Exception as e:
        # logger.exception keeps the traceback alongside the message.
        logger.exception(f"Error in embedding endpoint: {e}")
        raise HTTPException(status_code=500, detail=f"Embedding error: {str(e)}") from e