From 33fe5d55bd073944c4e62bab162695c14ab369b9 Mon Sep 17 00:00:00 2001
From: ValueOn AG
Date: Wed, 10 Jun 2026 22:23:21 +0200
Subject: [PATCH] feat: add /v1/embeddings endpoint for mxbai-embed-large 1024
dim
Co-authored-by: Cursor
---
config.py | 27 +++++++++++++++-
docu/setupserver.md | 4 +++
routeOpenAi.py | 78 +++++++++++++++++++++++++++++++++++++++++++--
3 files changed, 106 insertions(+), 3 deletions(-)
diff --git a/config.py b/config.py
index a30ae72..c39a6c8 100644
--- a/config.py
+++ b/config.py
@@ -158,6 +158,7 @@ MODEL_MAPPING = {
"poweron-text-general": "qwen2.5:7b",
"poweron-vision-general": "qwen2.5vl:7b",
"poweron-vision-deep": "granite3.2-vision",
+ "poweron-embed": "mxbai-embed-large",
}
# Next-gen models (RTX PRO 6000 96 GB VRAM — prepared, activate after migration)
@@ -167,7 +168,7 @@ MODEL_MAPPING = {
# "poweron-text-reasoning": "deepseek-r1:70b",
# "poweron-vision-general": "llama4:scout",
# "poweron-vision-deep": "qwen2.5vl:72b",
-# "poweron-embed": "nomic-embed-text",
+# "poweron-embed": "mxbai-embed-large",
# "poweron-transcribe": "whisper-large-v3-turbo",
# }
@@ -264,6 +265,30 @@ class OpenAiChatCompletionResponse(BaseModel):
usage: OpenAiChatCompletionUsage
+class OpenAiEmbeddingRequest(BaseModel):  # body model of the POST /v1/embeddings route
+    model: str  # client-facing model name; the route resolves it with _getInternalModelName
+    input: Any  # the route forwards a list as-is and wraps any other value in a one-item list
+    dimensions: Optional[int] = None  # optional; None when the client omits it
+
+
+class OpenAiEmbeddingData(BaseModel):  # one element of OpenAiEmbeddingResponse.data
+    object: str = "embedding"  # fixed type tag, defaults to "embedding"
+    embedding: List[float]  # the vector, taken by the route from Ollama's "embeddings" list
+    index: int  # position of this vector within that "embeddings" list
+
+
+class OpenAiEmbeddingUsage(BaseModel):  # token counts; NOTE(review): aliased fields, no populate_by_name visible here - confirm field-name construction works
+    promptTokens: int = Field(default=0, alias="prompt_tokens")  # exposed under the alias "prompt_tokens"
+    totalTokens: int = Field(default=0, alias="total_tokens")  # exposed under the alias "total_tokens"
+
+
+class OpenAiEmbeddingResponse(BaseModel):  # response_model of the POST /v1/embeddings route
+    object: str = "list"  # fixed type tag, defaults to "list"
+    data: List[OpenAiEmbeddingData]  # one entry per vector returned by the upstream
+    model: str  # the route echoes the model name sent in the request
+    usage: OpenAiEmbeddingUsage  # token accounting for the call
+
+
# ============================================================================
# Helper Functions
# ============================================================================
diff --git a/docu/setupserver.md b/docu/setupserver.md
index 70fcbf3..de80f84 100644
--- a/docu/setupserver.md
+++ b/docu/setupserver.md
@@ -62,6 +62,7 @@ Connect: ssh -i "C:\Users\pmots\Downloads\ollama-deploy-key.pem" ubuntu@83.228.2
| `poweron-text-general` | `qwen2.5:7b` | Text-Neutralisierung |
| `poweron-vision-general` | `qwen2.5vl:7b` | Handschrift, Dokumente |
| `poweron-vision-deep` | `granite3.2-vision` | Rechnungen, Belege |
+| `poweron-embed` | `mxbai-embed-large` | Embedding (1024 dim, RAG failover) |
### URLs
@@ -581,6 +582,9 @@ ollama pull qwen2.5vl:7b
# Vision: Rechnungen, Belege
ollama pull granite3.2-vision
+
+# Embedding: RAG multi-provider failover (1024 dim)
+ollama pull mxbai-embed-large
```
### Modelle pruefen
diff --git a/routeOpenAi.py b/routeOpenAi.py
index 7ba9953..64c578e 100644
--- a/routeOpenAi.py
+++ b/routeOpenAi.py
@@ -1,13 +1,14 @@
# Copyright (c) 2026 PowerOn AG
# All rights reserved.
-"""OpenAI-compatible routes for Cursor integration (/v1/models, /v1/chat/completions)."""
+"""OpenAI-compatible routes (/v1/models, /v1/chat/completions, /v1/embeddings)."""
import time
import uuid
import logging
+from typing import List, Optional
import httpx
-from fastapi import APIRouter, HTTPException, Depends
+from fastapi import APIRouter, HTTPException, Depends, Header
from config import (
CONFIG, MODEL_MAPPING,
@@ -17,6 +18,8 @@ from config import (
OpenAiChatCompletionRequest, OpenAiChatCompletionResponse,
OpenAiChatCompletionChoice, OpenAiChatCompletionUsage,
OpenAiChatMessage, OpenAiModelInfo, OpenAiModelsResponse,
+ OpenAiEmbeddingRequest, OpenAiEmbeddingResponse,
+ OpenAiEmbeddingData, OpenAiEmbeddingUsage,
)
logger = logging.getLogger(__name__)
@@ -143,3 +146,74 @@ async def _openAiChatCompletions(
raise HTTPException(status_code=504, detail="Upstream timeout (Ollama)")
except httpx.ConnectError:
raise HTTPException(status_code=503, detail="Cannot connect to Ollama upstream")
+
+
+@router.post("/v1/embeddings", response_model=OpenAiEmbeddingResponse)
+async def _openAiEmbeddings(
+    request: OpenAiEmbeddingRequest,
+    xApiKey: Optional[str] = Header(None, alias="X-API-Key"),
+):
+    """OpenAI-compatible embeddings endpoint. Proxies to Ollama /api/embed.
+
+    Raises HTTPException: 401 bad or missing key, 429 rate limit, 400 non-text
+    input, 404 unknown model, 502/503/504 upstream faults, 500 anything else.
+    """
+    # A configured key is mandatory: an absent header must not skip the check.
+    if CONFIG["apiKey"] and xApiKey != CONFIG["apiKey"]:
+        raise HTTPException(status_code=401, detail="Invalid API key")
+    if xApiKey:
+        allowed, info = rateLimiter.isAllowed(xApiKey)
+        if not allowed:
+            raise HTTPException(
+                status_code=429,
+                detail=f"Rate limit exceeded. Retry after {info['retryAfter']} seconds.",
+                headers={"Retry-After": str(int(info["retryAfter"]) + 1)},
+            )
+
+    internalModelName = _getInternalModelName(request.model)
+    texts = request.input if isinstance(request.input, list) else [request.input]
+    # Only text is forwarded upstream, so fail fast on token arrays and the like.
+    if not all(isinstance(text, str) for text in texts):
+        raise HTTPException(status_code=400, detail="input must be a string or a list of strings")
+
+    ollamaPayload = {"model": internalModelName, "input": texts}
+    if request.dimensions is not None:
+        ollamaPayload["dimensions"] = request.dimensions  # honour the requested vector size
+
+    try:
+        async with httpx.AsyncClient(timeout=120.0) as client:
+            response = await client.post(
+                f"{CONFIG['ollamaUrl']}/api/embed",
+                json=ollamaPayload,
+            )
+        if response.status_code == 404:
+            raise HTTPException(
+                status_code=404,
+                detail=f'Model "{internalModelName}" not found. Install with: ollama pull {internalModelName}',
+            )
+        if response.status_code != 200:
+            raise HTTPException(
+                status_code=response.status_code,
+                detail=f"Ollama API error: {response.status_code} - {response.text[:200]}",
+            )
+        responseData = response.json()
+        rawEmbeddings = responseData.get("embeddings") or []
+        # One vector per input is required for the returned indices to line up.
+        if len(rawEmbeddings) != len(texts):
+            raise HTTPException(status_code=502, detail="Ollama returned an unexpected number of embeddings")
+        totalTokens = responseData.get("prompt_eval_count") or 0
+        return OpenAiEmbeddingResponse(
+            data=[OpenAiEmbeddingData(embedding=vec, index=i) for i, vec in enumerate(rawEmbeddings)],
+            model=request.model,
+            # Alias keywords populate the fields with or without populate_by_name.
+            usage=OpenAiEmbeddingUsage(prompt_tokens=totalTokens, total_tokens=totalTokens),
+        )
+    except httpx.TimeoutException:
+        raise HTTPException(status_code=504, detail="Upstream timeout (Ollama)")
+    except httpx.ConnectError:
+        raise HTTPException(status_code=503, detail="Cannot connect to Ollama upstream")
+    except HTTPException:
+        raise
+    except Exception as e:
+        logger.exception("Error in embedding endpoint")
+        raise HTTPException(status_code=500, detail=f"Embedding error: {str(e)}") from e