feat: add /v1/embeddings endpoint for mxbai-embed-large 1024 dim
All checks were successful
Deploy LLM Service / deploy (push) Successful in 22s
All checks were successful
Deploy LLM Service / deploy (push) Successful in 22s
Co-authored-by: Cursor <cursoragent@cursor.com>
This commit is contained in:
parent
b15d283941
commit
33fe5d55bd
3 changed files with 106 additions and 3 deletions
27
config.py
27
config.py
|
|
@ -158,6 +158,7 @@ MODEL_MAPPING = {
|
|||
"poweron-text-general": "qwen2.5:7b",
|
||||
"poweron-vision-general": "qwen2.5vl:7b",
|
||||
"poweron-vision-deep": "granite3.2-vision",
|
||||
"poweron-embed": "mxbai-embed-large",
|
||||
}
|
||||
|
||||
# Next-gen models (RTX PRO 6000 96 GB VRAM — prepared, activate after migration)
|
||||
|
|
@ -167,7 +168,7 @@ MODEL_MAPPING = {
|
|||
# "poweron-text-reasoning": "deepseek-r1:70b",
|
||||
# "poweron-vision-general": "llama4:scout",
|
||||
# "poweron-vision-deep": "qwen2.5vl:72b",
|
||||
# "poweron-embed": "nomic-embed-text",
|
||||
# "poweron-embed": "mxbai-embed-large",
|
||||
# "poweron-transcribe": "whisper-large-v3-turbo",
|
||||
# }
|
||||
|
||||
|
|
@ -264,6 +265,30 @@ class OpenAiChatCompletionResponse(BaseModel):
|
|||
usage: OpenAiChatCompletionUsage
|
||||
|
||||
|
||||
class OpenAiEmbeddingRequest(BaseModel):
    # Request body accepted by the POST /v1/embeddings route.

    # Model name exactly as sent by the client; the route maps it to an
    # internal model name before calling the upstream.
    model: str

    # Deliberately untyped: the route accepts either a list or a single
    # value and wraps a non-list value into a one-element list.
    input: Any

    # Optional requested vector size. The embeddings route in this file does
    # not include it in the payload it sends upstream.
    dimensions: Optional[int] = None
|
||||
|
||||
|
||||
class OpenAiEmbeddingData(BaseModel):
    # One embedding vector inside an embeddings response.

    # Constant type tag for this entry.
    object: str = "embedding"

    # The embedding vector itself.
    embedding: List[float]

    # Position of this vector in the response; the route fills it from
    # enumerate() over the upstream "embeddings" list.
    index: int
|
||||
|
||||
|
||||
class OpenAiEmbeddingUsage(BaseModel):
    # Token accounting attached to an embeddings response.
    #
    # Both fields carry snake_case aliases. Unless population by field name
    # is enabled on the model, pydantic only accepts the alias names
    # ("prompt_tokens", "total_tokens") as constructor keywords.

    # Tokens consumed by the input; defaults to 0.
    promptTokens: int = Field(0, alias="prompt_tokens")

    # Total tokens for the request; defaults to 0.
    totalTokens: int = Field(0, alias="total_tokens")
|
||||
|
||||
|
||||
class OpenAiEmbeddingResponse(BaseModel):
    # Response envelope returned by the POST /v1/embeddings route.

    # Constant type tag for the envelope.
    object: str = "list"

    # One entry per embedding vector returned by the upstream.
    data: List[OpenAiEmbeddingData]

    # Model name; the route echoes the name the client sent.
    model: str

    # Token usage for the request.
    usage: OpenAiEmbeddingUsage
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# Helper Functions
|
||||
# ============================================================================
|
||||
|
|
|
|||
|
|
@ -62,6 +62,7 @@ Connect: ssh -i "C:\Users\pmots\Downloads\ollama-deploy-key.pem" ubuntu@83.228.2
|
|||
| `poweron-text-general` | `qwen2.5:7b` | Text-Neutralisierung |
|
||||
| `poweron-vision-general` | `qwen2.5vl:7b` | Handschrift, Dokumente |
|
||||
| `poweron-vision-deep` | `granite3.2-vision` | Rechnungen, Belege |
|
||||
| `poweron-embed` | `mxbai-embed-large` | Embedding (1024 dim, RAG failover) |
|
||||
|
||||
### URLs
|
||||
|
||||
|
|
@ -581,6 +582,9 @@ ollama pull qwen2.5vl:7b
|
|||
|
||||
# Vision: Rechnungen, Belege
|
||||
ollama pull granite3.2-vision
|
||||
|
||||
# Embedding: RAG multi-provider failover (1024 dim)
|
||||
ollama pull mxbai-embed-large
|
||||
```
|
||||
|
||||
### Modelle pruefen
|
||||
|
|
|
|||
|
|
@ -1,13 +1,14 @@
|
|||
# Copyright (c) 2026 PowerOn AG
|
||||
# All rights reserved.
|
||||
"""OpenAI-compatible routes for Cursor integration (/v1/models, /v1/chat/completions)."""
|
||||
"""OpenAI-compatible routes (/v1/models, /v1/chat/completions, /v1/embeddings)."""
|
||||
|
||||
import time
|
||||
import uuid
|
||||
import logging
|
||||
from typing import List, Optional
|
||||
|
||||
import httpx
|
||||
from fastapi import APIRouter, HTTPException, Depends
|
||||
from fastapi import APIRouter, HTTPException, Depends, Header
|
||||
|
||||
from config import (
|
||||
CONFIG, MODEL_MAPPING,
|
||||
|
|
@ -17,6 +18,8 @@ from config import (
|
|||
OpenAiChatCompletionRequest, OpenAiChatCompletionResponse,
|
||||
OpenAiChatCompletionChoice, OpenAiChatCompletionUsage,
|
||||
OpenAiChatMessage, OpenAiModelInfo, OpenAiModelsResponse,
|
||||
OpenAiEmbeddingRequest, OpenAiEmbeddingResponse,
|
||||
OpenAiEmbeddingData, OpenAiEmbeddingUsage,
|
||||
)
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
|
@ -143,3 +146,74 @@ async def _openAiChatCompletions(
|
|||
raise HTTPException(status_code=504, detail="Upstream timeout (Ollama)")
|
||||
except httpx.ConnectError:
|
||||
raise HTTPException(status_code=503, detail="Cannot connect to Ollama upstream")
|
||||
|
||||
|
||||
@router.post("/v1/embeddings", response_model=OpenAiEmbeddingResponse)
async def _openAiEmbeddings(
    request: OpenAiEmbeddingRequest,
    xApiKey: Optional[str] = Header(None, alias="X-API-Key"),
):
    """OpenAI-compatible embeddings endpoint. Proxies to Ollama /api/embed.

    Args:
        request: Parsed request body. ``input`` may be a list or a single
            value; a single value is wrapped into a one-element list.
        xApiKey: Value of the ``X-API-Key`` header, if the client sent one.

    Returns:
        OpenAiEmbeddingResponse with one entry per vector returned by the
        upstream, the client-supplied model name, and token usage taken from
        the upstream ``prompt_eval_count`` field.

    Raises:
        HTTPException: 401 if a key is sent and does not match the configured
            key; 429 if the rate limiter rejects the key; 404 if the upstream
            reports the model as missing; the upstream status code for any
            other non-200 answer; 504 on upstream timeout; 503 if the
            upstream is unreachable; 500 for any other failure.
    """
    # NOTE(review): key validation and rate limiting only run when the header
    # is present, so a request without X-API-Key is neither authenticated nor
    # rate limited here. Confirm this is intended (e.g. enforced elsewhere).
    if xApiKey:
        if CONFIG["apiKey"] and xApiKey != CONFIG["apiKey"]:
            raise HTTPException(status_code=401, detail="Invalid API key")

        allowed, info = rateLimiter.isAllowed(xApiKey)
        if not allowed:
            raise HTTPException(
                status_code=429,
                detail=f"Rate limit exceeded. Retry after {info['retryAfter']} seconds.",
                headers={"Retry-After": str(int(info["retryAfter"]) + 1)},
            )

    internalModelName = _getInternalModelName(request.model)

    # Always send a list upstream so single and batched inputs share one path.
    texts = request.input if isinstance(request.input, list) else [request.input]

    # NOTE(review): request.dimensions is accepted by the request model but is
    # not forwarded upstream; vectors come back at the model's native size.
    ollamaPayload = {"model": internalModelName, "input": texts}

    try:
        async with httpx.AsyncClient(timeout=120.0) as client:
            response = await client.post(
                f"{CONFIG['ollamaUrl']}/api/embed",
                json=ollamaPayload,
            )

        if response.status_code == 404:
            raise HTTPException(
                status_code=404,
                detail=f'Model "{internalModelName}" not found. Install with: ollama pull {internalModelName}',
            )
        if response.status_code != 200:
            raise HTTPException(
                status_code=response.status_code,
                detail=f"Ollama API error: {response.status_code} - {response.text[:200]}",
            )

        responseData = response.json()
        rawEmbeddings = responseData.get("embeddings", [])

        # Treat a missing or null counter as zero so response validation
        # never fails on the usage block.
        totalTokens = responseData.get("prompt_eval_count") or 0

        embeddingData = [
            OpenAiEmbeddingData(embedding=vec, index=i)
            for i, vec in enumerate(rawEmbeddings)
        ]

        # The usage model declares aliases for both fields; alias keywords are
        # always accepted by pydantic, whereas field-name keywords are ignored
        # unless population by name is enabled (which would leave both at 0).
        usage = OpenAiEmbeddingUsage(
            prompt_tokens=totalTokens,
            total_tokens=totalTokens,
        )

        return OpenAiEmbeddingResponse(
            data=embeddingData,
            model=request.model,
            usage=usage,
        )

    except HTTPException:
        # Already the response we want to send; do not rewrap as a 500.
        raise
    except httpx.TimeoutException:
        raise HTTPException(status_code=504, detail="Upstream timeout (Ollama)")
    except httpx.ConnectError:
        raise HTTPException(status_code=503, detail="Cannot connect to Ollama upstream")
    except Exception as e:
        # logger.exception records the traceback alongside the message.
        logger.exception("Error in embedding endpoint: %s", e)
        raise HTTPException(status_code=500, detail=f"Embedding error: {str(e)}")
|
||||
|
|
|
|||
Loading…
Reference in a new issue