feat: add /v1/embeddings endpoint for mxbai-embed-large 1024 dim
All checks were successful
Deploy LLM Service / deploy (push) Successful in 22s
All checks were successful
Deploy LLM Service / deploy (push) Successful in 22s
Co-authored-by: Cursor <cursoragent@cursor.com>
This commit is contained in:
parent
b15d283941
commit
33fe5d55bd
3 changed files with 106 additions and 3 deletions
27
config.py
27
config.py
|
|
@ -158,6 +158,7 @@ MODEL_MAPPING = {
|
||||||
"poweron-text-general": "qwen2.5:7b",
|
"poweron-text-general": "qwen2.5:7b",
|
||||||
"poweron-vision-general": "qwen2.5vl:7b",
|
"poweron-vision-general": "qwen2.5vl:7b",
|
||||||
"poweron-vision-deep": "granite3.2-vision",
|
"poweron-vision-deep": "granite3.2-vision",
|
||||||
|
"poweron-embed": "mxbai-embed-large",
|
||||||
}
|
}
|
||||||
|
|
||||||
# Next-gen models (RTX PRO 6000 96 GB VRAM — prepared, activate after migration)
|
# Next-gen models (RTX PRO 6000 96 GB VRAM — prepared, activate after migration)
|
||||||
|
|
@ -167,7 +168,7 @@ MODEL_MAPPING = {
|
||||||
# "poweron-text-reasoning": "deepseek-r1:70b",
|
# "poweron-text-reasoning": "deepseek-r1:70b",
|
||||||
# "poweron-vision-general": "llama4:scout",
|
# "poweron-vision-general": "llama4:scout",
|
||||||
# "poweron-vision-deep": "qwen2.5vl:72b",
|
# "poweron-vision-deep": "qwen2.5vl:72b",
|
||||||
# "poweron-embed": "nomic-embed-text",
|
# "poweron-embed": "mxbai-embed-large",
|
||||||
# "poweron-transcribe": "whisper-large-v3-turbo",
|
# "poweron-transcribe": "whisper-large-v3-turbo",
|
||||||
# }
|
# }
|
||||||
|
|
||||||
|
|
@ -264,6 +265,30 @@ class OpenAiChatCompletionResponse(BaseModel):
|
||||||
usage: OpenAiChatCompletionUsage
|
usage: OpenAiChatCompletionUsage
|
||||||
|
|
||||||
|
|
||||||
|
class OpenAiEmbeddingRequest(BaseModel):
    """Request body accepted by the OpenAI-compatible /v1/embeddings route."""

    # Model name as sent by the client (resolved to an internal name by the route).
    model: str
    # Content to embed. Declared as Any, so no shape validation happens at this
    # layer; the route wraps a non-list value into a one-element list.
    input: Any
    # Optional requested vector size.
    # NOTE(review): the visible route does not forward this value upstream.
    dimensions: Optional[int] = None
|
||||||
|
|
||||||
|
|
||||||
|
class OpenAiEmbeddingData(BaseModel):
    """One embedding vector inside an embeddings response."""

    # Constant discriminator for this entry type.
    object: str = "embedding"
    # The embedding vector itself.
    embedding: List[float]
    # Position of this vector in the response list.
    index: int
|
||||||
|
|
||||||
|
|
||||||
|
class OpenAiEmbeddingUsage(BaseModel):
    """Token accounting attached to an embeddings response."""

    # Both fields carry snake_case aliases ("prompt_tokens", "total_tokens").
    # NOTE(review): no populate-by-name config is visible on this model, so
    # pydantic is expected to accept these values only via the aliases when
    # constructing an instance -- confirm against the base model configuration.
    promptTokens: int = Field(default=0, alias="prompt_tokens")
    totalTokens: int = Field(default=0, alias="total_tokens")
|
||||||
|
|
||||||
|
|
||||||
|
class OpenAiEmbeddingResponse(BaseModel):
    """Response envelope returned by the /v1/embeddings route."""

    # Constant discriminator for the envelope.
    object: str = "list"
    # One entry per returned embedding vector.
    data: List[OpenAiEmbeddingData]
    # Model name echoed back to the client.
    model: str
    # Token usage for the request.
    usage: OpenAiEmbeddingUsage
|
||||||
|
|
||||||
|
|
||||||
# ============================================================================
|
# ============================================================================
|
||||||
# Helper Functions
|
# Helper Functions
|
||||||
# ============================================================================
|
# ============================================================================
|
||||||
|
|
|
||||||
|
|
@ -62,6 +62,7 @@ Connect: ssh -i "C:\Users\pmots\Downloads\ollama-deploy-key.pem" ubuntu@83.228.2
|
||||||
| `poweron-text-general` | `qwen2.5:7b` | Text-Neutralisierung |
|
| `poweron-text-general` | `qwen2.5:7b` | Text-Neutralisierung |
|
||||||
| `poweron-vision-general` | `qwen2.5vl:7b` | Handschrift, Dokumente |
|
| `poweron-vision-general` | `qwen2.5vl:7b` | Handschrift, Dokumente |
|
||||||
| `poweron-vision-deep` | `granite3.2-vision` | Rechnungen, Belege |
|
| `poweron-vision-deep` | `granite3.2-vision` | Rechnungen, Belege |
|
||||||
|
| `poweron-embed` | `mxbai-embed-large` | Embedding (1024 dim, RAG failover) |
|
||||||
|
|
||||||
### URLs
|
### URLs
|
||||||
|
|
||||||
|
|
@ -581,6 +582,9 @@ ollama pull qwen2.5vl:7b
|
||||||
|
|
||||||
# Vision: Rechnungen, Belege
|
# Vision: Rechnungen, Belege
|
||||||
ollama pull granite3.2-vision
|
ollama pull granite3.2-vision
|
||||||
|
|
||||||
|
# Embedding: RAG multi-provider failover (1024 dim)
|
||||||
|
ollama pull mxbai-embed-large
|
||||||
```
|
```
|
||||||
|
|
||||||
### Modelle pruefen
|
### Modelle pruefen
|
||||||
|
|
|
||||||
|
|
@ -1,13 +1,14 @@
|
||||||
# Copyright (c) 2026 PowerOn AG
|
# Copyright (c) 2026 PowerOn AG
|
||||||
# All rights reserved.
|
# All rights reserved.
|
||||||
"""OpenAI-compatible routes for Cursor integration (/v1/models, /v1/chat/completions)."""
|
"""OpenAI-compatible routes (/v1/models, /v1/chat/completions, /v1/embeddings)."""
|
||||||
|
|
||||||
import time
|
import time
|
||||||
import uuid
|
import uuid
|
||||||
import logging
|
import logging
|
||||||
|
from typing import List, Optional
|
||||||
|
|
||||||
import httpx
|
import httpx
|
||||||
from fastapi import APIRouter, HTTPException, Depends
|
from fastapi import APIRouter, HTTPException, Depends, Header
|
||||||
|
|
||||||
from config import (
|
from config import (
|
||||||
CONFIG, MODEL_MAPPING,
|
CONFIG, MODEL_MAPPING,
|
||||||
|
|
@ -17,6 +18,8 @@ from config import (
|
||||||
OpenAiChatCompletionRequest, OpenAiChatCompletionResponse,
|
OpenAiChatCompletionRequest, OpenAiChatCompletionResponse,
|
||||||
OpenAiChatCompletionChoice, OpenAiChatCompletionUsage,
|
OpenAiChatCompletionChoice, OpenAiChatCompletionUsage,
|
||||||
OpenAiChatMessage, OpenAiModelInfo, OpenAiModelsResponse,
|
OpenAiChatMessage, OpenAiModelInfo, OpenAiModelsResponse,
|
||||||
|
OpenAiEmbeddingRequest, OpenAiEmbeddingResponse,
|
||||||
|
OpenAiEmbeddingData, OpenAiEmbeddingUsage,
|
||||||
)
|
)
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
@ -143,3 +146,74 @@ async def _openAiChatCompletions(
|
||||||
raise HTTPException(status_code=504, detail="Upstream timeout (Ollama)")
|
raise HTTPException(status_code=504, detail="Upstream timeout (Ollama)")
|
||||||
except httpx.ConnectError:
|
except httpx.ConnectError:
|
||||||
raise HTTPException(status_code=503, detail="Cannot connect to Ollama upstream")
|
raise HTTPException(status_code=503, detail="Cannot connect to Ollama upstream")
|
||||||
|
|
||||||
|
|
||||||
|
@router.post("/v1/embeddings", response_model=OpenAiEmbeddingResponse)
async def _openAiEmbeddings(
    request: OpenAiEmbeddingRequest,
    xApiKey: Optional[str] = Header(None, alias="X-API-Key"),
):
    """OpenAI-compatible embeddings endpoint. Proxies to Ollama /api/embed.

    The client-facing model name is resolved through ``_getInternalModelName``
    and the input is normalised to a list before being posted upstream.
    ``request.dimensions`` is accepted but not forwarded to Ollama.

    Args:
        request: Parsed embeddings request body.
        xApiKey: Value of the ``X-API-Key`` header, if the client sent one.

    Returns:
        An ``OpenAiEmbeddingResponse`` with one entry per input, echoing the
        model name the client requested.

    Raises:
        HTTPException: 401 for a wrong API key, 429 when rate limited, 404
            when Ollama does not know the model, the upstream status code for
            other non-200 answers, 502 when the upstream answer does not
            contain one embedding per input, 503/504 when Ollama is
            unreachable or times out, and 500 for any other failure.
    """
    # NOTE(review): key validation and rate limiting only run when the client
    # sends an X-API-Key header; requests without the header pass through
    # unauthenticated. Confirm this is intended. The nesting of the rate-limit
    # check under this branch was reconstructed from a flattened diff view.
    if xApiKey:
        if CONFIG["apiKey"] and xApiKey != CONFIG["apiKey"]:
            raise HTTPException(status_code=401, detail="Invalid API key")
        allowed, info = rateLimiter.isAllowed(xApiKey)
        if not allowed:
            raise HTTPException(
                status_code=429,
                detail=f"Rate limit exceeded. Retry after {info['retryAfter']} seconds.",
                headers={"Retry-After": str(int(info["retryAfter"]) + 1)},
            )

    internalModelName = _getInternalModelName(request.model)

    # A single value is wrapped so the upstream call always receives a list.
    texts = request.input if isinstance(request.input, list) else [request.input]

    ollamaPayload = {"model": internalModelName, "input": texts}

    try:
        async with httpx.AsyncClient(timeout=120.0) as client:
            response = await client.post(
                f"{CONFIG['ollamaUrl']}/api/embed",
                json=ollamaPayload,
            )

        if response.status_code == 404:
            raise HTTPException(
                status_code=404,
                detail=f'Model "{internalModelName}" not found. Install with: ollama pull {internalModelName}',
            )
        if response.status_code != 200:
            raise HTTPException(
                status_code=response.status_code,
                detail=f"Ollama API error: {response.status_code} - {response.text[:200]}",
            )

        responseData = response.json()
        rawEmbeddings = responseData.get("embeddings") or []

        # A 200 answer without one vector per input would otherwise reach the
        # client as a silently truncated (or empty) result list.
        if len(rawEmbeddings) != len(texts):
            raise HTTPException(
                status_code=502,
                detail=(
                    f"Ollama returned {len(rawEmbeddings)} embeddings "
                    f"for {len(texts)} inputs"
                ),
            )

        # Treat a missing or null counter as zero so response validation
        # cannot fail on it.
        totalTokens = responseData.get("prompt_eval_count") or 0

        embeddingData = [
            OpenAiEmbeddingData(embedding=vec, index=i)
            for i, vec in enumerate(rawEmbeddings)
        ]

        # The usage fields declare aliases; populate them through the aliases
        # so the counts are not dropped when populate-by-name is disabled.
        usage = OpenAiEmbeddingUsage(
            prompt_tokens=totalTokens,
            total_tokens=totalTokens,
        )

        return OpenAiEmbeddingResponse(
            data=embeddingData,
            model=request.model,
            usage=usage,
        )

    except httpx.TimeoutException:
        raise HTTPException(status_code=504, detail="Upstream timeout (Ollama)")
    except httpx.ConnectError:
        raise HTTPException(status_code=503, detail="Cannot connect to Ollama upstream")
    except HTTPException:
        # Already a well-formed HTTP error; do not wrap it in a 500.
        raise
    except Exception as e:
        # logger.exception keeps the traceback for diagnosis.
        logger.exception("Error in embedding endpoint: %s", e)
        raise HTTPException(status_code=500, detail=f"Embedding error: {str(e)}") from e
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue