class OpenAiEmbeddingRequest(BaseModel):
    """Request body of the OpenAI-compatible embeddings endpoint."""

    # Public model name as sent by the client.
    model: str
    # Left untyped on purpose: the payload may be a single value or a list.
    input: Any
    # Optional requested vector size; absent means "no preference".
    dimensions: Optional[int] = Field(default=None)


class OpenAiEmbeddingData(BaseModel):
    """One embedding vector together with its position in the request."""

    object: str = Field(default="embedding")
    embedding: List[float]
    index: int


class OpenAiEmbeddingUsage(BaseModel):
    """Token accounting attached to an embeddings response.

    Both counters default to 0 and carry snake_case aliases
    (``prompt_tokens`` / ``total_tokens``).
    """

    # NOTE(review): no populate-by-name configuration is visible on this
    # model, so callers presumably have to pass the aliases when
    # constructing it - confirm against the Pydantic version in use.
    promptTokens: int = Field(0, alias="prompt_tokens")
    totalTokens: int = Field(0, alias="total_tokens")


class OpenAiEmbeddingResponse(BaseModel):
    """Top-level embeddings response: a list of vectors plus usage."""

    object: str = Field(default="list")
    data: List[OpenAiEmbeddingData]
    model: str
    usage: OpenAiEmbeddingUsage
@router.post("/v1/embeddings", response_model=OpenAiEmbeddingResponse)
async def _openAiEmbeddings(
    request: OpenAiEmbeddingRequest,
    xApiKey: Optional[str] = Header(None, alias="X-API-Key"),
):
    """OpenAI-compatible embeddings endpoint. Proxies to Ollama /api/embed.

    Args:
        request: Parsed request body; ``input`` may be a single value or a
            list and is always forwarded upstream as a list.
        xApiKey: Value of the ``X-API-Key`` header, if the client sent one.

    Returns:
        OpenAiEmbeddingResponse with one entry per vector returned by the
        upstream, indexed in upstream order, echoing the client's model name.

    Raises:
        HTTPException: 401 when a server API key is configured and the
            header is missing or different; 429 when the rate limiter
            rejects the key; 404 when the upstream reports 404; the upstream
            status code for any other non-200 reply; 504 on upstream
            timeout; 503 when the upstream cannot be reached; 500 for any
            other failure.
    """
    # Fail closed: when a server key is configured, a request WITHOUT the
    # header must be rejected too. Checking only "if a key was sent" lets
    # anyone bypass authentication simply by omitting the header.
    if CONFIG["apiKey"] and xApiKey != CONFIG["apiKey"]:
        raise HTTPException(status_code=401, detail="Invalid API key")

    # Rate limiting is keyed on the presented API key, so it only applies
    # when one was sent (possible without auth when no server key is set).
    if xApiKey:
        allowed, info = rateLimiter.isAllowed(xApiKey)
        if not allowed:
            raise HTTPException(
                status_code=429,
                detail=f"Rate limit exceeded. Retry after {info['retryAfter']} seconds.",
                headers={"Retry-After": str(int(info["retryAfter"]) + 1)},
            )

    internalModelName = _getInternalModelName(request.model)

    # Normalize a scalar input to a one-element list for the upstream call.
    texts = request.input if isinstance(request.input, list) else [request.input]

    # NOTE(review): request.dimensions is accepted by the schema but is not
    # forwarded upstream; vectors are returned at the model's native size.
    ollamaPayload = {"model": internalModelName, "input": texts}

    try:
        async with httpx.AsyncClient(timeout=120.0) as client:
            response = await client.post(
                f"{CONFIG['ollamaUrl']}/api/embed",
                json=ollamaPayload,
            )

        if response.status_code == 404:
            raise HTTPException(
                status_code=404,
                detail=f'Model "{internalModelName}" not found. Install with: ollama pull {internalModelName}',
            )
        if response.status_code != 200:
            raise HTTPException(
                status_code=response.status_code,
                detail=f"Ollama API error: {response.status_code} - {response.text[:200]}",
            )

        responseData = response.json()
        # "or" guards against an explicit null in the upstream JSON, which
        # .get() with a default would pass through unchanged.
        rawEmbeddings = responseData.get("embeddings") or []
        totalTokens = responseData.get("prompt_eval_count") or 0

        embeddingData = [
            OpenAiEmbeddingData(embedding=vec, index=i)
            for i, vec in enumerate(rawEmbeddings)
        ]

        return OpenAiEmbeddingResponse(
            data=embeddingData,
            model=request.model,
            # The usage fields are declared with aliases; they must be
            # populated through those aliases, otherwise the values are
            # ignored and the response always reports 0 tokens.
            usage=OpenAiEmbeddingUsage(
                prompt_tokens=totalTokens,
                total_tokens=totalTokens,
            ),
        )

    except httpx.TimeoutException as exc:
        raise HTTPException(status_code=504, detail="Upstream timeout (Ollama)") from exc
    except httpx.ConnectError as exc:
        raise HTTPException(status_code=503, detail="Cannot connect to Ollama upstream") from exc
    except HTTPException:
        # Deliberate HTTP errors raised above must not be turned into 500s.
        raise
    except Exception as e:
        # logger.exception keeps the traceback for diagnosing upstream issues.
        logger.exception("Error in embedding endpoint: %s", e)
        raise HTTPException(status_code=500, detail=f"Embedding error: {str(e)}") from e