From 1f5d8e923b2a8facddddee6ae78308c249cad544 Mon Sep 17 00:00:00 2001
From: ValueOn AG
Date: Mon, 30 Mar 2026 14:49:35 +0200
Subject: [PATCH] Refactor: extract routes and config from app.py into separate
 modules
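Move all API routes, OpenAI-compatible routes, web UI routes, shared
config, models, rate limiter, and auth logic out of app.py into
dedicated modules (config.py, routeApi.py, routeOpenAi.py, routeWeb.py).
app.py now keeps only application setup (logging, lifespan, CORS,
static files, router registration) and the uvicorn entry point.

Behavior note: the PyMuPDF import guard in config.py no longer prints
the install hint when the package is missing; PDF support status is
still logged at startup.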
Made-with: Cursor
---
app.py | 859 +------------------------------------------------
config.py | 435 +++++++++++++++++++++++++
routeApi.py | 256 +++++++++++++++
routeOpenAi.py | 145 +++++++++
routeWeb.py | 34 ++
5 files changed, 886 insertions(+), 843 deletions(-)
create mode 100644 config.py
create mode 100644 routeApi.py
create mode 100644 routeOpenAi.py
create mode 100644 routeWeb.py
diff --git a/app.py b/app.py
index 4d26ed1..86013c1 100644
--- a/app.py
+++ b/app.py
@@ -10,36 +10,15 @@ Models exposed:
- poweron-vision-deep (granite3.2)
"""
-import os
-import sys
-import base64
-import json
-import re
import logging
-import time
-import uuid
-from collections import defaultdict
-from typing import Optional, List, Dict, Any
from contextlib import asynccontextmanager
-from fastapi import FastAPI, HTTPException, Depends, Header, Request
+from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware
-from fastapi.responses import HTMLResponse
from fastapi.staticfiles import StaticFiles
-from fastapi.templating import Jinja2Templates
-from pydantic import BaseModel, Field
-import httpx
-# PDF Support
-try:
- import fitz # PyMuPDF
- PDF_SUPPORT = True
-except ImportError:
- PDF_SUPPORT = False
- print("WARNUNG: PyMuPDF nicht installiert. PDF-Support deaktiviert.")
- print("Installieren mit: pip install pymupdf")
+from config import CONFIG
-# Configure logging
logging.basicConfig(
level=logging.INFO,
format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
@@ -47,444 +26,6 @@ logging.basicConfig(
)
logger = logging.getLogger(__name__)
-# ============================================================================
-# Configuration
-# ============================================================================
-
-def _loadConfig() -> Dict[str, Any]:
- """Load configuration from config.ini file."""
- configPath = os.path.join(os.path.dirname(__file__), "config.ini")
- config = {
- "apiKey": None,
- "cursorApiKey": None,
- "ollamaUrl": "http://localhost:11434",
- "authUsername": "poweron",
- "authPassword": "poweron",
- "secretKey": "poweron-secret-key-change-in-production",
- "rateLimitRequestsPerMinute": 60,
- "rateLimitBurstSize": 10,
- }
-
- if os.path.exists(configPath):
- try:
- with open(configPath, "r") as f:
- for line in f:
- line = line.strip()
- if not line or line.startswith("#"):
- continue
- if "=" in line:
- key, value = line.split("=", 1)
- key = key.strip()
- value = value.strip()
-
- # Map config keys
- if key == "PRIVATE_LLM_API_KEY":
- config["apiKey"] = value
- elif key == "CURSOR_API_KEY":
- config["cursorApiKey"] = value
- elif key == "OLLAMA_URL":
- config["ollamaUrl"] = value
- elif key == "AUTH_USERNAME":
- config["authUsername"] = value
- elif key == "AUTH_PASSWORD":
- config["authPassword"] = value
- elif key == "SECRET_KEY":
- config["secretKey"] = value
- elif key == "RATE_LIMIT_REQUESTS_PER_MINUTE":
- config["rateLimitRequestsPerMinute"] = int(value)
- elif key == "RATE_LIMIT_BURST_SIZE":
- config["rateLimitBurstSize"] = int(value)
- except Exception as e:
- logger.warning(f"Error loading config.ini: {e}")
-
- # Override with environment variables if set
- config["apiKey"] = os.environ.get("PRIVATE_LLM_API_KEY", config["apiKey"])
- config["cursorApiKey"] = os.environ.get("CURSOR_API_KEY", config["cursorApiKey"])
- config["ollamaUrl"] = os.environ.get("OLLAMA_URL", config["ollamaUrl"])
- config["authUsername"] = os.environ.get("AUTH_USERNAME", config["authUsername"])
- config["authPassword"] = os.environ.get("AUTH_PASSWORD", config["authPassword"])
- config["secretKey"] = os.environ.get("SECRET_KEY", config["secretKey"])
- config["rateLimitRequestsPerMinute"] = int(os.environ.get("RATE_LIMIT_REQUESTS_PER_MINUTE", config["rateLimitRequestsPerMinute"]))
- config["rateLimitBurstSize"] = int(os.environ.get("RATE_LIMIT_BURST_SIZE", config["rateLimitBurstSize"]))
-
- return config
-
-CONFIG = _loadConfig()
-
-
-# ============================================================================
-# Rate Limiting (Token Bucket Algorithm)
-# ============================================================================
-
-class RateLimiter:
- """
- Token bucket rate limiter with per-API-key tracking.
-
- Each API key gets its own bucket. Tokens are added at a constant rate
- (requestsPerMinute / 60 per second) up to a maximum burst size.
- """
-
- def __init__(self, requestsPerMinute: int = 60, burstSize: int = 10):
- self.requestsPerMinute = requestsPerMinute
- self.burstSize = burstSize
- self.tokensPerSecond = requestsPerMinute / 60.0
-
- # Track tokens and last update time per API key
- # Format: {apiKey: {"tokens": float, "lastUpdate": float}}
- self._buckets: Dict[str, Dict[str, float]] = defaultdict(
- lambda: {"tokens": burstSize, "lastUpdate": time.time()}
- )
-
- def _refillTokens(self, bucket: Dict[str, float]) -> None:
- """Refill tokens based on elapsed time."""
- now = time.time()
- elapsed = now - bucket["lastUpdate"]
- bucket["tokens"] = min(
- self.burstSize,
- bucket["tokens"] + elapsed * self.tokensPerSecond
- )
- bucket["lastUpdate"] = now
-
- def isAllowed(self, apiKey: str) -> tuple[bool, Dict[str, Any]]:
- """
- Check if a request is allowed and consume a token if so.
-
- Returns:
- Tuple of (allowed: bool, info: dict with remaining tokens and retry_after)
- """
- bucket = self._buckets[apiKey]
- self._refillTokens(bucket)
-
- if bucket["tokens"] >= 1.0:
- bucket["tokens"] -= 1.0
- return True, {
- "remaining": int(bucket["tokens"]),
- "limit": self.requestsPerMinute,
- "resetSeconds": 60
- }
- else:
- # Calculate when the next token will be available
- retryAfter = (1.0 - bucket["tokens"]) / self.tokensPerSecond
- return False, {
- "remaining": 0,
- "limit": self.requestsPerMinute,
- "retryAfter": round(retryAfter, 1),
- "resetSeconds": 60
- }
-
- def cleanup(self, maxAgeSeconds: int = 3600) -> int:
- """Remove stale buckets to prevent memory growth."""
- now = time.time()
- staleKeys = [
- key for key, bucket in self._buckets.items()
- if now - bucket["lastUpdate"] > maxAgeSeconds
- ]
- for key in staleKeys:
- del self._buckets[key]
- return len(staleKeys)
-
-
-# Global rate limiter instance
-rateLimiter = RateLimiter(
- requestsPerMinute=CONFIG["rateLimitRequestsPerMinute"],
- burstSize=CONFIG["rateLimitBurstSize"]
-)
-
-# Model mapping: external name -> internal Ollama model name
-# Production models (optimized for 32GB RAM server):
-# - qwen2.5:7b: 7.6B params, 128K context, ~4.7GB RAM (Text)
-# - qwen2.5vl:7b: 8.29B params, 125K context, ~6GB RAM (Vision)
-# - granite3.2-vision: 2B params, 16K context, ~2.4GB RAM (Vision)
-MODEL_MAPPING = {
- "poweron-text-general": "qwen2.5:7b",
- "poweron-vision-general": "qwen2.5vl:7b",
- "poweron-vision-deep": "granite3.2-vision",
-}
-
-# Reverse mapping for lookups
-INTERNAL_TO_EXTERNAL = {v: k for k, v in MODEL_MAPPING.items()}
-
-# ============================================================================
-# Request/Response Models
-# ============================================================================
-
-class AnalyzeRequest(BaseModel):
- """Request model for document analysis."""
- imageBase64: Optional[str] = Field(default=None, description="Base64 encoded image")
- prompt: str = Field(description="Analysis prompt")
- modelName: str = Field(default="poweron-vision-general", description="Model to use")
-
-class AnalyzeResponse(BaseModel):
- """Response model for document analysis."""
- success: bool = Field(description="Whether the analysis was successful")
- data: Optional[Dict[str, Any]] = Field(default=None, description="Extracted data")
- rawResponse: Optional[str] = Field(default=None, description="Raw model response")
- error: Optional[str] = Field(default=None, description="Error message if failed")
-
-class PdfExtractRequest(BaseModel):
- """Request model for PDF extraction."""
- pdfBase64: str = Field(description="Base64 encoded PDF")
- page: Optional[int] = Field(default=None, description="Specific page number (1-indexed)")
-
-class ModelInfo(BaseModel):
- """Model information."""
- name: str = Field(description="External model name")
- internalName: str = Field(description="Internal Ollama model name")
- isVision: bool = Field(description="Whether it's a vision model")
- pricePerCall: float = Field(description="Price per call in CHF")
-
-class HealthResponse(BaseModel):
- """Health check response."""
- status: str
- service: str
- pdfSupport: bool
- ollamaConnected: bool
-
-class OllamaStatusResponse(BaseModel):
- """Ollama status response."""
- connected: bool
- models: Optional[List[str]] = None
- visionModels: Optional[List[str]] = None
- totalModels: Optional[int] = None
- error: Optional[str] = None
-
-
-class OpenAiModelInfo(BaseModel):
- """OpenAI-compatible model object."""
- id: str
- object: str = "model"
- created: int
- ownedBy: str = Field(default="poweron", alias="owned_by")
-
-
-class OpenAiModelsResponse(BaseModel):
- """OpenAI-compatible models list response."""
- object: str = "list"
- data: List[OpenAiModelInfo]
-
-
-class OpenAiChatMessage(BaseModel):
- """OpenAI-compatible chat message."""
- role: str
- content: Any
-
-
-class OpenAiChatCompletionRequest(BaseModel):
- """OpenAI-compatible chat completion request."""
- model: str
- messages: List[OpenAiChatMessage]
- stream: Optional[bool] = False
- maxTokens: Optional[int] = Field(default=None, alias="max_tokens")
- temperature: Optional[float] = None
-
-
-class OpenAiChatCompletionChoice(BaseModel):
- """OpenAI-compatible completion choice."""
- index: int
- message: OpenAiChatMessage
- finishReason: str = Field(default="stop", alias="finish_reason")
-
-
-class OpenAiChatCompletionUsage(BaseModel):
- """OpenAI-compatible token usage."""
- promptTokens: int = Field(default=0, alias="prompt_tokens")
- completionTokens: int = Field(default=0, alias="completion_tokens")
- totalTokens: int = Field(default=0, alias="total_tokens")
-
-
-class OpenAiChatCompletionResponse(BaseModel):
- """OpenAI-compatible chat completion response."""
- id: str
- object: str = "chat.completion"
- created: int
- model: str
- choices: List[OpenAiChatCompletionChoice]
- usage: OpenAiChatCompletionUsage
-
-# ============================================================================
-# PDF Helper Functions
-# ============================================================================
-
-def _extractImagesFromPdf(pdfBytes: bytes, maxPages: int = 5) -> List[Dict[str, Any]]:
- """Extract images from a PDF."""
- if not PDF_SUPPORT:
- raise Exception("PDF-Support nicht verfügbar. Bitte PyMuPDF installieren.")
-
- images = []
- doc = fitz.open(stream=pdfBytes, filetype="pdf")
- numPages = min(len(doc), maxPages)
-
- for pageNum in range(numPages):
- page = doc[pageNum]
- mat = fitz.Matrix(2.0, 2.0) # 2x Zoom for better quality
- pix = page.get_pixmap(matrix=mat)
- imgBytes = pix.tobytes("png")
- imgBase64 = base64.b64encode(imgBytes).decode("utf-8")
-
- images.append({
- "page": pageNum + 1,
- "base64": imgBase64,
- "width": pix.width,
- "height": pix.height
- })
-
- doc.close()
- return images
-
-def _renderPdfPageAsImage(pdfBytes: bytes, pageNum: int = 0, zoom: float = 2.0) -> Dict[str, Any]:
- """Render a single PDF page as an image."""
- if not PDF_SUPPORT:
- raise Exception("PDF-Support nicht verfügbar.")
-
- doc = fitz.open(stream=pdfBytes, filetype="pdf")
-
- if pageNum >= len(doc):
- pageNum = len(doc) - 1
-
- page = doc[pageNum]
- mat = fitz.Matrix(zoom, zoom)
- pix = page.get_pixmap(matrix=mat)
- imgBytes = pix.tobytes("png")
- imgBase64 = base64.b64encode(imgBytes).decode("utf-8")
-
- result = {
- "base64": imgBase64,
- "width": pix.width,
- "height": pix.height,
- "page": pageNum + 1,
- "totalPages": len(doc)
- }
-
- doc.close()
- return result
-
-# ============================================================================
-# Model Helper Functions
-# ============================================================================
-
-def _isVisionModel(modelName: str) -> bool:
- """Check if a model is a vision model based on naming conventions."""
- if not modelName:
- return False
-
- modelLower = modelName.lower()
- visionIndicators = ["vision", "vl", "llava", "bakllava", "granite"]
-
- return any(indicator in modelLower for indicator in visionIndicators)
-
-def _getInternalModelName(externalName: str) -> str:
- """Get the internal Ollama model name from external name."""
- return MODEL_MAPPING.get(externalName, externalName)
-
-def _getExternalModelName(internalName: str) -> str:
- """Get the external model name from internal Ollama name."""
- return INTERNAL_TO_EXTERNAL.get(internalName, internalName)
-
-
-def _contentToText(content: Any) -> str:
- """Normalize OpenAI message content into plain text."""
- if content is None:
- return ""
- if isinstance(content, str):
- return content
- if isinstance(content, list):
- textParts = []
- for part in content:
- if isinstance(part, str):
- textParts.append(part)
- continue
- if isinstance(part, dict):
- partText = part.get("text")
- if isinstance(partText, str):
- textParts.append(partText)
- return "\n".join([part for part in textParts if part.strip()])
- if isinstance(content, dict):
- contentText = content.get("text")
- if isinstance(contentText, str):
- return contentText
- return str(content)
-
-
-def _messagesToPrompt(messages: List[OpenAiChatMessage]) -> str:
- """Convert OpenAI chat messages to a single prompt for Ollama generate."""
- promptLines = []
- for message in messages:
- normalizedText = _contentToText(message.content).strip()
- if not normalizedText:
- continue
- promptLines.append(f"{message.role}: {normalizedText}")
-
- if not promptLines:
- return ""
-
- promptLines.append("assistant:")
- return "\n\n".join(promptLines)
-
-# ============================================================================
-# Authentication & Rate Limiting
-# ============================================================================
-
-async def _verifyApiKey(xApiKey: Optional[str] = Header(None, alias="X-API-Key")) -> str:
- """Verify the API key from header and return it for rate limiting."""
- if not CONFIG["apiKey"]:
- # No API key configured, allow all requests (development mode)
- logger.warning("No API key configured - running in development mode")
- return "dev-mode"
-
- if not xApiKey:
- raise HTTPException(status_code=401, detail="API key required")
-
- if xApiKey != CONFIG["apiKey"]:
- raise HTTPException(status_code=401, detail="Invalid API key")
-
- return xApiKey
-
-
-async def _verifyCursorApiKey(authorization: Optional[str] = Header(None)) -> str:
- """Verify Bearer token for Cursor OpenAI-compatible endpoints."""
- expectedApiKey = CONFIG.get("cursorApiKey")
- if not expectedApiKey:
- raise HTTPException(
- status_code=503,
- detail="Cursor API key not configured on server"
- )
-
- if not authorization:
- raise HTTPException(status_code=401, detail="Authorization header required")
-
- if not authorization.startswith("Bearer "):
- raise HTTPException(status_code=401, detail="Bearer token required")
-
- providedApiKey = authorization[len("Bearer "):].strip()
- if providedApiKey != expectedApiKey:
- raise HTTPException(status_code=401, detail="Invalid API key")
-
- return providedApiKey
-
-
-async def _checkRateLimit(apiKey: str = Depends(_verifyApiKey)) -> str:
- """Check rate limit for the authenticated API key."""
- allowed, info = rateLimiter.isAllowed(apiKey)
-
- if not allowed:
- raise HTTPException(
- status_code=429,
- detail={
- "error": "Rate limit exceeded",
- "message": f"Too many requests. Please retry after {info['retryAfter']} seconds.",
- "retryAfter": info["retryAfter"],
- "limit": info["limit"],
- "remaining": info["remaining"]
- },
- headers={
- "Retry-After": str(int(info["retryAfter"])),
- "X-RateLimit-Limit": str(info["limit"]),
- "X-RateLimit-Remaining": str(info["remaining"]),
- "X-RateLimit-Reset": str(info["resetSeconds"])
- }
- )
-
- return apiKey
# ============================================================================
# Application Lifecycle
@@ -496,10 +37,12 @@ async def lifespan(app: FastAPI):
logger.info("Private-LLM Service starting up...")
logger.info(f"Ollama URL: {CONFIG['ollamaUrl']}")
logger.info(f"API Key configured: {'Yes' if CONFIG['apiKey'] else 'No (development mode)'}")
+ from config import PDF_SUPPORT
logger.info(f"PDF Support: {'Enabled' if PDF_SUPPORT else 'Disabled'}")
yield
logger.info("Private-LLM Service shutting down...")
+
# ============================================================================
# FastAPI Application
# ============================================================================
@@ -511,7 +54,7 @@ app = FastAPI(
lifespan=lifespan,
)
-# CORS Configuration - Allow gateway instances
+# CORS Configuration
ALLOWED_ORIGINS = [
"http://localhost:8000",
"http://localhost:8080",
@@ -521,13 +64,11 @@ ALLOWED_ORIGINS = [
"http://127.0.0.1:5000",
]
-# Add production origins
PRODUCTION_PATTERNS = [
"poweron.swiss",
"poweron-center.net",
]
-# Build full origins list with https variants
for pattern in PRODUCTION_PATTERNS:
ALLOWED_ORIGINS.extend([
f"https://{pattern}",
@@ -539,7 +80,6 @@ for pattern in PRODUCTION_PATTERNS:
f"https://playground.{pattern}",
])
-# Allow all subdomains via regex in middleware
app.add_middleware(
CORSMiddleware,
allow_origins=ALLOWED_ORIGINS,
@@ -551,390 +91,23 @@ app.add_middleware(
max_age=86400,
)
-# Static files and templates (for web UI)
+# Static files (for web UI)
app.mount("/static", StaticFiles(directory="static"), name="static")
-templates = Jinja2Templates(directory="templates")
+
# ============================================================================
-# API Routes
+# Route Registration
# ============================================================================
-@app.get("/api/health", response_model=HealthResponse, tags=["System"])
-async def _healthCheck():
- """Health check endpoint."""
- ollamaConnected = False
- try:
- async with httpx.AsyncClient(timeout=5.0) as client:
- response = await client.get(f"{CONFIG['ollamaUrl']}/api/tags")
- ollamaConnected = response.status_code == 200
- except Exception:
- pass
-
- return HealthResponse(
- status="ok",
- service="private-llm",
- pdfSupport=PDF_SUPPORT,
- ollamaConnected=ollamaConnected
- )
+from routeApi import router as apiRouter
+app.include_router(apiRouter)
-@app.get("/api/models", response_model=List[ModelInfo], tags=["Models"])
-async def _listModels(authenticated: bool = Depends(_verifyApiKey)):
- """List available models with pricing."""
- models = []
- for externalName, internalName in MODEL_MAPPING.items():
- isVision = _isVisionModel(internalName)
- pricePerCall = 0.10 if isVision else 0.01 # CHF pricing
-
- models.append(ModelInfo(
- name=externalName,
- internalName=internalName,
- isVision=isVision,
- pricePerCall=pricePerCall
- ))
-
- return models
+from routeOpenAi import router as openAiRouter
+app.include_router(openAiRouter)
+from routeWeb import router as webRouter
+app.include_router(webRouter)
-@app.get("/v1/models", response_model=OpenAiModelsResponse, tags=["OpenAI Compatible"])
-async def _listOpenAiModels(cursorApiKey: str = Depends(_verifyCursorApiKey)):
- """OpenAI-compatible models endpoint for Cursor."""
- createdAt = int(time.time())
- modelData = []
- for externalName in MODEL_MAPPING.keys():
- modelData.append(
- OpenAiModelInfo(
- id=externalName,
- created=createdAt
- )
- )
- return OpenAiModelsResponse(data=modelData)
-
-
-@app.post(
- "/v1/chat/completions",
- response_model=OpenAiChatCompletionResponse,
- tags=["OpenAI Compatible"]
-)
-async def _openAiChatCompletions(
- request: OpenAiChatCompletionRequest,
- cursorApiKey: str = Depends(_verifyCursorApiKey)
-):
- """OpenAI-compatible chat completions endpoint for Cursor."""
- if request.stream:
- raise HTTPException(
- status_code=400,
- detail="Streaming is not supported by this endpoint"
- )
-
- allowed, info = rateLimiter.isAllowed(f"cursor:{cursorApiKey}")
- if not allowed:
- raise HTTPException(
- status_code=429,
- detail={
- "error": "Rate limit exceeded",
- "message": f"Too many requests. Please retry after {info['retryAfter']} seconds.",
- "retryAfter": info["retryAfter"],
- "limit": info["limit"],
- "remaining": info["remaining"]
- },
- headers={
- "Retry-After": str(int(info["retryAfter"])),
- "X-RateLimit-Limit": str(info["limit"]),
- "X-RateLimit-Remaining": str(info["remaining"]),
- "X-RateLimit-Reset": str(info["resetSeconds"])
- }
- )
-
- promptText = _messagesToPrompt(request.messages).strip()
- if not promptText:
- raise HTTPException(status_code=400, detail="messages must contain text content")
-
- internalModelName = _getInternalModelName(request.model)
- if _isVisionModel(internalModelName):
- raise HTTPException(
- status_code=400,
- detail="Vision models are not supported on /v1/chat/completions"
- )
-
- requestOptions = {
- "num_ctx": 8192
- }
- if request.temperature is not None:
- requestOptions["temperature"] = request.temperature
- if request.maxTokens is not None:
- requestOptions["num_predict"] = request.maxTokens
-
- requestBody = {
- "model": internalModelName,
- "prompt": promptText,
- "stream": False,
- "options": requestOptions
- }
-
- try:
- async with httpx.AsyncClient(timeout=3600.0) as client:
- response = await client.post(
- f"{CONFIG['ollamaUrl']}/api/generate",
- json=requestBody
- )
-
- if response.status_code == 404:
- raise HTTPException(
- status_code=404,
- detail=f'Model "{request.model}" not found'
- )
- if response.status_code != 200:
- raise HTTPException(
- status_code=response.status_code,
- detail=f"Ollama API error: {response.status_code} - {response.text[:200]}"
- )
-
- responseData = response.json()
- responseText = responseData.get("response", "").strip()
- promptEvalCount = int(responseData.get("prompt_eval_count", 0))
- evalCount = int(responseData.get("eval_count", 0))
-
- return OpenAiChatCompletionResponse(
- id=f"chatcmpl-{uuid.uuid4().hex}",
- created=int(time.time()),
- model=request.model,
- choices=[
- OpenAiChatCompletionChoice(
- index=0,
- message=OpenAiChatMessage(role="assistant", content=responseText)
- )
- ],
- usage=OpenAiChatCompletionUsage(
- promptTokens=promptEvalCount,
- completionTokens=evalCount,
- totalTokens=promptEvalCount + evalCount
- )
- )
-
- except httpx.TimeoutException:
- raise HTTPException(status_code=504, detail="Upstream timeout (Ollama)")
- except httpx.ConnectError:
- raise HTTPException(status_code=503, detail="Cannot connect to Ollama upstream")
-
-@app.get("/api/ollama/status", response_model=OllamaStatusResponse, tags=["System"])
-async def _ollamaStatus():
- """Check Ollama connection status and list available models."""
- try:
- async with httpx.AsyncClient(timeout=10.0) as client:
- response = await client.get(f"{CONFIG['ollamaUrl']}/api/tags")
-
- if response.status_code != 200:
- return OllamaStatusResponse(
- connected=False,
- error=f"Ollama responded with status {response.status_code}"
- )
-
- data = response.json()
- models = [m.get("name", "") for m in data.get("models", [])]
- visionModels = [m for m in models if _isVisionModel(m)]
-
- return OllamaStatusResponse(
- connected=True,
- models=models,
- visionModels=visionModels,
- totalModels=len(models)
- )
-
- except httpx.ConnectError:
- return OllamaStatusResponse(
- connected=False,
- error="Keine Verbindung zu Ollama. Ist Ollama gestartet?"
- )
- except Exception as e:
- return OllamaStatusResponse(
- connected=False,
- error=str(e)
- )
-
-@app.post("/api/analyze", response_model=AnalyzeResponse, tags=["AI"])
-async def _analyzeDocument(
- request: AnalyzeRequest,
- xApiKey: Optional[str] = Header(None, alias="X-API-Key")
-):
- """
- Analyze a document with AI Vision API.
-
- Supports both vision models (with images) and text models (without images).
-
- Authentication:
- - Gateway calls: Must include X-API-Key header
- - Test UI calls: No auth required (same-origin)
-
- Rate limiting is applied when API key is provided.
- """
- # Apply rate limiting only for authenticated requests (Gateway)
- if xApiKey:
- if CONFIG["apiKey"] and xApiKey != CONFIG["apiKey"]:
- raise HTTPException(status_code=401, detail="Invalid API key")
- # Check rate limit for authenticated requests
- allowed, info = rateLimiter.isAllowed(xApiKey)
- if not allowed:
- raise HTTPException(
- status_code=429,
- detail=f"Rate limit exceeded. Retry after {info['retryAfter']} seconds.",
- headers={"Retry-After": str(int(info["retryAfter"]) + 1)},
- )
- try:
- # Get internal model name
- internalModelName = _getInternalModelName(request.modelName)
- isVision = _isVisionModel(internalModelName)
-
- # Validate request
- if isVision and not request.imageBase64:
- raise HTTPException(
- status_code=400,
- detail="Kein Bild übermittelt (erforderlich für Vision-Modelle)"
- )
-
- if not request.prompt:
- raise HTTPException(status_code=400, detail="Kein Prompt übermittelt")
-
- # Model-specific context lengths (reduced for RAM constraints)
- # Server has 31GB RAM + 22GB GPU - vision models need smaller context
- modelContextLengths = {
- "qwen2.5:7b": 8192, # Text model - 8K context
- "qwen2.5vl:7b": 4096, # Vision model - 4K context (images use lots of RAM)
- "granite3.2-vision": 4096, # Vision model - 4K context
- "granite3.2-vision:latest": 4096,
- "deepseek-ocr": 4096, # OCR model - 4K context
- "deepseek-ocr:latest": 4096,
- }
- numCtx = modelContextLengths.get(internalModelName, 4096)
-
- # Build request body with model-specific context window
- requestBody = {
- "model": internalModelName,
- "prompt": request.prompt,
- "stream": False,
- "options": {
- "num_ctx": numCtx
- }
- }
-
- if request.imageBase64:
- requestBody["images"] = [request.imageBase64]
-
- # Call Ollama API
- async with httpx.AsyncClient(timeout=3600.0) as client: # 60 min timeout
- response = await client.post(
- f"{CONFIG['ollamaUrl']}/api/generate",
- json=requestBody
- )
-
- if response.status_code == 404:
- raise HTTPException(
- status_code=404,
- detail=f'Modell "{internalModelName}" nicht gefunden. Bitte installieren mit: ollama pull {internalModelName}'
- )
-
- if response.status_code != 200:
- raise HTTPException(
- status_code=response.status_code,
- detail=f"Ollama API Fehler: {response.status_code} - {response.text[:200]}"
- )
-
- responseData = response.json()
- responseText = responseData.get("response", "")
-
- # Try to extract JSON from response
- extractedData = None
- jsonMatch = re.search(r"\{[\s\S]*\}", responseText)
-
- if jsonMatch:
- try:
- extractedData = json.loads(jsonMatch.group())
- except json.JSONDecodeError:
- extractedData = None
-
- # Wrap plain text response in JSON object
- if extractedData is None:
- extractedData = {"response": responseText.strip()}
-
- return AnalyzeResponse(
- success=True,
- data=extractedData,
- rawResponse=responseText
- )
-
- except httpx.TimeoutException:
- return AnalyzeResponse(
- success=False,
- error="Zeitüberschreitung bei der Ollama API"
- )
- except httpx.ConnectError:
- return AnalyzeResponse(
- success=False,
- error="Verbindung zu Ollama fehlgeschlagen. Ist Ollama gestartet?"
- )
- except HTTPException:
- raise
- except Exception as e:
- logger.error(f"Error analyzing document: {e}")
- return AnalyzeResponse(
- success=False,
- error=f"Unerwarteter Fehler: {str(e)}"
- )
-
-@app.post("/api/pdf/extract", tags=["PDF"])
-async def _extractPdfImages(request: PdfExtractRequest):
- """
- Extract images from a PDF.
-
- No API key required - this endpoint is for local test UI only,
- not used by gateway (gateway sends images directly).
- """
- if not PDF_SUPPORT:
- raise HTTPException(
- status_code=501,
- detail="PDF-Support nicht verfügbar. Bitte PyMuPDF installieren: pip install pymupdf"
- )
-
- try:
- pdfBytes = base64.b64decode(request.pdfBase64)
-
- if request.page is not None:
- # Extract single page
- result = _renderPdfPageAsImage(pdfBytes, request.page - 1)
- return {"success": True, "image": result}
- else:
- # Extract all pages (max 5)
- images = _extractImagesFromPdf(pdfBytes, maxPages=5)
- return {
- "success": True,
- "images": images,
- "totalExtracted": len(images)
- }
-
- except Exception as e:
- raise HTTPException(
- status_code=500,
- detail=f"PDF-Verarbeitungsfehler: {str(e)}"
- )
-
-# ============================================================================
-# Web UI Routes (Optional - for direct browser access)
-# ============================================================================
-
-@app.get("/", response_class=HTMLResponse, tags=["Web UI"])
-async def _index(request: Request):
- """Main page with document scanner UI."""
- return templates.TemplateResponse("index.html", {"request": request})
-
-@app.get("/login", response_class=HTMLResponse, tags=["Web UI"])
-async def _loginPage(request: Request):
- """Login page."""
- return templates.TemplateResponse("login.html", {"request": request})
-
-@app.get("/logout", response_class=HTMLResponse, tags=["Web UI"])
-async def _logout(request: Request):
- """Logout - redirect to login page."""
- from starlette.responses import RedirectResponse
- return RedirectResponse(url="/login", status_code=302)
# ============================================================================
# Main
@@ -942,7 +115,7 @@ async def _logout(request: Request):
if __name__ == "__main__":
import uvicorn
-
+
print("\n" + "=" * 60)
print(" Private-LLM Service - KI-Dokumentenanalyse")
print(" Powered by PowerOn")
@@ -952,5 +125,5 @@ if __name__ == "__main__":
print(f" Ollama URL: {CONFIG['ollamaUrl']}")
print("\n Drücke Ctrl+C zum Beenden")
print("=" * 60 + "\n")
-
+
uvicorn.run(app, host="0.0.0.0", port=5000)
diff --git a/config.py b/config.py
new file mode 100644
index 0000000..749f612
--- /dev/null
+++ b/config.py
@@ -0,0 +1,435 @@
+# Copyright (c) 2025 Patrick Motsch
+# All rights reserved.
+"""Shared configuration, models, helpers, and auth for the Private-LLM service."""
+
+import os
+import base64
+import json
+import re
+import logging
+import time
+import uuid
+from collections import defaultdict
+from typing import Optional, List, Dict, Any
+
+from fastapi import HTTPException, Header, Depends
+from pydantic import BaseModel, Field
+
+# PDF Support
+try:
+ import fitz # PyMuPDF
+ PDF_SUPPORT = True
+except ImportError:
+ PDF_SUPPORT = False
+
+logger = logging.getLogger(__name__)
+
+
+# ============================================================================
+# Configuration
+# ============================================================================
+
+def _loadConfig() -> Dict[str, Any]:
+    """Load settings from config.ini next to this module, then apply environment overrides.
+
+    Lines have the form KEY=value; blank lines and lines starting with "#" are
+    skipped. An environment variable with the same name as a file key wins over
+    the file. A malformed integer in the file is logged and skipped, so the
+    remaining keys are still read.
+    """
+    # File/environment key -> (config dict key, converter).
+    settingKeys = {
+        "PRIVATE_LLM_API_KEY": ("apiKey", str),
+        "CURSOR_API_KEY": ("cursorApiKey", str),
+        "OLLAMA_URL": ("ollamaUrl", str),
+        "AUTH_USERNAME": ("authUsername", str),
+        "AUTH_PASSWORD": ("authPassword", str),
+        "SECRET_KEY": ("secretKey", str),
+        "RATE_LIMIT_REQUESTS_PER_MINUTE": ("rateLimitRequestsPerMinute", int),
+        "RATE_LIMIT_BURST_SIZE": ("rateLimitBurstSize", int),
+    }
+    config = {
+        "apiKey": None,
+        "cursorApiKey": None,
+        "ollamaUrl": "http://localhost:11434",
+        "authUsername": "poweron",
+        "authPassword": "poweron",
+        "secretKey": "poweron-secret-key-change-in-production",
+        "rateLimitRequestsPerMinute": 60,
+        "rateLimitBurstSize": 10,
+    }
+
+    configPath = os.path.join(os.path.dirname(__file__), "config.ini")
+    if os.path.exists(configPath):
+        try:
+            with open(configPath, "r") as f:
+                for line in f:
+                    key, separator, value = line.strip().partition("=")
+                    key = key.strip()
+                    if key.startswith("#") or not separator or key not in settingKeys:
+                        continue
+                    configKey, convert = settingKeys[key]
+                    # A bad number must not abort the rest of the file.
+                    try:
+                        config[configKey] = convert(value.strip())
+                    except ValueError:
+                        logger.warning(f"Ignoring invalid value for {key} in config.ini")
+        except Exception as e:
+            logger.warning(f"Error loading config.ini: {e}")
+
+    # Environment variables take precedence over config.ini.
+    for envName, (configKey, convert) in settingKeys.items():
+        envValue = os.environ.get(envName)
+        if envValue is not None:
+            config[configKey] = convert(envValue)
+
+    return config
+
+
+CONFIG = _loadConfig()  # built once at import time; the route modules import this dict
+
+
+# ============================================================================
+# Rate Limiting (Token Bucket Algorithm)
+# ============================================================================
+
+class RateLimiter:
+    """Token bucket rate limiter with per-API-key tracking.
+
+    Each key owns a bucket holding up to burstSize tokens that refills at
+    requestsPerMinute / 60 tokens per second; every allowed request costs one
+    token. Timestamps come from time.monotonic() so that wall-clock jumps
+    (NTP, manual changes) can neither drain nor refill a bucket.
+    """
+
+    def __init__(self, requestsPerMinute: int = 60, burstSize: int = 10):
+        self.requestsPerMinute = requestsPerMinute
+        self.burstSize = burstSize
+        self.tokensPerSecond = requestsPerMinute / 60.0
+        self._buckets: Dict[str, Dict[str, float]] = defaultdict(
+            lambda: {"tokens": burstSize, "lastUpdate": time.monotonic()}
+        )
+
+    def _refillTokens(self, bucket: Dict[str, float]) -> None:
+        """Credit the tokens earned since the last update, capped at burstSize."""
+        now = time.monotonic()
+        earned = (now - bucket["lastUpdate"]) * self.tokensPerSecond
+        bucket["tokens"] = min(self.burstSize, bucket["tokens"] + earned)
+        bucket["lastUpdate"] = now
+
+    def isAllowed(self, apiKey: str) -> tuple[bool, Dict[str, Any]]:
+        """Consume one token for apiKey; return (allowed, rate-limit info).
+
+        The info dict always has remaining, limit and resetSeconds; a rejected
+        request additionally gets retryAfter (seconds until one token exists).
+        """
+        bucket = self._buckets[apiKey]
+        self._refillTokens(bucket)
+        info = {"remaining": 0, "limit": self.requestsPerMinute, "resetSeconds": 60}
+
+        if bucket["tokens"] >= 1.0:
+            bucket["tokens"] -= 1.0
+            info["remaining"] = int(bucket["tokens"])
+            return True, info
+
+        info["retryAfter"] = round((1.0 - bucket["tokens"]) / self.tokensPerSecond, 1)
+        return False, info
+
+    def cleanup(self, maxAgeSeconds: int = 3600) -> int:
+        """Drop buckets idle for longer than maxAgeSeconds; return how many were removed."""
+        now = time.monotonic()
+        staleKeys = [k for k, b in self._buckets.items() if now - b["lastUpdate"] > maxAgeSeconds]
+        for key in staleKeys:
+            del self._buckets[key]
+        return len(staleKeys)
+
+
+rateLimiter = RateLimiter(  # module-level limiter instance, sized from CONFIG
+ requestsPerMinute=CONFIG["rateLimitRequestsPerMinute"],
+ burstSize=CONFIG["rateLimitBurstSize"]
+)
+
+# ============================================================================
+# Model Mapping
+# ============================================================================
+
+MODEL_MAPPING = {  # external (public) model name -> internal Ollama model name
+ "poweron-text-general": "qwen2.5:7b",
+ "poweron-vision-general": "qwen2.5vl:7b",
+ "poweron-vision-deep": "granite3.2-vision",
+}
+
+INTERNAL_TO_EXTERNAL = {v: k for k, v in MODEL_MAPPING.items()}  # reverse lookup: internal -> external
+
+
+# ============================================================================
+# Request/Response Models
+# ============================================================================
+
+class AnalyzeRequest(BaseModel):  # request body of POST /api/analyze
+ imageBase64: Optional[str] = Field(default=None, description="Base64 encoded image")  # the route rejects vision models without it
+ prompt: str = Field(description="Analysis prompt")  # an empty prompt is rejected by the route with HTTP 400
+ modelName: str = Field(default="poweron-vision-general", description="Model to use")  # external name, mapped through MODEL_MAPPING
+
+
+class AnalyzeResponse(BaseModel):  # response body of POST /api/analyze
+ success: bool = Field(description="Whether the analysis was successful")
+ data: Optional[Dict[str, Any]] = Field(default=None, description="Extracted data")  # JSON object parsed from the model output, else {"response": text}
+ rawResponse: Optional[str] = Field(default=None, description="Raw model response")
+ error: Optional[str] = Field(default=None, description="Error message if failed")
+
+
+class PdfExtractRequest(BaseModel):  # request body of POST /api/pdf/extract
+ pdfBase64: str = Field(description="Base64 encoded PDF")
+ page: Optional[int] = Field(default=None, description="Specific page number (1-indexed)")  # None: the route renders the first pages (maxPages=5)
+
+
+class ModelInfo(BaseModel):  # one entry of the GET /api/models response
+ name: str = Field(description="External model name")
+ internalName: str = Field(description="Internal Ollama model name")
+ isVision: bool = Field(description="Whether it's a vision model")  # filled from _isVisionModel(internalName) by the route
+ pricePerCall: float = Field(description="Price per call in CHF")
+
+
+class HealthResponse(BaseModel):  # response body of GET /api/health
+ status: str
+ service: str
+ pdfSupport: bool  # mirrors PDF_SUPPORT (True when PyMuPDF could be imported)
+ ollamaConnected: bool  # True when Ollama's /api/tags answered with HTTP 200
+
+
+class OllamaStatusResponse(BaseModel):  # response body of GET /api/ollama/status
+ connected: bool
+ models: Optional[List[str]] = None  # model names reported by Ollama; set only when connected
+ visionModels: Optional[List[str]] = None  # subset of models for which _isVisionModel is True
+ totalModels: Optional[int] = None
+ error: Optional[str] = None  # set only on the failure paths of the route
+
+
+class OpenAiModelInfo(BaseModel):  # one model entry of the GET /v1/models response
+ id: str
+ object: str = "model"
+ created: int  # Unix timestamp (the route passes int(time.time()))
+ ownedBy: str = Field(default="poweron", alias="owned_by")  # exposed under the alias owned_by
+
+
+class OpenAiModelsResponse(BaseModel):  # response body of GET /v1/models
+ object: str = "list"
+ data: List[OpenAiModelInfo]
+
+
+class OpenAiChatMessage(BaseModel):  # one chat message, used in both request and response
+ role: str
+ content: Any  # str, list of parts, dict or None; flattened by _contentToText
+
+
+class OpenAiChatCompletionRequest(BaseModel):  # request body of POST /v1/chat/completions
+ model: str
+ messages: List[OpenAiChatMessage]
+ stream: Optional[bool] = False  # a truthy value is rejected by the route with HTTP 400
+ maxTokens: Optional[int] = Field(default=None, alias="max_tokens")  # forwarded to Ollama as options.num_predict
+ temperature: Optional[float] = None  # forwarded to Ollama as options.temperature
+
+
+class OpenAiChatCompletionChoice(BaseModel):  # one entry of choices in the chat completion response
+ index: int
+ message: OpenAiChatMessage
+ finishReason: str = Field(default="stop", alias="finish_reason")  # exposed under the alias finish_reason
+
+
+class OpenAiChatCompletionUsage(BaseModel):  # NOTE(review): no populate-by-name config here, so presumably only the alias keywords are accepted on construction - confirm
+ promptTokens: int = Field(default=0, alias="prompt_tokens")
+ completionTokens: int = Field(default=0, alias="completion_tokens")
+ totalTokens: int = Field(default=0, alias="total_tokens")
+
+
+class OpenAiChatCompletionResponse(BaseModel):  # response body of POST /v1/chat/completions
+ id: str  # the route generates "chatcmpl-" plus a random UUID hex
+ object: str = "chat.completion"
+ created: int  # Unix timestamp
+ model: str  # echoes the model name from the request
+ choices: List[OpenAiChatCompletionChoice]
+ usage: OpenAiChatCompletionUsage
+
+
+# ============================================================================
+# Helper Functions
+# ============================================================================
+
+def _isVisionModel(modelName: str) -> bool:
+    """Return True if the lower-cased model name contains a known vision marker."""
+    if not modelName:
+        return False
+    loweredName = modelName.lower()
+    return any(marker in loweredName for marker in ("vision", "vl", "llava", "bakllava", "granite"))
+
+
+def _getInternalModelName(externalName: str) -> str:  # external name -> Ollama name; unknown names pass through
+    return MODEL_MAPPING[externalName] if externalName in MODEL_MAPPING else externalName
+
+
+def _getExternalModelName(internalName: str) -> str:  # Ollama name -> external name; unknown names pass through
+    return INTERNAL_TO_EXTERNAL[internalName] if internalName in INTERNAL_TO_EXTERNAL else internalName
+
+
+def _contentToText(content: Any) -> str:
+    """Flatten OpenAI message content (str, list of parts, dict, or None) into plain text.
+
+    List parts contribute when they are strings or dicts with a string "text"
+    value; blank parts are dropped and the rest joined with newlines. Any other
+    content falls back to str(content).
+    """
+    if content is None:
+        return ""
+    if isinstance(content, str):
+        return content
+    if isinstance(content, list):
+        # Strings pass through; dicts are reduced to their "text" entry.
+        candidates = [
+            item if isinstance(item, str) else item.get("text")
+            for item in content
+            if isinstance(item, (str, dict))
+        ]
+        return "\n".join(text for text in candidates if isinstance(text, str) and text.strip())
+    if isinstance(content, dict) and isinstance(content.get("text"), str):
+        return content["text"]
+    return str(content)
+
+
+def _messagesToPrompt(messages: List[OpenAiChatMessage]) -> str:
+    """Render chat messages as one "role: text" transcript for Ollama generate.
+
+    Messages without text are skipped. A trailing "assistant:" cue is appended
+    when at least one message remains; otherwise the result is an empty string.
+    """
+    turns = [
+        f"{message.role}: {text}"
+        for message in messages
+        if (text := _contentToText(message.content).strip())
+    ]
+    if not turns:
+        return ""
+    return "\n\n".join(turns + ["assistant:"])
+
+
+# ============================================================================
+# PDF Helper Functions
+# ============================================================================
+
+def _extractImagesFromPdf(pdfBytes: bytes, maxPages: int = 5) -> List[Dict[str, Any]]:
+    """Render the first maxPages pages of a PDF to base64-encoded PNG images.
+
+    Returns one dict per page with keys page (1-indexed), base64, width and
+    height. Raises Exception when PyMuPDF is not installed.
+    """
+    if not PDF_SUPPORT:
+        raise Exception("PDF-Support nicht verfügbar. Bitte PyMuPDF installieren.")
+
+    images = []
+    doc = fitz.open(stream=pdfBytes, filetype="pdf")
+    try:
+        # Matrix(2.0, 2.0) scales both axes by 2; build it once for all pages.
+        mat = fitz.Matrix(2.0, 2.0)
+        for pageNum in range(min(len(doc), maxPages)):
+            pix = doc[pageNum].get_pixmap(matrix=mat)
+            imgBase64 = base64.b64encode(pix.tobytes("png")).decode("utf-8")
+            images.append(
+                {"page": pageNum + 1, "base64": imgBase64, "width": pix.width, "height": pix.height}
+            )
+    finally:
+        # Release the document even when rendering a page fails.
+        doc.close()
+    return images
+
+
+def _renderPdfPageAsImage(pdfBytes: bytes, pageNum: int = 0, zoom: float = 2.0) -> Dict[str, Any]:
+    """Render one PDF page (0-indexed pageNum) to a base64-encoded PNG.
+
+    pageNum is clamped to the document's page range, so the reported page
+    always matches the rendered one. Returns a dict with keys base64, width,
+    height, page (1-indexed) and totalPages. Raises Exception when PyMuPDF
+    is not installed.
+    """
+    if not PDF_SUPPORT:
+        raise Exception("PDF-Support nicht verfügbar.")
+
+    doc = fitz.open(stream=pdfBytes, filetype="pdf")
+    try:
+        pageNum = max(0, min(pageNum, len(doc) - 1))
+        pix = doc[pageNum].get_pixmap(matrix=fitz.Matrix(zoom, zoom))
+        return {
+            "base64": base64.b64encode(pix.tobytes("png")).decode("utf-8"),
+            "width": pix.width,
+            "height": pix.height,
+            "page": pageNum + 1,
+            "totalPages": len(doc),
+        }
+    finally:
+        # Release the document even when rendering fails.
+        doc.close()
+
+
+# ============================================================================
+# Authentication Dependencies
+# ============================================================================
+
+async def _verifyApiKey(xApiKey: Optional[str] = Header(None, alias="X-API-Key")) -> str:
+    """Verify the X-API-Key header and return the key for rate limiting.
+
+    Returns "dev-mode" if no key is configured; raises HTTP 401 for a missing or wrong key."""
+    import hmac  # function-scope import; compare_digest gives a constant-time comparison
+    if not CONFIG["apiKey"]:
+        logger.warning("No API key configured - running in development mode")
+        return "dev-mode"
+    if not xApiKey:
+        raise HTTPException(status_code=401, detail="API key required")
+    if not hmac.compare_digest(xApiKey.encode("utf-8"), CONFIG["apiKey"].encode("utf-8")):
+        raise HTTPException(status_code=401, detail="Invalid API key")
+    return xApiKey
+
+
+async def _verifyCursorApiKey(authorization: Optional[str] = Header(None)) -> str:
+    """Verify the Bearer token used by the OpenAI-compatible (Cursor) endpoints.
+
+    Returns the presented token. Raises HTTP 503 when no Cursor key is
+    configured and HTTP 401 for a missing, non-Bearer or wrong token. The
+    token is compared in constant time.
+    """
+    import hmac  # function-scope import: only this check needs it
+    expectedApiKey = CONFIG.get("cursorApiKey")
+    if not expectedApiKey:
+        raise HTTPException(status_code=503, detail="Cursor API key not configured on server")
+    if not authorization:
+        raise HTTPException(status_code=401, detail="Authorization header required")
+    if not authorization.startswith("Bearer "):
+        raise HTTPException(status_code=401, detail="Bearer token required")
+
+    providedApiKey = authorization[len("Bearer "):].strip()
+    if not hmac.compare_digest(providedApiKey.encode("utf-8"), expectedApiKey.encode("utf-8")):
+        raise HTTPException(status_code=401, detail="Invalid API key")
+    return providedApiKey
+
+
+async def _checkRateLimit(apiKey: str = Depends(_verifyApiKey)) -> str:
+    """Return apiKey if its rate limit allows the request; otherwise raise HTTP 429 with retry headers."""
+    allowed, info = rateLimiter.isAllowed(apiKey)
+    if allowed:
+        return apiKey
+
+    # Retry-After is rounded up: a truncated "0" would invite an immediate retry that fails again.
+    raise HTTPException(
+        status_code=429,
+        detail={
+            "error": "Rate limit exceeded",
+            "message": f"Too many requests. Please retry after {info['retryAfter']} seconds.",
+            "retryAfter": info["retryAfter"],
+            "limit": info["limit"],
+            "remaining": info["remaining"]
+        },
+        headers={
+            "Retry-After": str(int(info["retryAfter"]) + 1),
+            "X-RateLimit-Limit": str(info["limit"]),
+            "X-RateLimit-Remaining": str(info["remaining"]),
+            "X-RateLimit-Reset": str(info["resetSeconds"])
+        }
+    )
diff --git a/routeApi.py b/routeApi.py
new file mode 100644
index 0000000..4278355
--- /dev/null
+++ b/routeApi.py
@@ -0,0 +1,256 @@
+# Copyright (c) 2025 Patrick Motsch
+# All rights reserved.
+"""API routes for Private-LLM: health, models, analyze, PDF extract, Ollama status."""
+
+import base64
+import json
+import re
+import logging
+from typing import Optional, List
+
+import httpx
+from fastapi import APIRouter, HTTPException, Depends, Header
+
+from config import (
+ CONFIG, MODEL_MAPPING, PDF_SUPPORT,
+ rateLimiter,
+ _isVisionModel, _getInternalModelName,
+ _extractImagesFromPdf, _renderPdfPageAsImage,
+ _verifyApiKey,
+ AnalyzeRequest, AnalyzeResponse,
+ PdfExtractRequest, ModelInfo,
+ HealthResponse, OllamaStatusResponse,
+)
+
+logger = logging.getLogger(__name__)
+
+router = APIRouter(tags=["API"])
+
+
+@router.get("/api/health", response_model=HealthResponse, tags=["System"])
+async def _healthCheck():
+    """Report service health, including whether Ollama answers /api/tags with HTTP 200.
+
+    Any error while probing Ollama is treated as "not connected".
+    """
+    try:
+        async with httpx.AsyncClient(timeout=5.0) as client:
+            probe = await client.get(f"{CONFIG['ollamaUrl']}/api/tags")
+        ollamaConnected = probe.status_code == 200
+    except Exception:
+        ollamaConnected = False
+
+    return HealthResponse(
+        status="ok", service="private-llm",
+        pdfSupport=PDF_SUPPORT, ollamaConnected=ollamaConnected,
+    )
+
+
+@router.get("/api/models", response_model=List[ModelInfo], tags=["Models"])
+async def _listModels(authenticated: bool = Depends(_verifyApiKey)):
+    """List the externally exposed models with their per-call price.
+
+    Vision models are priced at 0.10 per call, all others at 0.01.
+    """
+    def _describe(externalName: str, internalName: str) -> ModelInfo:
+        isVision = _isVisionModel(internalName)
+        return ModelInfo(
+            name=externalName,
+            internalName=internalName,
+            isVision=isVision,
+            pricePerCall=0.10 if isVision else 0.01,
+        )
+
+    return [_describe(external, internal) for external, internal in MODEL_MAPPING.items()]
+
+
+@router.get("/api/ollama/status", response_model=OllamaStatusResponse, tags=["System"])
+async def _ollamaStatus():
+    """Report whether Ollama is reachable and which models it has installed.
+
+    Failures are returned as connected=False with an error text rather than
+    raised, so the endpoint itself always answers.
+    """
+    try:
+        tagsUrl = f"{CONFIG['ollamaUrl']}/api/tags"
+        async with httpx.AsyncClient(timeout=10.0) as client:
+            response = await client.get(tagsUrl)
+
+        if response.status_code != 200:
+            return OllamaStatusResponse(
+                connected=False,
+                error=f"Ollama responded with status {response.status_code}"
+            )
+
+        data = response.json()
+        installed = [entry.get("name", "") for entry in data.get("models", [])]
+        return OllamaStatusResponse(
+            connected=True,
+            models=installed,
+            visionModels=[name for name in installed if _isVisionModel(name)],
+            totalModels=len(installed)
+        )
+
+    except httpx.ConnectError:
+        failure = "Keine Verbindung zu Ollama. Ist Ollama gestartet?"
+    except Exception as e:
+        failure = str(e)
+
+    # Both failure paths share one response shape.
+    return OllamaStatusResponse(connected=False, error=failure)
+
+
+@router.post("/api/analyze", response_model=AnalyzeResponse, tags=["AI"])
+async def _analyzeDocument(
+ request: AnalyzeRequest,
+ xApiKey: Optional[str] = Header(None, alias="X-API-Key")
+):
+ """
+ Analyze a document by sending the prompt (and optional image) to Ollama's /api/generate.
+
+ Supports both vision models (with images) and text models (without images).
+
+ Authentication:
+ - Gateway calls: Must include X-API-Key header
+ - Test UI calls: No auth required (same-origin)
+
+ Rate limiting is applied when API key is provided; Ollama timeouts and connection errors are returned as success=False.
+ """
+ if xApiKey:  # NOTE(review): without the header neither the key check nor the rate limit below appears to run - confirm this is intended
+ if CONFIG["apiKey"] and xApiKey != CONFIG["apiKey"]:
+ raise HTTPException(status_code=401, detail="Invalid API key")
+ allowed, info = rateLimiter.isAllowed(xApiKey)
+ if not allowed:
+ raise HTTPException(
+ status_code=429,
+ detail=f"Rate limit exceeded. Retry after {info['retryAfter']} seconds.",
+ headers={"Retry-After": str(int(info["retryAfter"]) + 1)},  # +1 rounds the truncated value up
+ )
+ try:
+ internalModelName = _getInternalModelName(request.modelName)
+ isVision = _isVisionModel(internalModelName)
+
+ if isVision and not request.imageBase64:
+ raise HTTPException(
+ status_code=400,
+ detail="Kein Bild übermittelt (erforderlich für Vision-Modelle)"
+ )
+
+ if not request.prompt:
+ raise HTTPException(status_code=400, detail="Kein Prompt übermittelt")
+
+ # Server has 31GB RAM + 22GB GPU - vision models need smaller context
+ modelContextLengths = {
+ "qwen2.5:7b": 8192,
+ "qwen2.5vl:7b": 4096,
+ "granite3.2-vision": 4096,
+ "granite3.2-vision:latest": 4096,
+ "deepseek-ocr": 4096,
+ "deepseek-ocr:latest": 4096,
+ }
+ numCtx = modelContextLengths.get(internalModelName, 4096)  # models not listed above fall back to 4096
+
+ requestBody = {
+ "model": internalModelName,
+ "prompt": request.prompt,
+ "stream": False,
+ "options": {
+ "num_ctx": numCtx
+ }
+ }
+
+ if request.imageBase64:
+ requestBody["images"] = [request.imageBase64]
+
+ async with httpx.AsyncClient(timeout=3600.0) as client:  # one-hour timeout for the generate call
+ response = await client.post(
+ f"{CONFIG['ollamaUrl']}/api/generate",
+ json=requestBody
+ )
+
+ if response.status_code == 404:
+ raise HTTPException(
+ status_code=404,
+ detail=f'Modell "{internalModelName}" nicht gefunden. Bitte installieren mit: ollama pull {internalModelName}'
+ )
+
+ if response.status_code != 200:
+ raise HTTPException(
+ status_code=response.status_code,
+ detail=f"Ollama API Fehler: {response.status_code} - {response.text[:200]}"
+ )
+
+ responseData = response.json()
+ responseText = responseData.get("response", "")
+
+ extractedData = None
+ jsonMatch = re.search(r"\{[\s\S]*\}", responseText)  # greedy: spans from the first "{" to the last "}"
+
+ if jsonMatch:
+ try:
+ extractedData = json.loads(jsonMatch.group())
+ except json.JSONDecodeError:
+ extractedData = None
+
+ if extractedData is None:  # no parseable JSON object: wrap the plain text instead
+ extractedData = {"response": responseText.strip()}
+
+ return AnalyzeResponse(
+ success=True,
+ data=extractedData,
+ rawResponse=responseText
+ )
+
+ except httpx.TimeoutException:
+ return AnalyzeResponse(
+ success=False,
+ error="Zeitüberschreitung bei der Ollama API"
+ )
+ except httpx.ConnectError:
+ return AnalyzeResponse(
+ success=False,
+ error="Verbindung zu Ollama fehlgeschlagen. Ist Ollama gestartet?"
+ )
+ except HTTPException:  # let the HTTP errors raised above propagate unchanged
+ raise
+ except Exception as e:
+ logger.error(f"Error analyzing document: {e}")
+ return AnalyzeResponse(
+ success=False,
+ error=f"Unerwarteter Fehler: {str(e)}"
+ )
+
+
+@router.post("/api/pdf/extract", tags=["PDF"])
+async def _extractPdfImages(request: PdfExtractRequest):
+    """
+    Render PDF pages as PNG images (one requested page, or the first five).
+
+    No API key required - this endpoint is for local test UI only,
+    not used by gateway (gateway sends images directly).
+
+    Raises HTTP 501 without PyMuPDF, 400 for a page number below 1 and
+    500 when decoding or rendering fails.
+    """
+    if not PDF_SUPPORT:
+        raise HTTPException(
+            status_code=501,
+            detail="PDF-Support nicht verfügbar. Bitte PyMuPDF installieren: pip install pymupdf"
+        )
+
+    # Pages are 1-indexed; 0 or negative would become a negative page index below.
+    if request.page is not None and request.page < 1:
+        raise HTTPException(status_code=400, detail="Seitennummer muss >= 1 sein")
+
+    try:
+        pdfBytes = base64.b64decode(request.pdfBase64)
+        if request.page is not None:
+            return {"success": True, "image": _renderPdfPageAsImage(pdfBytes, request.page - 1)}
+
+        images = _extractImagesFromPdf(pdfBytes, maxPages=5)
+        return {"success": True, "images": images, "totalExtracted": len(images)}
+    except Exception as e:
+        raise HTTPException(
+            status_code=500,
+            detail=f"PDF-Verarbeitungsfehler: {str(e)}"
+        ) from e
diff --git a/routeOpenAi.py b/routeOpenAi.py
new file mode 100644
index 0000000..eb46319
--- /dev/null
+++ b/routeOpenAi.py
@@ -0,0 +1,145 @@
+# Copyright (c) 2025 Patrick Motsch
+# All rights reserved.
+"""OpenAI-compatible routes for Cursor integration (/v1/models, /v1/chat/completions)."""
+
+import time
+import uuid
+import logging
+
+import httpx
+from fastapi import APIRouter, HTTPException, Depends
+
+from config import (
+ CONFIG, MODEL_MAPPING,
+ rateLimiter,
+ _isVisionModel, _getInternalModelName, _messagesToPrompt,
+ _verifyCursorApiKey,
+ OpenAiChatCompletionRequest, OpenAiChatCompletionResponse,
+ OpenAiChatCompletionChoice, OpenAiChatCompletionUsage,
+ OpenAiChatMessage, OpenAiModelInfo, OpenAiModelsResponse,
+)
+
+logger = logging.getLogger(__name__)
+
+router = APIRouter(tags=["OpenAI Compatible"])
+
+
+@router.get("/v1/models", response_model=OpenAiModelsResponse)
+async def _listOpenAiModels(cursorApiKey: str = Depends(_verifyCursorApiKey)):
+    """OpenAI-compatible models endpoint for Cursor.
+
+    Lists every external model name from MODEL_MAPPING; all entries share the
+    current time as their created timestamp.
+    """
+    createdAt = int(time.time())
+    modelData = [
+        OpenAiModelInfo(id=externalName, created=createdAt)
+        for externalName in MODEL_MAPPING
+    ]
+    return OpenAiModelsResponse(data=modelData)
+
+
+@router.post(
+    "/v1/chat/completions",
+    response_model=OpenAiChatCompletionResponse,
+)
+async def _openAiChatCompletions(
+    request: OpenAiChatCompletionRequest,
+    cursorApiKey: str = Depends(_verifyCursorApiKey)
+):
+    """OpenAI-compatible chat completions endpoint for Cursor.
+
+    Flattens the chat messages into one prompt, sends it to Ollama's
+    /api/generate and wraps the answer in an OpenAI chat.completion object.
+    Raises HTTP 400/429 for rejected requests and 503/504 when Ollama is down or slow.
+    """
+    if request.stream:
+        raise HTTPException(
+            status_code=400,
+            detail="Streaming is not supported by this endpoint"
+        )
+
+    allowed, info = rateLimiter.isAllowed(f"cursor:{cursorApiKey}")
+    if not allowed:
+        raise HTTPException(
+            status_code=429,
+            detail={
+                "error": "Rate limit exceeded",
+                "message": f"Too many requests. Please retry after {info['retryAfter']} seconds.",
+                "retryAfter": info["retryAfter"],
+                "limit": info["limit"],
+                "remaining": info["remaining"]
+            },
+            headers={
+                # Round up so the header never tells the client to retry after "0" seconds.
+                "Retry-After": str(int(info["retryAfter"]) + 1),
+                "X-RateLimit-Limit": str(info["limit"]),
+                "X-RateLimit-Remaining": str(info["remaining"]),
+                "X-RateLimit-Reset": str(info["resetSeconds"])
+            }
+        )
+
+    promptText = _messagesToPrompt(request.messages).strip()
+    if not promptText:
+        raise HTTPException(status_code=400, detail="messages must contain text content")
+
+    internalModelName = _getInternalModelName(request.model)
+    if _isVisionModel(internalModelName):
+        raise HTTPException(
+            status_code=400,
+            detail="Vision models are not supported on /v1/chat/completions"
+        )
+
+    requestOptions = {"num_ctx": 8192}
+    if request.temperature is not None:
+        requestOptions["temperature"] = request.temperature
+    if request.maxTokens is not None:
+        requestOptions["num_predict"] = request.maxTokens
+
+    requestBody = {
+        "model": internalModelName,
+        "prompt": promptText,
+        "stream": False,
+        "options": requestOptions
+    }
+
+    try:
+        async with httpx.AsyncClient(timeout=3600.0) as client:
+            response = await client.post(f"{CONFIG['ollamaUrl']}/api/generate", json=requestBody)
+    except httpx.TimeoutException:
+        raise HTTPException(status_code=504, detail="Upstream timeout (Ollama)")
+    except httpx.ConnectError:
+        raise HTTPException(status_code=503, detail="Cannot connect to Ollama upstream")
+
+    if response.status_code == 404:
+        raise HTTPException(status_code=404, detail=f'Model "{request.model}" not found')
+    if response.status_code != 200:
+        raise HTTPException(
+            status_code=response.status_code,
+            detail=f"Ollama API error: {response.status_code} - {response.text[:200]}"
+        )
+
+    responseData = response.json()
+    responseText = responseData.get("response", "").strip()
+    promptEvalCount = int(responseData.get("prompt_eval_count", 0))
+    evalCount = int(responseData.get("eval_count", 0))
+
+    return OpenAiChatCompletionResponse(
+        id=f"chatcmpl-{uuid.uuid4().hex}",
+        created=int(time.time()),
+        model=request.model,
+        choices=[
+            OpenAiChatCompletionChoice(
+                index=0,
+                message=OpenAiChatMessage(role="assistant", content=responseText)
+            )
+        ],
+        # The usage fields declare aliases and the model does not enable
+        # population by field name, so they must be passed by alias; the
+        # camelCase keywords were silently ignored and usage was always 0.
+        usage=OpenAiChatCompletionUsage(
+            prompt_tokens=promptEvalCount,
+            completion_tokens=evalCount,
+            total_tokens=promptEvalCount + evalCount
+        )
+    )
diff --git a/routeWeb.py b/routeWeb.py
new file mode 100644
index 0000000..c3ddc4a
--- /dev/null
+++ b/routeWeb.py
@@ -0,0 +1,34 @@
+# Copyright (c) 2025 Patrick Motsch
+# All rights reserved.
+"""Web UI routes for the Private-LLM test interface."""
+
+import logging
+
+from fastapi import APIRouter, Request
+from fastapi.responses import HTMLResponse
+from fastapi.templating import Jinja2Templates
+from starlette.responses import RedirectResponse
+
+logger = logging.getLogger(__name__)
+
+templates = Jinja2Templates(directory="templates")  # NOTE(review): relative path - presumably resolved against the process working directory; confirm
+
+router = APIRouter(tags=["Web UI"])
+
+
+@router.get("/", response_class=HTMLResponse)
+async def _index(request: Request):
+ """Render index.html, the main page with the document scanner UI."""
+ return templates.TemplateResponse("index.html", {"request": request})
+
+
+@router.get("/login", response_class=HTMLResponse)
+async def _loginPage(request: Request):
+ """Render login.html, the login page."""
+ return templates.TemplateResponse("login.html", {"request": request})
+
+
+@router.get("/logout", response_class=HTMLResponse)
+async def _logout(request: Request):
+ """Logout: answer with a 302 redirect to /login (no other state is touched here)."""
+ return RedirectResponse(url="/login", status_code=302)