integrated privateLLM

2026-02-06 10:27:06 +01:00 · 2026-02-06 10:27:06 +01:00 · a04bee5008
commit a04bee5008
parent 0313821f59
9 changed files with 671 additions and 307 deletions
--- a/app.py
+++ b/app.py
@ -1,17 +1,33 @@
+# Copyright (c) 2025 Patrick Motsch
+# All rights reserved.
 """
-Belegscanner - KI-Dokumentenanalyse
-Python Flask Web App mit CORS-Unterstützung und Poweron Design
+Private-LLM Service - FastAPI Web App
+Provides AI model endpoints for OCR and Vision processing via Ollama.
+
+Models exposed:
+- poweron-ocr-general (deepseek)
+- poweron-vision-general (qwen2.5)
+- poweron-vision-deep (granite3.2)
 """

-from flask import Flask, render_template, request, jsonify, session, redirect, url_for
-from flask_cors import CORS
-from functools import wraps
-import requests
+import os
+import sys
 import base64
 import json
 import re
-import io
-import os
+import logging
+import time
+from collections import defaultdict
+from typing import Optional, List, Dict, Any
+from contextlib import asynccontextmanager
+
+from fastapi import FastAPI, HTTPException, Depends, Header, Request
+from fastapi.middleware.cors import CORSMiddleware
+from fastapi.responses import HTMLResponse
+from fastapi.staticfiles import StaticFiles
+from fastapi.templating import Jinja2Templates
+from pydantic import BaseModel, Field
+import httpx

 # PDF Support
 try:
@ -22,79 +38,243 @@ except ImportError:
    print("WARNUNG: PyMuPDF nicht installiert. PDF-Support deaktiviert.")
    print("Installieren mit: pip install pymupdf")

-app = Flask(__name__)
-app.secret_key = os.environ.get('SECRET_KEY', 'poweron-secret-key-change-in-production')
-CORS(app, supports_credentials=True)  # CORS für alle Routen aktivieren
+# Configure logging
+logging.basicConfig(
+    level=logging.INFO,
+    format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
+    datefmt="%Y-%m-%d %H:%M:%S",
+)
+logger = logging.getLogger(__name__)

 # ============================================================================
-# Authentication
+# Configuration
 # ============================================================================

-# Einfache Credentials (für minimalen Schutz)
-AUTH_USERNAME = os.environ.get('AUTH_USERNAME', 'poweron')
-AUTH_PASSWORD = os.environ.get('AUTH_PASSWORD', 'poweron')
+def _loadConfig() -> Dict[str, Any]:
+    """
+    Build the service configuration from defaults, config.ini and the environment.
+
+    Precedence, lowest to highest: built-in defaults, ``KEY = value`` lines of
+    the ``config.ini`` located next to this module, environment variables with
+    the same names. Blank lines, lines starting with ``#``, lines without ``=``
+    and unknown keys are ignored.
+
+    An integer setting that cannot be parsed is logged and skipped, so a single
+    bad value neither stops the remaining settings from being read nor raises
+    while this module is being imported.
+
+    Returns:
+        Dict with the keys apiKey, ollamaUrl, authUsername, authPassword,
+        secretKey, rateLimitRequestsPerMinute and rateLimitBurstSize.
+    """
+    # External setting name -> (key in the returned dict, converter)
+    settings = {
+        "PRIVATE_LLM_API_KEY": ("apiKey", str),
+        "OLLAMA_URL": ("ollamaUrl", str),
+        "AUTH_USERNAME": ("authUsername", str),
+        "AUTH_PASSWORD": ("authPassword", str),
+        "SECRET_KEY": ("secretKey", str),
+        "RATE_LIMIT_REQUESTS_PER_MINUTE": ("rateLimitRequestsPerMinute", int),
+        "RATE_LIMIT_BURST_SIZE": ("rateLimitBurstSize", int),
+    }
+    config: Dict[str, Any] = {
+        "apiKey": None,
+        "ollamaUrl": "http://localhost:11434",
+        "authUsername": "poweron",
+        "authPassword": "poweron",
+        "secretKey": "poweron-secret-key-change-in-production",
+        "rateLimitRequestsPerMinute": 60,
+        "rateLimitBurstSize": 10,
+    }
+
+    def _apply(name: str, rawValue: str, source: str) -> None:
+        """Convert rawValue for setting `name` and store it; keep the old value on failure."""
+        target, convert = settings[name]
+        try:
+            config[target] = convert(rawValue)
+        except ValueError:
+            # Only the int converters can fail, so no secret ends up in the log here
+            logger.warning(f"Ignoring invalid value {rawValue!r} for {name} from {source}")
+
+    configPath = os.path.join(os.path.dirname(__file__), "config.ini")
+    if os.path.exists(configPath):
+        try:
+            # Decode explicitly instead of relying on the locale; undecodable
+            # bytes (e.g. comments saved in a legacy code page) are replaced
+            # rather than aborting the whole file.
+            with open(configPath, "r", encoding="utf-8", errors="replace") as f:
+                for line in f:
+                    line = line.strip()
+                    if not line or line.startswith("#") or "=" not in line:
+                        continue
+                    key, value = (part.strip() for part in line.split("=", 1))
+                    if key in settings:
+                        _apply(key, value, "config.ini")
+        except OSError as e:
+            logger.warning(f"Error loading config.ini: {e}")
+
+    # Override with environment variables if set
+    for name in settings:
+        if name in os.environ:
+            _apply(name, os.environ[name], "environment")
+
+    return config
+
+CONFIG = _loadConfig()


-def _loginRequired(f):
-    """Decorator für geschützte Routen"""
-    @wraps(f)
-    def decorated_function(*args, **kwargs):
-        if not session.get('logged_in'):
-            # Bei API-Calls JSON zurückgeben, sonst redirect
-            if request.path.startswith('/api/'):
-                return jsonify({'error': 'Nicht autorisiert', 'login_required': True}), 401
-            return redirect(url_for('_login'))
-        return f(*args, **kwargs)
-    return decorated_function
+# ============================================================================
+# Rate Limiting (Token Bucket Algorithm)
+# ============================================================================

+class RateLimiter:
+    """
+    Token bucket rate limiter with per-API-key tracking.
+
+    Each API key gets its own bucket. A new bucket starts full (burstSize
+    tokens); tokens are added at a constant rate (requestsPerMinute / 60 per
+    second) up to a maximum of burstSize, and every allowed request consumes
+    one token.
+
+    Bucket timestamps come from time.monotonic(), so adjustments of the system
+    clock cannot produce a negative elapsed time and drain a bucket.
+    """
+
+    def __init__(self, requestsPerMinute: int = 60, burstSize: int = 10):
+        """
+        Args:
+            requestsPerMinute: Sustained refill rate; also reported as "limit".
+            burstSize: Bucket capacity, i.e. how many requests may be served
+                back to back from a full bucket.
+        """
+        self.requestsPerMinute = requestsPerMinute
+        self.burstSize = burstSize
+        self.tokensPerSecond = requestsPerMinute / 60.0
+
+        # Track tokens and last update time per API key
+        # Format: {apiKey: {"tokens": float, "lastUpdate": float}}
+        self._buckets: Dict[str, Dict[str, float]] = defaultdict(
+            lambda: {"tokens": burstSize, "lastUpdate": time.monotonic()}
+        )
+
+    def _refillTokens(self, bucket: Dict[str, float]) -> None:
+        """Refill tokens based on the time elapsed since the bucket's last update."""
+        now = time.monotonic()
+        elapsed = now - bucket["lastUpdate"]
+        bucket["tokens"] = min(
+            self.burstSize,
+            bucket["tokens"] + elapsed * self.tokensPerSecond
+        )
+        bucket["lastUpdate"] = now
+
+    def isAllowed(self, apiKey: str) -> tuple[bool, Dict[str, Any]]:
+        """
+        Check if a request is allowed and consume a token if so.
+
+        Returns:
+            Tuple of (allowed, info). info always contains "remaining",
+            "limit" and "resetSeconds"; when the request is denied it also
+            contains "retryAfter" (seconds until one token is available).
+        """
+        bucket = self._buckets[apiKey]
+        self._refillTokens(bucket)
+
+        if bucket["tokens"] >= 1.0:
+            bucket["tokens"] -= 1.0
+            return True, {
+                "remaining": int(bucket["tokens"]),
+                "limit": self.requestsPerMinute,
+                "resetSeconds": 60
+            }
+
+        if self.tokensPerSecond > 0:
+            # Calculate when the next token will be available
+            retryAfter = (1.0 - bucket["tokens"]) / self.tokensPerSecond
+        else:
+            # A rate of 0 never refills; report the nominal window instead of
+            # dividing by zero (the value must stay finite to be JSON-encodable).
+            retryAfter = 60.0
+        return False, {
+            "remaining": 0,
+            "limit": self.requestsPerMinute,
+            "retryAfter": round(retryAfter, 1),
+            "resetSeconds": 60
+        }
+
+    def cleanup(self, maxAgeSeconds: int = 3600) -> int:
+        """
+        Remove buckets not touched for more than maxAgeSeconds to prevent memory growth.
+
+        Returns:
+            Number of buckets removed.
+        """
+        now = time.monotonic()
+        staleKeys = [
+            key for key, bucket in self._buckets.items()
+            if now - bucket["lastUpdate"] > maxAgeSeconds
+        ]
+        for key in staleKeys:
+            del self._buckets[key]
+        return len(staleKeys)
+
+
+# Global rate limiter instance
+rateLimiter = RateLimiter(
+    requestsPerMinute=CONFIG["rateLimitRequestsPerMinute"],
+    burstSize=CONFIG["rateLimitBurstSize"]
+)
+
+# Model mapping: external name -> internal Ollama model name
+# Production models (optimized for 32GB RAM server):
+# - deepseek-ocr: 3.34B params, 8K context, ~6.7GB RAM
+# - qwen2.5vl:7b: 8.29B params, 125K context, ~6GB RAM  
+# - granite3.2-vision: 2B params, 16K context, ~2.4GB RAM
+MODEL_MAPPING = {
+    "poweron-ocr-general": "deepseek-ocr",
+    "poweron-vision-general": "qwen2.5vl:7b",
+    "poweron-vision-deep": "granite3.2-vision",
+}
+
+# Reverse mapping for lookups
+INTERNAL_TO_EXTERNAL = {v: k for k, v in MODEL_MAPPING.items()}
+
+# ============================================================================
+# Request/Response Models
+# ============================================================================
+
+class AnalyzeRequest(BaseModel):
+    """Request model for document analysis (body of POST /api/analyze)."""
+    # Required by the analyze endpoint when the resolved model is a vision model (otherwise HTTP 400)
+    imageBase64: Optional[str] = Field(default=None, description="Base64 encoded image")
+    # An empty prompt is rejected by the analyze endpoint with HTTP 400
+    prompt: str = Field(description="Analysis prompt")
+    # External name resolved through MODEL_MAPPING; names not in the mapping are passed on unchanged
+    modelName: str = Field(default="poweron-vision-general", description="Model to use")
+
+class AnalyzeResponse(BaseModel):
+    """Response model for document analysis."""
+    success: bool = Field(description="Whether the analysis was successful")
+    # JSON object parsed out of the model reply, or {"response": <text>} when no valid JSON was found
+    data: Optional[Dict[str, Any]] = Field(default=None, description="Extracted data")
+    # Unmodified "response" text returned by Ollama
+    rawResponse: Optional[str] = Field(default=None, description="Raw model response")
+    # Set together with success=False on timeout, connection failure or unexpected errors
+    error: Optional[str] = Field(default=None, description="Error message if failed")
+
+class PdfExtractRequest(BaseModel):
+    """Request model for PDF extraction (body of POST /api/pdf/extract)."""
+    pdfBase64: str = Field(description="Base64 encoded PDF")
+    # ge=1: pages are 1-indexed and the extract endpoint converts with `page - 1`,
+    # so 0 or a negative value would otherwise reach the renderer as a negative
+    # page index instead of being rejected as invalid input.
+    # When omitted, the endpoint extracts the leading pages instead of a single one.
+    page: Optional[int] = Field(default=None, ge=1, description="Specific page number (1-indexed)")
+
+class ModelInfo(BaseModel):
+    """Model information as returned by GET /api/models."""
+    name: str = Field(description="External model name")
+    internalName: str = Field(description="Internal Ollama model name")
+    # Derived from the internal name via _isVisionModel (naming convention, not a capability probe)
+    isVision: bool = Field(description="Whether it's a vision model")
+    # The models endpoint reports 0.10 for vision models and 0.01 otherwise
+    pricePerCall: float = Field(description="Price per call in CHF")
+
+class HealthResponse(BaseModel):
+    """Health check response returned by GET /api/health."""
+    # The health endpoint always reports "ok"
+    status: str
+    # Service identifier; the health endpoint reports "private-llm"
+    service: str
+    # Mirrors the module-level PDF_SUPPORT flag
+    pdfSupport: bool
+    # True only if Ollama answered GET /api/tags with status 200
+    ollamaConnected: bool
+
+class OllamaStatusResponse(BaseModel):
+    """Ollama status response returned by GET /api/ollama/status."""
+    # False when Ollama is unreachable or answers with a non-200 status
+    connected: bool
+    # Model names listed by Ollama's /api/tags; only set when connected
+    models: Optional[List[str]] = None
+    # Subset of `models` whose name matches _isVisionModel
+    visionModels: Optional[List[str]] = None
+    # len(models); only set when connected
+    totalModels: Optional[int] = None
+    # Human-readable failure reason; only set when not connected
+    error: Optional[str] = None

 # ============================================================================
 # PDF Helper Functions
 # ============================================================================

-def _extractImagesFromPdf(pdfBytes, maxPages=5):
-    """
-    Extrahiert Bilder aus einem PDF.
-    Gibt eine Liste von Base64-kodierten Bildern zurück.
-    """
+def _extractImagesFromPdf(pdfBytes: bytes, maxPages: int = 5) -> List[Dict[str, Any]]:
+    """Extract images from a PDF."""
    if not PDF_SUPPORT:
        raise Exception("PDF-Support nicht verfügbar. Bitte PyMuPDF installieren.")
    
    images = []
-    
-    # PDF öffnen
    doc = fitz.open(stream=pdfBytes, filetype="pdf")
-    
-    # Anzahl der Seiten begrenzen
    numPages = min(len(doc), maxPages)
    
    for pageNum in range(numPages):
        page = doc[pageNum]
-        
-        # Seite als Bild rendern (höhere Auflösung für bessere OCR)
-        mat = fitz.Matrix(2.0, 2.0)  # 2x Zoom für bessere Qualität
+        mat = fitz.Matrix(2.0, 2.0)  # 2x Zoom for better quality
        pix = page.get_pixmap(matrix=mat)
-        
-        # In PNG konvertieren
        imgBytes = pix.tobytes("png")
-        imgBase64 = base64.b64encode(imgBytes).decode('utf-8')
+        imgBase64 = base64.b64encode(imgBytes).decode("utf-8")
        
        images.append({
-            'page': pageNum + 1,
-            'base64': imgBase64,
-            'width': pix.width,
-            'height': pix.height
+            "page": pageNum + 1,
+            "base64": imgBase64,
+            "width": pix.width,
+            "height": pix.height
        })
    
    doc.close()
-    
    return images

-
-def _renderPdfPageAsImage(pdfBytes, pageNum=0, zoom=2.0):
-    """
-    Rendert eine einzelne PDF-Seite als Bild.
-    """
+def _renderPdfPageAsImage(pdfBytes: bytes, pageNum: int = 0, zoom: float = 2.0) -> Dict[str, Any]:
+    """Render a single PDF page as an image."""
    if not PDF_SUPPORT:
        raise Exception("PDF-Support nicht verfügbar.")
    
@ -106,267 +286,399 @@ def _renderPdfPageAsImage(pdfBytes, pageNum=0, zoom=2.0):
    page = doc[pageNum]
    mat = fitz.Matrix(zoom, zoom)
    pix = page.get_pixmap(matrix=mat)
-    
    imgBytes = pix.tobytes("png")
-    imgBase64 = base64.b64encode(imgBytes).decode('utf-8')
+    imgBase64 = base64.b64encode(imgBytes).decode("utf-8")
    
    result = {
-        'base64': imgBase64,
-        'width': pix.width,
-        'height': pix.height,
-        'page': pageNum + 1,
-        'totalPages': len(doc)
+        "base64": imgBase64,
+        "width": pix.width,
+        "height": pix.height,
+        "page": pageNum + 1,
+        "totalPages": len(doc)
    }
    
    doc.close()
-    
    return result

 # ============================================================================
 # Model Helper Functions
 # ============================================================================

-def _isVisionModel(modelName):
-    """
-    Prüft ob ein Modell ein Vision-Modell ist basierend auf Namenskonventionen.
-    Vision-Modelle enthalten typischerweise 'vision', 'vl', 'llava', 'bakllava' im Namen.
-    """
+def _isVisionModel(modelName: str) -> bool:
+    """Check if a model is a vision model based on naming conventions."""
    if not modelName:
        return False
    
    modelLower = modelName.lower()
-    visionIndicators = ['vision', 'vl', 'llava', 'bakllava']
+    visionIndicators = ["vision", "vl", "llava", "bakllava", "granite"]
    
    return any(indicator in modelLower for indicator in visionIndicators)

+def _getInternalModelName(externalName: str) -> str:
+    """Translate an external model name to its Ollama name; unknown names are returned unchanged."""
+    try:
+        return MODEL_MAPPING[externalName]
+    except KeyError:
+        return externalName
+
+def _getExternalModelName(internalName: str) -> str:
+    """Translate an Ollama model name to its external name; unknown names are returned unchanged."""
+    try:
+        return INTERNAL_TO_EXTERNAL[internalName]
+    except KeyError:
+        return internalName

 # ============================================================================
-# Routes
+# Authentication & Rate Limiting
 # ============================================================================

-@app.route('/login', methods=['GET', 'POST'])
-def _login():
-    """Login-Seite"""
-    error = None
-    if request.method == 'POST':
-        username = request.form.get('username', '')
-        password = request.form.get('password', '')
-        
-        if username == AUTH_USERNAME and password == AUTH_PASSWORD:
-            session['logged_in'] = True
-            session['username'] = username
-            return redirect(url_for('_index'))
-        else:
-            error = 'Ungültige Anmeldedaten'
+async def _verifyApiKey(xApiKey: Optional[str] = Header(None, alias="X-API-Key")) -> str:
+    """
+    Verify the API key from the X-API-Key header and return it for rate limiting.
+
+    Returns:
+        The presented key, or the fixed string "dev-mode" when no API key is
+        configured (every request is accepted in that case).
+
+    Raises:
+        HTTPException: 401 when the header is missing or does not match the
+            configured key.
+    """
+    import hmac  # local import: only needed for the constant-time comparison below
+
+    expectedKey = CONFIG["apiKey"]
+    if not expectedKey:
+        # No API key configured, allow all requests (development mode)
+        logger.warning("No API key configured - running in development mode")
+        return "dev-mode"
     
-    return render_template('login.html', error=error)
+    if not xApiKey:
+        raise HTTPException(status_code=401, detail="API key required")
+
+    # compare_digest does not short-circuit on the first differing character,
+    # so response timing does not reveal how much of a guessed key is correct.
+    # Compare bytes: the str form only accepts ASCII and a header may carry more.
+    if not hmac.compare_digest(xApiKey.encode("utf-8"), expectedKey.encode("utf-8")):
+        raise HTTPException(status_code=401, detail="Invalid API key")
+
+    return xApiKey


-@app.route('/logout')
-def _logout():
-    """Logout"""
-    session.clear()
-    return redirect(url_for('_login'))
+async def _checkRateLimit(apiKey: str = Depends(_verifyApiKey)) -> str:
+    """
+    Check rate limit for the authenticated API key.
+
+    Returns:
+        The API key, when a token could be consumed for this request.
+
+    Raises:
+        HTTPException: 429 with Retry-After and X-RateLimit-* headers when the
+            key's bucket is empty.
+    """
+    import math  # local import: only needed to round the Retry-After header up
+
+    allowed, info = rateLimiter.isAllowed(apiKey)
+    if allowed:
+        return apiKey
+
+    # Retry-After carries whole seconds. Round up (and never send 0) so a client
+    # honouring the header does not retry before a token is actually available.
+    retryAfterSeconds = max(1, math.ceil(info["retryAfter"]))
+
+    raise HTTPException(
+        status_code=429,
+        detail={
+            "error": "Rate limit exceeded",
+            "message": f"Too many requests. Please retry after {info['retryAfter']} seconds.",
+            "retryAfter": info["retryAfter"],
+            "limit": info["limit"],
+            "remaining": info["remaining"]
+        },
+        headers={
+            "Retry-After": str(retryAfterSeconds),
+            "X-RateLimit-Limit": str(info["limit"]),
+            "X-RateLimit-Remaining": str(info["remaining"]),
+            "X-RateLimit-Reset": str(info["resetSeconds"])
+        }
+    )

+# ============================================================================
+# Application Lifecycle
+# ============================================================================

-@app.route('/')
-@_loginRequired
-def _index():
-    """Hauptseite mit dem Belegscanner UI"""
-    return render_template('index.html')
+@asynccontextmanager
+async def lifespan(app: FastAPI):
+    """Log the effective configuration at startup and a notice at shutdown."""
+    apiKeyState = "Yes" if CONFIG["apiKey"] else "No (development mode)"
+    pdfState = "Enabled" if PDF_SUPPORT else "Disabled"
+
+    logger.info("Private-LLM Service starting up...")
+    logger.info(f"Ollama URL: {CONFIG['ollamaUrl']}")
+    logger.info(f"API Key configured: {apiKeyState}")
+    logger.info(f"PDF Support: {pdfState}")
+
+    # Everything after the yield runs when the application shuts down
+    yield
+    logger.info("Private-LLM Service shutting down...")

+# ============================================================================
+# FastAPI Application
+# ============================================================================

-@app.route('/api/analyze', methods=['POST'])
-@_loginRequired
-def _analyzeDocument():
+app = FastAPI(
+    title="PowerOn Private-LLM Service",
+    description="AI model endpoints for OCR and Vision processing",
+    version="1.0.0",
+    lifespan=lifespan,
+)
+
+# CORS Configuration - Allow gateway instances
+ALLOWED_ORIGINS = [
+    "http://localhost:8000",
+    "http://localhost:8080",
+    "http://localhost:5000",
+    "http://127.0.0.1:8000",
+    "http://127.0.0.1:8080",
+    "http://127.0.0.1:5000",
+]
+
+# Add production origins
+PRODUCTION_PATTERNS = [
+    "poweron.swiss",
+    "poweron-center.net",
+]
+
+# Build full origins list with https variants
+for pattern in PRODUCTION_PATTERNS:
+    ALLOWED_ORIGINS.extend([
+        f"https://{pattern}",
+        f"https://www.{pattern}",
+        f"https://api.{pattern}",
+        f"https://gateway.{pattern}",
+        f"https://app.{pattern}",
+        f"https://nyla.{pattern}",
+        f"https://playground.{pattern}",
+    ])
+
+# Allow all subdomains via regex in middleware
+app.add_middleware(
+    CORSMiddleware,
+    allow_origins=ALLOWED_ORIGINS,
+    allow_origin_regex=r"https://.*\.(poweron\.swiss|poweron-center\.net)",
+    allow_credentials=True,
+    allow_methods=["GET", "POST", "PUT", "DELETE", "OPTIONS"],
+    allow_headers=["*"],
+    expose_headers=["*"],
+    max_age=86400,
+)
+
+# Static files and templates (for web UI)
+# Relative directory names are resolved against the process working directory.
+# Anchor both to this module's location (as _loadConfig does for config.ini) so
+# the service also finds them when it is started from another directory.
+_BASE_DIR = os.path.dirname(os.path.abspath(__file__))
+app.mount("/static", StaticFiles(directory=os.path.join(_BASE_DIR, "static")), name="static")
+templates = Jinja2Templates(directory=os.path.join(_BASE_DIR, "templates"))
+
+# ============================================================================
+# API Routes
+# ============================================================================
+
+@app.get("/api/health", response_model=HealthResponse, tags=["System"])
+async def _healthCheck():
+    """
+    Health check endpoint.
+
+    Always answers with status "ok"; the reachability of Ollama is reported
+    separately in ollamaConnected, which is True only if GET /api/tags returns
+    status 200 (probed with a 5 second client timeout).
+    """
+    ollamaConnected = False
+    try:
+        async with httpx.AsyncClient(timeout=5.0) as client:
+            response = await client.get(f"{CONFIG['ollamaUrl']}/api/tags")
+            ollamaConnected = response.status_code == 200
+    except Exception as e:
+        # Best effort: a failed probe must not fail the health check itself,
+        # but leave a trace of the reason instead of discarding it silently.
+        logger.debug("Ollama health probe failed: %s", e)
+
+    return HealthResponse(
+        status="ok",
+        service="private-llm",
+        pdfSupport=PDF_SUPPORT,
+        ollamaConnected=ollamaConnected
+    )
+
+@app.get("/api/models", response_model=List[ModelInfo], tags=["Models"])
+async def _listModels(authenticated: bool = Depends(_verifyApiKey)):
+    """Return one ModelInfo per MODEL_MAPPING entry, including its CHF price per call."""
+    # Classify each internal name once, then derive flag and price from that result
+    classified = (
+        (publicName, ollamaName, _isVisionModel(ollamaName))
+        for publicName, ollamaName in MODEL_MAPPING.items()
+    )
+    return [
+        ModelInfo(
+            name=publicName,
+            internalName=ollamaName,
+            isVision=visionFlag,
+            pricePerCall=0.10 if visionFlag else 0.01,  # CHF pricing
+        )
+        for publicName, ollamaName, visionFlag in classified
+    ]
+
+@app.get("/api/ollama/status", response_model=OllamaStatusResponse, tags=["System"])
+async def _ollamaStatus(authenticated: bool = Depends(_verifyApiKey)):
+    """Report whether Ollama answers on /api/tags and which models it lists."""
+    try:
+        async with httpx.AsyncClient(timeout=10.0) as client:
+            reply = await client.get(f"{CONFIG['ollamaUrl']}/api/tags")
+
+            if reply.status_code == 200:
+                installed = []
+                for entry in reply.json().get("models", []):
+                    installed.append(entry.get("name", ""))
+
+                return OllamaStatusResponse(
+                    connected=True,
+                    models=installed,
+                    visionModels=[name for name in installed if _isVisionModel(name)],
+                    totalModels=len(installed)
+                )
+
+            # Ollama is reachable but did not answer successfully
+            return OllamaStatusResponse(
+                connected=False,
+                error=f"Ollama responded with status {reply.status_code}"
+            )
+
+    except httpx.ConnectError:
+        failure = "Keine Verbindung zu Ollama. Ist Ollama gestartet?"
+        return OllamaStatusResponse(connected=False, error=failure)
+    except Exception as e:
+        failure = str(e)
+        return OllamaStatusResponse(connected=False, error=failure)
+
+@app.post("/api/analyze", response_model=AnalyzeResponse, tags=["AI"])
+async def _analyzeDocument(
+    request: AnalyzeRequest,
+    apiKey: str = Depends(_checkRateLimit)
+):
    """
-    Analysiert ein Dokument mit Ollama Vision API oder verarbeitet Text mit Non-Vision Modellen
-    Erwartet: { imageBase64 (optional bei Non-Vision), prompt, ollamaUrl, modelName }
+    Analyze a document with AI Vision API.
+    
+    Supports both vision models (with images) and text models (without images).
    """
    try:
-        data = request.get_json()
+        # Get internal model name
+        internalModelName = _getInternalModelName(request.modelName)
+        isVision = _isVisionModel(internalModelName)
        
-        imageBase64 = data.get('imageBase64')
-        prompt = data.get('prompt')
-        ollamaUrl = data.get('ollamaUrl', 'http://localhost:11434')
-        modelName = data.get('modelName', 'qwen2.5vl:72b')
+        # Validate request
+        if isVision and not request.imageBase64:
+            raise HTTPException(
+                status_code=400,
+                detail="Kein Bild übermittelt (erforderlich für Vision-Modelle)"
+            )
        
-        # Prüfe ob es ein Vision-Modell ist (basierend auf Namenskonvention)
-        isVisionModel = _isVisionModel(modelName)
+        if not request.prompt:
+            raise HTTPException(status_code=400, detail="Kein Prompt übermittelt")
        
-        # Bei Vision-Modellen ist ein Bild erforderlich
-        if isVisionModel and not imageBase64:
-            return jsonify({'error': 'Kein Bild übermittelt (erforderlich für Vision-Modelle)'}), 400
+        # Model-specific context lengths (actual model limits)
+        modelContextLengths = {
+            "deepseek-ocr": 8192,        # 8K context
+            "qwen2.5vl:7b": 32768,       # Use 32K (model supports 125K but RAM limited)
+            "granite3.2-vision": 16000,  # 16K context
+        }
+        numCtx = modelContextLengths.get(internalModelName, 8192)
        
-        if not prompt:
-            return jsonify({'error': 'Kein Prompt übermittelt'}), 400
-        
-        # Request-Body erstellen
+        # Build request body with model-specific context window
        requestBody = {
-            'model': modelName,
-            'prompt': prompt,
-            'stream': False
+            "model": internalModelName,
+            "prompt": request.prompt,
+            "stream": False,
+            "options": {
+                "num_ctx": numCtx
+            }
        }
        
-        # Bilder nur hinzufügen wenn vorhanden (für Vision-Modelle)
-        if imageBase64:
-            requestBody['images'] = [imageBase64]
+        if request.imageBase64:
+            requestBody["images"] = [request.imageBase64]
        
-        # Ollama API aufrufen (Timeout: 60 Minuten für grosse Modelle)
-        response = requests.post(
-            f'{ollamaUrl}/api/generate',
-            json=requestBody,
-            timeout=3600  # 60 Minuten
+        # Call Ollama API
+        async with httpx.AsyncClient(timeout=3600.0) as client:  # 60 min timeout
+            response = await client.post(
+                f"{CONFIG['ollamaUrl']}/api/generate",
+                json=requestBody
+            )
+            
+            if response.status_code == 404:
+                raise HTTPException(
+                    status_code=404,
+                    detail=f'Modell "{internalModelName}" nicht gefunden. Bitte installieren mit: ollama pull {internalModelName}'
+                )
+            
+            if response.status_code != 200:
+                raise HTTPException(
+                    status_code=response.status_code,
+                    detail=f"Ollama API Fehler: {response.status_code} - {response.text[:200]}"
+                )
+            
+            responseData = response.json()
+            responseText = responseData.get("response", "")
+            
+            # Try to extract JSON from response
+            extractedData = None
+            jsonMatch = re.search(r"\{[\s\S]*\}", responseText)
+            
+            if jsonMatch:
+                try:
+                    extractedData = json.loads(jsonMatch.group())
+                except json.JSONDecodeError:
+                    extractedData = None
+            
+            # Wrap plain text response in JSON object
+            if extractedData is None:
+                extractedData = {"response": responseText.strip()}
+            
+            return AnalyzeResponse(
+                success=True,
+                data=extractedData,
+                rawResponse=responseText
+            )
+    
+    except httpx.TimeoutException:
+        return AnalyzeResponse(
+            success=False,
+            error="Zeitüberschreitung bei der Ollama API"
        )
-        
-        if response.status_code == 404:
-            return jsonify({
-                'error': f'Modell "{modelName}" nicht gefunden. Bitte installieren Sie es mit: ollama pull {modelName}'
-            }), 404
-        
-        if response.status_code != 200:
-            return jsonify({
-                'error': f'Ollama API Fehler: {response.status_code} - {response.text[:200]}'
-            }), response.status_code
-        
-        responseData = response.json()
-        responseText = responseData.get('response', '')
-        
-        # Versuche JSON aus der Antwort zu extrahieren
-        extractedData = None
-        jsonMatch = re.search(r'\{[\s\S]*\}', responseText)
-        
-        if jsonMatch:
-            try:
-                extractedData = json.loads(jsonMatch.group())
-            except json.JSONDecodeError:
-                # JSON-ähnlicher Text gefunden, aber ungültig
-                extractedData = None
-        
-        # Wenn kein JSON gefunden, Antwort in JSON-Objekt verpacken
-        if extractedData is None:
-            extractedData = {
-                'response': responseText.strip()
-            }
-        
-        return jsonify({
-            'success': True,
-            'data': extractedData,
-            'rawResponse': responseText
-        })
-        
-    except requests.exceptions.Timeout:
-        return jsonify({'error': 'Zeitüberschreitung bei der Ollama API'}), 504
-    except requests.exceptions.ConnectionError:
-        return jsonify({'error': 'Verbindung zu Ollama fehlgeschlagen. Ist Ollama gestartet?'}), 503
-    except json.JSONDecodeError as e:
-        return jsonify({'error': f'JSON Parse-Fehler: {str(e)}'}), 400
+    except httpx.ConnectError:
+        return AnalyzeResponse(
+            success=False,
+            error="Verbindung zu Ollama fehlgeschlagen. Ist Ollama gestartet?"
+        )
+    except HTTPException:
+        raise
    except Exception as e:
-        return jsonify({'error': f'Unerwarteter Fehler: {str(e)}'}), 500
+        logger.error(f"Error analyzing document: {e}")
+        return AnalyzeResponse(
+            success=False,
+            error=f"Unerwarteter Fehler: {str(e)}"
+        )

-
-@app.route('/api/health', methods=['GET'])
-def _healthCheck():
-    """Health Check Endpoint"""
-    return jsonify({'status': 'ok', 'service': 'belegscanner', 'pdfSupport': PDF_SUPPORT})
-
-
-@app.route('/api/pdf/extract', methods=['POST'])
-@_loginRequired
-def _extractPdfImages():
-    """
-    Extrahiert Bilder aus einem PDF.
-    Erwartet: { pdfBase64, page (optional, default: alle) }
-    """
+@app.post("/api/pdf/extract", tags=["PDF"])
+async def _extractPdfImages(
+    request: PdfExtractRequest,
+    authenticated: bool = Depends(_verifyApiKey)
+):
+    """Extract images from a PDF."""
    if not PDF_SUPPORT:
-        return jsonify({
-            'error': 'PDF-Support nicht verfügbar. Bitte PyMuPDF installieren: pip install pymupdf'
-        }), 501
+        raise HTTPException(
+            status_code=501,
+            detail="PDF-Support nicht verfügbar. Bitte PyMuPDF installieren: pip install pymupdf"
+        )
    
    try:
-        data = request.get_json()
-        pdfBase64 = data.get('pdfBase64')
-        pageNum = data.get('page')  # Optional: spezifische Seite
+        pdfBytes = base64.b64decode(request.pdfBase64)
        
-        if not pdfBase64:
-            return jsonify({'error': 'Kein PDF übermittelt'}), 400
-        
-        # Base64 dekodieren
-        pdfBytes = base64.b64decode(pdfBase64)
-        
-        if pageNum is not None:
-            # Einzelne Seite extrahieren
-            result = _renderPdfPageAsImage(pdfBytes, pageNum - 1)  # 0-basiert
-            return jsonify({
-                'success': True,
-                'image': result
-            })
+        if request.page is not None:
+            # Extract single page
+            result = _renderPdfPageAsImage(pdfBytes, request.page - 1)
+            return {"success": True, "image": result}
        else:
-            # Alle Seiten extrahieren (max 5)
+            # Extract all pages (max 5)
            images = _extractImagesFromPdf(pdfBytes, maxPages=5)
-            return jsonify({
-                'success': True,
-                'images': images,
-                'totalExtracted': len(images)
-            })
-        
-    except Exception as e:
-        return jsonify({'error': f'PDF-Verarbeitungsfehler: {str(e)}'}), 500
-
-
-@app.route('/api/ollama/status', methods=['GET'])
-@_loginRequired
-def _ollamaStatus():
-    """Prüft ob Ollama erreichbar ist und listet verfügbare Modelle"""
-    ollamaUrl = request.args.get('url', 'http://localhost:11434')
+            return {
+                "success": True,
+                "images": images,
+                "totalExtracted": len(images)
+            }
    
-    try:
-        # Prüfe ob Ollama läuft
-        response = requests.get(f'{ollamaUrl}/api/tags', timeout=5)
-        
-        if response.status_code != 200:
-            return jsonify({
-                'connected': False,
-                'error': f'Ollama antwortet mit Status {response.status_code}'
-            })
-        
-        data = response.json()
-        models = [m.get('name', '') for m in data.get('models', [])]
-        
-        # Filtere Vision-Modelle (enthalten oft 'vision', 'vl', 'llava' im Namen)
-        visionModels = [m for m in models if any(x in m.lower() for x in ['vision', 'vl', 'llava', 'bakllava'])]
-        
-        return jsonify({
-            'connected': True,
-            'models': models,
-            'visionModels': visionModels,
-            'totalModels': len(models)
-        })
-        
-    except requests.exceptions.ConnectionError:
-        return jsonify({
-            'connected': False,
-            'error': 'Keine Verbindung zu Ollama. Ist Ollama gestartet?'
-        })
    except Exception as e:
-        return jsonify({
-            'connected': False,
-            'error': str(e)
-        })
+        raise HTTPException(
+            status_code=500,
+            detail=f"PDF-Verarbeitungsfehler: {str(e)}"
+        )

+# ============================================================================
+# Web UI Routes (Optional - for direct browser access)
+# ============================================================================
+
+@app.get("/", response_class=HTMLResponse, tags=["Web UI"])
+async def _index(request: Request):
+    """Render index.html, the document scanner UI."""
+    context = {"request": request}
+    return templates.TemplateResponse("index.html", context)
+
+@app.get("/login", response_class=HTMLResponse, tags=["Web UI"])
+async def _loginPage(request: Request):
+    """Render login.html."""
+    context = {"request": request}
+    return templates.TemplateResponse("login.html", context)

 # ============================================================================
 # Main
 # ============================================================================

-if __name__ == '__main__':
-    print("\n" + "="*60)
-    print("  Belegscanner - KI-Dokumentenanalyse")
-    print("  Powered by Poweron")
-    print("="*60)
-    print("\n  Server läuft auf: http://localhost:5000")
-    print("  CORS ist aktiviert für alle Origins")
-    print("\n  Drücke Ctrl+C zum Beenden")
-    print("="*60 + "\n")
+if __name__ == "__main__":
+    import uvicorn
    
-    app.run(host='0.0.0.0', port=5000, debug=True)
+    print("\n" + "=" * 60)
+    print("  Private-LLM Service - KI-Dokumentenanalyse")
+    print("  Powered by PowerOn")
+    print("=" * 60)
+    print(f"\n  Server läuft auf: http://localhost:5000")
+    print(f"  API Docs: http://localhost:5000/docs")
+    print(f"  Ollama URL: {CONFIG['ollamaUrl']}")
+    print("\n  Drücke Ctrl+C zum Beenden")
+    print("=" * 60 + "\n")
+    
+    uvicorn.run(app, host="0.0.0.0", port=5000)
--- a/config.ini
+++ b/config.ini
@ -0,0 +1,21 @@
+# Private-LLM Configuration
+# =========================
+
+# API Key für eingehende Requests (das Gateway authentifiziert sich damit).
+# Muss mit Connector_AiPrivateLlm_API_SECRET in den env-Files des Gateways übereinstimmen.
+# Key generieren: python -c "import secrets; print(secrets.token_urlsafe(32))"
+# ACHTUNG: Produktive Keys nicht ins Repository einchecken; den Wert unten pro Umgebung ersetzen.
+PRIVATE_LLM_API_KEY = jL4vyNfh_tv4rxoRaHKW88sVWNHbj32GsxuKE2A8bf0
+
+# Ollama Server URL
+OLLAMA_URL = http://localhost:11434
+
+# Web UI Authentication (optional, für direkten Browser-Zugriff)
+AUTH_USERNAME = poweron
+AUTH_PASSWORD = poweron
+
+# FastAPI Secret Key (für Session-Management)
+SECRET_KEY = c8bc1cede035171dedf01f220623e185aa8b83670ef607e97d928d271ac94200
+
+# Rate Limiting
+RATE_LIMIT_REQUESTS_PER_MINUTE = 60
+RATE_LIMIT_BURST_SIZE = 10
--- a/docu/requirements.txt
+++ b/docu/requirements.txt
@ -0,0 +1,16 @@
+# FastAPI and dependencies
+fastapi>=0.109.0
+uvicorn[standard]>=0.27.0
+python-multipart>=0.0.6
+httpx>=0.26.0
+pydantic>=2.5.0
+
+# Templating for web UI
+jinja2>=3.1.0
+aiofiles>=23.0.0
+
+# PDF Support
+pymupdf>=1.24.0
+
+# Production server
+gunicorn>=21.0.0
--- a/docu/setupserver.md
+++ b/docu/setupserver.md
--- a/requirements.txt
+++ b/requirements.txt
@ -1,6 +0,0 @@
-flask>=3.0.0
-flask-cors>=4.0.0
-requests>=2.31.0
-werkzeug>=3.0.0
-pymupdf>=1.24.0
-gunicorn>=21.0.0
--- a/start-python.bat
+++ b/start-python.bat
@ -1,8 +1,8 @@
@echo off
 chcp 65001 >nul
 echo ============================================================
-echo   Belegscanner - KI-Dokumentenanalyse
-echo   Powered by Poweron
+echo   Private-LLM Service - KI-Dokumentenanalyse
+echo   Powered by PowerOn (FastAPI + Uvicorn)
 echo ============================================================
 echo.

@ -31,11 +31,14 @@ REM Dependencies installieren
 echo [2/3] Installiere Python Dependencies...
-pip install -r requirements.txt --quiet
+REM The dependency list now lives in docu\ (the root-level requirements.txt was emptied)
+pip install -r docu\requirements.txt --quiet

-echo [3/3] Starte Python Flask Server...
+echo [3/3] Starte FastAPI Server (Uvicorn)...
+echo.
+echo Server URL:    http://localhost:5000
+echo API Docs:      http://localhost:5000/docs
+echo OpenAPI JSON:  http://localhost:5000/openapi.json
 echo.
-echo Server URL: http://localhost:5000
 echo Druecke Ctrl+C zum Beenden
 echo.

-REM Flask starten
-python app.py
+REM Start the FastAPI app via Uvicorn
+uvicorn app:app --host 0.0.0.0 --port 5000 --reload
--- a/start-python.ps1
+++ b/start-python.ps1
@ -1,9 +1,9 @@
-# Belegscanner - Python Web App Starter
-# Poweron Design
+# Private-LLM Service - FastAPI Starter
+# Powered by PowerOn

 Write-Host "============================================================" -ForegroundColor Cyan
-Write-Host "  Belegscanner - KI-Dokumentenanalyse" -ForegroundColor White
-Write-Host "  Powered by Poweron" -ForegroundColor Magenta
+Write-Host "  Private-LLM Service - KI-Dokumentenanalyse" -ForegroundColor White
+Write-Host "  Powered by PowerOn (FastAPI + Uvicorn)" -ForegroundColor Magenta
 Write-Host "============================================================" -ForegroundColor Cyan
 Write-Host ""

@ -44,11 +44,14 @@ Start-Sleep -Seconds 2
 Write-Host "[2/3] Installiere Python Dependencies..." -ForegroundColor Yellow
-pip install -r requirements.txt --quiet
+# The dependency list now lives in docu/ (the root-level requirements.txt was emptied)
+pip install -r docu/requirements.txt --quiet

-Write-Host "[3/3] Starte Flask Server..." -ForegroundColor Yellow
+Write-Host "[3/3] Starte FastAPI Server (Uvicorn)..." -ForegroundColor Yellow
+Write-Host ""
+Write-Host "Server URL:    http://localhost:5000" -ForegroundColor Green
+Write-Host "API Docs:      http://localhost:5000/docs" -ForegroundColor Green
+Write-Host "OpenAPI JSON:  http://localhost:5000/openapi.json" -ForegroundColor Gray
 Write-Host ""
-Write-Host "Server URL: http://localhost:5000" -ForegroundColor Green
 Write-Host "Druecke Ctrl+C zum Beenden" -ForegroundColor Gray
 Write-Host ""

-# Flask Server starten
-python app.py
+# Start the FastAPI app via Uvicorn
+uvicorn app:app --host 0.0.0.0 --port 5000 --reload
--- a/t1.png
+++ b/t1.png
--- a/templates/index.html
+++ b/templates/index.html
@ -807,6 +807,31 @@ Falls ein Feld nicht erkennbar ist, setze den Wert auf null.</textarea>
        // Ollama Status prüfen
        checkOllamaBtn.addEventListener('click', _checkOllamaStatus);

+        /**
+         * PowerOn model definitions offered in the model dropdown.
+         * Must match MODEL_MAPPING in app.py.
+         *
+         * Fields per entry:
+         *  - name:        value of the dropdown option; also the key looked up by _isVisionModel()
+         *  - displayName: visible label of the dropdown option
+         *  - description: tooltip (title attribute) of the dropdown option
+         *  - isVision:    value _isVisionModel() returns for this model
+         *  - ollamaModel: backend model tag; the part before ':' is prefix-matched
+         *                 against the model list returned by the status check
+         */
+        const POWERON_MODELS = [
+            {
+                name: 'poweron-vision-general',
+                displayName: 'PowerOn Vision General',
+                description: 'Handschrift & allgemeine Bilder (qwen2.5vl:7b)',
+                isVision: true,
+                ollamaModel: 'qwen2.5vl:7b'
+            },
+            {
+                name: 'poweron-vision-deep',
+                displayName: 'PowerOn Vision Deep',
+                description: 'Rechnungen, Belege, Dokumente (granite3.2-vision)',
+                isVision: true,
+                ollamaModel: 'granite3.2-vision'
+            },
+            {
+                name: 'poweron-ocr-general',
+                displayName: 'PowerOn OCR General',
+                description: 'Text-Extraktion / OCR (deepseek-ocr)',
+                isVision: true,
+                ollamaModel: 'deepseek-ocr'
+            }
+        ];
+
        async function _checkOllamaStatus() {
            ollamaStatusDiv.style.display = 'block';
            ollamaStatusDiv.className = 'ollama-status loading';
@ -819,45 +844,31 @@ Falls ein Feld nicht erkennbar ist, setze den Wert auf null.</textarea>
                if (result.connected) {
                    ollamaStatusDiv.className = 'ollama-status success';
                    
-                    // Modelle in Dropdown laden
+                    // PowerOn Modelle in Dropdown laden (nur wenn Backend-Modell verfügbar)
                    modelName.innerHTML = '';
                    
-                    if (result.visionModels && result.visionModels.length > 0) {
+                    const availableModels = result.models || [];
+                    const availablePowerOnModels = POWERON_MODELS.filter(pm => 
+                        availableModels.some(m => m.startsWith(pm.ollamaModel.split(':')[0]))
+                    );
+                    
+                    if (availablePowerOnModels.length > 0) {
                        const optGroup = document.createElement('optgroup');
-                        optGroup.label = 'Vision Modelle (empfohlen)';
-                        result.visionModels.forEach(model => {
+                        optGroup.label = 'PowerOn Modelle';
+                        availablePowerOnModels.forEach(model => {
                            const opt = document.createElement('option');
-                            opt.value = model;
-                            opt.textContent = model;
+                            opt.value = model.name;
+                            opt.textContent = `${model.displayName}`;
+                            opt.title = model.description;
                            optGroup.appendChild(opt);
                        });
                        modelName.appendChild(optGroup);
+                        
+                        // Erstes Modell auswählen
+                        modelName.value = availablePowerOnModels[0].name;
                    }

-                    if (result.models && result.models.length > 0) {
-                        const otherModels = result.models.filter(m => 
-                            !result.visionModels || !result.visionModels.includes(m)
-                        );
-                        if (otherModels.length > 0) {
-                            const optGroup = document.createElement('optgroup');
-                            optGroup.label = 'Andere Modelle';
-                            otherModels.forEach(model => {
-                                const opt = document.createElement('option');
-                                opt.value = model;
-                                opt.textContent = model;
-                                optGroup.appendChild(opt);
-                            });
-                            modelName.appendChild(optGroup);
-                        }
-                    }
-
-                    // Erstes Vision-Modell auswählen falls vorhanden
-                    if (result.visionModels && result.visionModels.length > 0) {
-                        modelName.value = result.visionModels[0];
-                    }
-
-                    ollamaStatusDiv.innerHTML = `✓ Verbunden - ${result.totalModels} Modelle gefunden` +
-                        (result.visionModels?.length ? ` (${result.visionModels.length} Vision-Modelle)` : '');
+                    ollamaStatusDiv.innerHTML = `✓ Verbunden - ${availablePowerOnModels.length} PowerOn Modelle verfügbar`;
                    
                    // Button-Status nach Modell-Laden aktualisieren
                    _updateAnalyzeButtonState();
@ -875,8 +886,12 @@ Falls ein Feld nicht erkennbar ist, setze den Wert auf null.</textarea>
        // Helper: Prüft ob Modell ein Vision-Modell ist
+        /**
+         * Decide whether a model name should be treated as a vision model.
+         *
+         * @param {string} model - PowerOn model name or a raw Ollama model name.
+         * @returns {boolean} true when no model is given; the isVision flag when the
+         *     name is a known PowerOn model; otherwise whether the lower-cased name
+         *     contains one of the indicator substrings below.
+         */
        function _isVisionModel(model) {
            if (!model) return true; // Default: als Vision behandeln
+            // Check PowerOn models first
+            const powerOnModel = POWERON_MODELS.find(pm => pm.name === model);
+            if (powerOnModel) return powerOnModel.isVision;
+            // Fallback for direct Ollama model names
            const modelLower = model.toLowerCase();
-            return ['vision', 'vl', 'llava', 'bakllava'].some(indicator => modelLower.includes(indicator));
+            // NOTE(review): 'granite' matches every model name containing "granite",
+            // not only "-vision" variants (those already match 'vision') — confirm intended.
+            return ['vision', 'vl', 'llava', 'bakllava', 'granite', 'deepseek-ocr'].some(indicator => modelLower.includes(indicator));
        }

        // Button-Status basierend auf Modell und Bild aktualisieren