Reduce context size for vision models to fit in RAM

Co-authored-by: Cursor <cursoragent@cursor.com>
2026-02-06 14:45:52 +01:00 · 2026-02-06 14:45:52 +01:00 · f9b91501d8
commit f9b91501d8
parent 11c9b64e14
1 changed file with 9 additions and 5 deletions
--- a/app.py
+++ b/app.py
@@ -551,13 +551,17 @@ async def _analyzeDocument(
        if not request.prompt:
            raise HTTPException(status_code=400, detail="Kein Prompt übermittelt")
        
-        # Model-specific context lengths (actual model limits)
+        # Model-specific context lengths (reduced for RAM constraints)
+        # Server has 31GB RAM + 22GB GPU - vision models need smaller context
        modelContextLengths = {
-            "qwen2.5:7b": 32768,         # Use 32K (model supports 128K but RAM limited)
-            "qwen2.5vl:7b": 32768,       # Use 32K (model supports 125K but RAM limited)
-            "granite3.2-vision": 16000,  # 16K context
+            "qwen2.5:7b": 8192,          # Text model - 8K context
+            "qwen2.5vl:7b": 4096,        # Vision model - 4K context (images use lots of RAM)
+            "granite3.2-vision": 4096,   # Vision model - 4K context
+            "granite3.2-vision:latest": 4096,
+            "deepseek-ocr": 4096,        # OCR model - 4K context
+            "deepseek-ocr:latest": 4096,
        }
-        numCtx = modelContextLengths.get(internalModelName, 8192)
+        numCtx = modelContextLengths.get(internalModelName, 4096)
        
        # Build request body with model-specific context window
        requestBody = {