From f9b91501d84031a43d39e0db0fa9a481b052d6d7 Mon Sep 17 00:00:00 2001
From: ValueOn AG
Date: Fri, 6 Feb 2026 14:45:52 +0100
Subject: [PATCH] Reduce context size for text and vision models to fit in RAM
Co-authored-by: Cursor
---
app.py | 14 +++++++++-----
1 file changed, 9 insertions(+), 5 deletions(-)
diff --git a/app.py b/app.py
index 0b7c73c..40a1a73 100644
--- a/app.py
+++ b/app.py
@@ -551,13 +551,17 @@ async def _analyzeDocument(
if not request.prompt:
raise HTTPException(status_code=400, detail="Kein Prompt übermittelt")
- # Model-specific context lengths (actual model limits)
+ # Model-specific context lengths (reduced for RAM constraints)
+ # Server has 31 GB RAM and 22 GB GPU memory; vision models need a smaller context
modelContextLengths = {
- "qwen2.5:7b": 32768, # Use 32K (model supports 128K but RAM limited)
- "qwen2.5vl:7b": 32768, # Use 32K (model supports 125K but RAM limited)
- "granite3.2-vision": 16000, # 16K context
+ "qwen2.5:7b": 8192, # Text model - 8K context
+ "qwen2.5vl:7b": 4096, # Vision model - 4K context (images use lots of RAM)
+ "granite3.2-vision": 4096, # Vision model - 4K context
+ "granite3.2-vision:latest": 4096,
+ "deepseek-ocr": 4096, # OCR model - 4K context
+ "deepseek-ocr:latest": 4096,
}
- numCtx = modelContextLengths.get(internalModelName, 8192)
+ numCtx = modelContextLengths.get(internalModelName, 4096)
# Build request body with model-specific context window
requestBody = {