From f9b91501d84031a43d39e0db0fa9a481b052d6d7 Mon Sep 17 00:00:00 2001 From: ValueOn AG Date: Fri, 6 Feb 2026 14:45:52 +0100 Subject: [PATCH] Reduce context size for vision models to fit in RAM Co-authored-by: Cursor --- app.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/app.py b/app.py index 0b7c73c..40a1a73 100644 --- a/app.py +++ b/app.py @@ -551,13 +551,17 @@ async def _analyzeDocument( if not request.prompt: raise HTTPException(status_code=400, detail="Kein Prompt übermittelt") - # Model-specific context lengths (actual model limits) + # Model-specific context lengths (reduced for RAM constraints) + # Server has 31GB RAM + 22GB GPU - vision models need smaller context modelContextLengths = { - "qwen2.5:7b": 32768, # Use 32K (model supports 128K but RAM limited) - "qwen2.5vl:7b": 32768, # Use 32K (model supports 125K but RAM limited) - "granite3.2-vision": 16000, # 16K context + "qwen2.5:7b": 8192, # Text model - 8K context + "qwen2.5vl:7b": 4096, # Vision model - 4K context (images use lots of RAM) + "granite3.2-vision": 4096, # Vision model - 4K context + "granite3.2-vision:latest": 4096, + "deepseek-ocr": 4096, # OCR model - 4K context + "deepseek-ocr:latest": 4096, } - numCtx = modelContextLengths.get(internalModelName, 8192) + numCtx = modelContextLengths.get(internalModelName, 4096) # Build request body with model-specific context window requestBody = {