Reduce context size for vision models to fit in RAM

Co-authored-by: Cursor <cursoragent@cursor.com>
This commit is contained in:
ValueOn AG 2026-02-06 14:45:52 +01:00
parent 11c9b64e14
commit f9b91501d8

14
app.py
View file

@@ -551,13 +551,17 @@ async def _analyzeDocument(
if not request.prompt:
raise HTTPException(status_code=400, detail="Kein Prompt übermittelt")
# Model-specific context lengths (actual model limits)
# Model-specific context lengths (reduced for RAM constraints)
# Server has 31GB RAM + 22GB GPU memory, so vision models need a smaller context
modelContextLengths = {
"qwen2.5:7b": 32768, # Use 32K (model supports 128K but RAM limited)
"qwen2.5vl:7b": 32768, # Use 32K (model supports 125K but RAM limited)
"granite3.2-vision": 16000, # 16K context
"qwen2.5:7b": 8192, # Text model - 8K context
"qwen2.5vl:7b": 4096, # Vision model - 4K context (images use lots of RAM)
"granite3.2-vision": 4096, # Vision model - 4K context
"granite3.2-vision:latest": 4096,
"deepseek-ocr": 4096, # OCR model - 4K context
"deepseek-ocr:latest": 4096,
}
numCtx = modelContextLengths.get(internalModelName, 8192)
numCtx = modelContextLengths.get(internalModelName, 4096)
# Build request body with model-specific context window
requestBody = {