Reduce context size for vision models to fit in RAM
Co-authored-by: Cursor <cursoragent@cursor.com>
This commit is contained in:
parent
11c9b64e14
commit
f9b91501d8
1 changed file with 9 additions and 5 deletions
14
app.py
14
app.py
|
|
@@ -551,13 +551,17 @@ async def _analyzeDocument(
|
|||
if not request.prompt:
|
||||
raise HTTPException(status_code=400, detail="Kein Prompt übermittelt")
|
||||
|
||||
# Model-specific context lengths (actual model limits)
|
||||
# Model-specific context lengths (reduced for RAM constraints)
|
||||
# Server has 31GB RAM + 22GB GPU - vision models need smaller context
|
||||
modelContextLengths = {
|
||||
"qwen2.5:7b": 32768, # Use 32K (model supports 128K but RAM limited)
|
||||
"qwen2.5vl:7b": 32768, # Use 32K (model supports 125K but RAM limited)
|
||||
"granite3.2-vision": 16000, # 16K context
|
||||
"qwen2.5:7b": 8192, # Text model - 8K context
|
||||
"qwen2.5vl:7b": 4096, # Vision model - 4K context (images use lots of RAM)
|
||||
"granite3.2-vision": 4096, # Vision model - 4K context
|
||||
"granite3.2-vision:latest": 4096,
|
||||
"deepseek-ocr": 4096, # OCR model - 4K context
|
||||
"deepseek-ocr:latest": 4096,
|
||||
}
|
||||
numCtx = modelContextLengths.get(internalModelName, 8192)
|
||||
numCtx = modelContextLengths.get(internalModelName, 4096)
|
||||
|
||||
# Build request body with model-specific context window
|
||||
requestBody = {
|
||||
|
|
|
|||
Loading…
Reference in a new issue