gateway/test_ai_model_selection.py

#!/usr/bin/env python3
"""
AI Model Selection Test - Prints prioritized fallback model lists used for AI calls

Scenarios mirror typical calls in workflows/ (task planning, action planning,
analysis, and react-mode decisions), showing which models are shortlisted and
their final prioritized order after rating and cost tie-breaking.
"""
import asyncio
import os
import sys
from typing import List, Tuple

# Ensure gateway is on path when running directly
sys.path.append(os.path.dirname(__file__))

from modules.features.chatPlayground.mainChatPlayground import getServices
from modules.datamodels.datamodelAi import (
    AiCallOptions,
    OperationTypeEnum,
    PriorityEnum,
    ProcessingModeEnum,
)
from modules.datamodels.datamodelUam import User
from modules.aicore.aicoreModelRegistry import modelRegistry
from modules.aicore.aicoreModelSelector import modelSelector
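
# modelRegistry and modelSelector are imported module-level instances and are queried
# directly below; AiService.create() in initialize() is presumably what populates the
# registry with the models that are actually available.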


class ModelSelectionTester:
    def __init__(self) -> None:
        testUser = User(
            id="test_user_models",
            username="test_models",
            email="test@example.com",
            fullName="Test Models",
            language="en",
            mandateId="test_mandate",
        )
        self.services = getServices(testUser, None)

    async def initialize(self) -> None:
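        # AiService is imported lazily inside the method, presumably to avoid pulling in
        # the full service stack (or a circular import) at module import time.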
        from modules.services.serviceAi.mainServiceAi import AiService
        self.services.ai = await AiService.create(self.services)

    async def _printFallbackListWithContext(self, title: str, prompt: str, context: str, options: AiCallOptions) -> None:
        print(f"\n{'='*80}")
        print(f"{title}")
        print(f"{'='*80}")
        print(
            f"Operation={options.operationType.name}, Priority={options.priority.name}, ProcessingMode={options.processingMode.name}"
        )
        # Show context and prompt sizes
        promptSize = len(prompt.encode("utf-8"))
        contextSize = len(context.encode("utf-8"))
        totalSize = promptSize + contextSize
        print(f"Prompt size: {promptSize} bytes, Context size: {contextSize} bytes, Total: {totalSize} bytes")
        availableModels = modelRegistry.getAvailableModels()
        failoverModelList = modelSelector.getFailoverModelList(
            prompt=prompt,
            context=context,
            options=options,
            availableModels=availableModels,
        )
        if not failoverModelList:
            print("No suitable models found (capability filter returned empty list).")
            return

        print("Prioritized fallback model sequence (name | quality | speed | $/1k in | ctx | score):")
        for idx, m in enumerate(failoverModelList, 1):
            costIn = getattr(m, "costPer1kTokensInput", 0.0)
            sizeRating = modelSelector._getSizeRating(m, totalSize)
            processingModeRating = modelSelector._getProcessingModeRating(m.processingMode, options.processingMode)
            priorityRating = modelSelector._getPriorityRating(m, options.priority)
            totalScore = sizeRating + processingModeRating + priorityRating
            print(
                f" {idx:>2}. {m.name} | Q={getattr(m, 'qualityRating', 0)} | S={getattr(m, 'speedRating', 0)} | ${costIn:.4f} | ctx={getattr(m, 'contextLength', 0)} | score={totalScore:.3f}"
            )
            print(f" Size: {sizeRating:.3f}, ProcessingMode: {processingModeRating:.3f}, Priority: {priorityRating:.3f}")

    async def _printFallbackList(self, title: str, prompt: str, options: AiCallOptions) -> None:
        # These scenarios have no separate context, so delegate with an empty context string;
        # the output is identical to the with-context variant above.
        await self._printFallbackListWithContext(title, prompt, "", options)

    async def run(self) -> None:
        # Scenarios reflecting workflows/
        scenarios: List[Tuple[str, str, AiCallOptions]] = []
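        # Each scenario is a (title, prompt, options) tuple; the prompts stand in for the
        # real workflow prompts, and their byte size feeds the size-based scoring shown in
        # the output.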
        # Task planning (taskPlanner, modeActionplan)
        scenarios.append(
            (
                "PLAN - Quality, Detailed",
                "Task planning for a multi-step business workflow.",
                AiCallOptions(
                    operationType=OperationTypeEnum.PLAN,
                    priority=PriorityEnum.QUALITY,
                    compressPrompt=False,
                    compressContext=False,
                    processingMode=ProcessingModeEnum.DETAILED,
                    maxCost=0.10,
                    maxProcessingTime=30,
                ),
            )
        )
        # Result validation / analysis (modeActionplan)
        scenarios.append(
            (
                "ANALYSE - Balanced, Advanced",
                "Validate action plan correctness and completeness.",
                AiCallOptions(
                    operationType=OperationTypeEnum.ANALYSE,
                    priority=PriorityEnum.BALANCED,
                    compressPrompt=True,
                    compressContext=False,
                    processingMode=ProcessingModeEnum.ADVANCED,
                    maxCost=0.05,
                    maxProcessingTime=30,
                ),
            )
        )
        # React mode - action selection (modeReact)
        scenarios.append(
            (
                "GENERAL - Balanced, Advanced (React: action selection)",
                "Select next best action from context and state.",
                AiCallOptions(
                    operationType=OperationTypeEnum.GENERAL,
                    priority=PriorityEnum.BALANCED,
                    compressPrompt=True,
                    compressContext=True,
                    processingMode=ProcessingModeEnum.ADVANCED,
                    maxCost=0.03,
                    maxProcessingTime=20,
                ),
            )
        )
        # React mode - parameter suggestion (modeReact example)
        scenarios.append(
            (
                "ANALYSE - Balanced, Advanced (React: parameter suggestion)",
                "Suggest parameters for the selected action as JSON.",
                AiCallOptions(
                    operationType=OperationTypeEnum.ANALYSE,
                    priority=PriorityEnum.BALANCED,
                    compressPrompt=True,
                    compressContext=False,
                    processingMode=ProcessingModeEnum.ADVANCED,
                    maxCost=0.05,
                    maxProcessingTime=30,
                    resultFormat="json",
                    temperature=0.3,
                ),
            )
        )
        # Intent analysis (user input understanding)
        scenarios.append(
            (
                "ANALYSE - Quality, Detailed (Intent Analysis)",
                "Analyze user intent and extract key requirements from the following request: 'I need to create a comprehensive marketing strategy for our new product launch including budget allocation, timeline, and target audience analysis.'",
                AiCallOptions(
                    operationType=OperationTypeEnum.ANALYSE,
                    priority=PriorityEnum.QUALITY,
                    compressPrompt=False,
                    compressContext=False,
                    processingMode=ProcessingModeEnum.DETAILED,
                    maxCost=0.08,
                    maxProcessingTime=45,
                    resultFormat="json",
                    temperature=0.2,
                ),
            )
        )
        # Review/Validation (quality assurance)
        scenarios.append(
            (
                "ANALYSE - Quality, Detailed (Review/Validation)",
                "Review and validate the following business proposal for completeness, accuracy, and compliance with industry standards. Identify any gaps or areas for improvement.",
                AiCallOptions(
                    operationType=OperationTypeEnum.ANALYSE,
                    priority=PriorityEnum.QUALITY,
                    compressPrompt=False,
                    compressContext=False,
                    processingMode=ProcessingModeEnum.DETAILED,
                    maxCost=0.10,
                    maxProcessingTime=60,
                    resultFormat="json",
                    temperature=0.1,
                ),
            )
        )
        # Large context scenario (to test size-based scoring)
        scenarios.append(
            (
                "GENERAL - Balanced, Advanced (Large Context Test)",
                "Process this large document and provide a comprehensive summary.",
                AiCallOptions(
                    operationType=OperationTypeEnum.GENERAL,
                    priority=PriorityEnum.BALANCED,
                    compressPrompt=False,
                    compressContext=False,
                    processingMode=ProcessingModeEnum.ADVANCED,
                    maxCost=0.15,
                    maxProcessingTime=120,
                ),
            )
        )

        # Iterate and print lists
        for title, prompt, options in scenarios:
            await self._printFallbackList(title, prompt, options)

        # Test with actual context to see size-based scoring
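        # largeContext repeats an ~1 KB paragraph ten times, i.e. roughly 10 KB of context,
        # which should exercise the size rating very differently than the short prompts above.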
largeContext = """
This is a comprehensive business document containing detailed information about our company's strategic initiatives,
financial performance, market analysis, competitive landscape, operational metrics, customer feedback,
product development roadmap, technology stack, human resources, legal compliance, risk management,
sustainability efforts, and future growth plans. The document spans multiple sections including executive summary,
market research, financial statements, operational reports, customer insights, product specifications,
technology architecture, HR policies, legal frameworks, risk assessments, environmental impact studies,
and strategic recommendations. This extensive content is designed to test the model selection algorithm's
ability to handle large context sizes and make intelligent decisions about which models are best suited
for processing such substantial amounts of information while maintaining efficiency and cost-effectiveness.
""" * 10 # Repeat to make it even larger
await self._printFallbackListWithContext(
"GENERAL - Balanced, Advanced (Large Context Test)",
"Analyze this comprehensive business document and provide key insights.",
largeContext,
AiCallOptions(
operationType=OperationTypeEnum.GENERAL,
priority=PriorityEnum.BALANCED,
compressPrompt=False,
compressContext=False,
processingMode=ProcessingModeEnum.ADVANCED,
maxCost=0.15,
maxProcessingTime=120,
),
)


async def main() -> None:
    tester = ModelSelectionTester()
    await tester.initialize()
    await tester.run()


if __name__ == "__main__":
    asyncio.run(main())