#!/usr/bin/env python3
"""
AI Behavior Test - Tests actual AI responses with different prompt structures.
"""
import asyncio
import json
import sys
import os
from typing import Any, Dict, List, Optional

# Add the gateway directory to the import path
sys.path.append(os.path.dirname(__file__))

# Import the service initialization
from modules.features.chatPlayground.mainChatPlayground import getServices
from modules.datamodels.datamodelAi import AiCallOptions, OperationType
from modules.datamodels.datamodelUam import User

# The test drives the AI service, which builds the JSON template and handles
# continuation looping internally.

class AIBehaviorTester:
    def __init__(self):
        # Create a minimal user context for testing
        testUser = User(
            id="test_user",
            username="test_user",
            email="test@example.com",
            fullName="Test User",
            language="en",
            mandateId="test_mandate"
        )
        # Initialize services using the existing system
        self.services = getServices(testUser, None)  # Test user, no workflow
        self.testResults = []

    async def initialize(self):
        """Initialize the AI service."""
        # Set logging level to DEBUG to see debug messages
        import logging
        logging.getLogger().setLevel(logging.DEBUG)

        # The AI service needs to be recreated with proper initialization
        from modules.services.serviceAi.mainServiceAi import AiService
        self.services.ai = await AiService.create(self.services)

        # Create a minimal workflow context
        from modules.datamodels.datamodelChat import ChatWorkflow
        import uuid
        self.services.currentWorkflow = ChatWorkflow(
            id=str(uuid.uuid4()),
            name="Test Workflow",
            status="running",
            startedAt=self.services.utils.timestampGetUtc(),
            lastActivity=self.services.utils.timestampGetUtc(),
            currentRound=1,
            currentTask=0,
            currentAction=0,
            totalTasks=0,
            totalActions=0,
            mandateId="test_mandate",
            messageIds=[],
            workflowMode="React",
            maxSteps=5
        )

    async def testPromptBehavior(self, promptName: str, prompt: str, maxIterations: int = 2) -> Dict[str, Any]:
        """Test actual AI behavior with a specific prompt structure.

        Note: maxIterations is currently unused; the AI service performs its
        continuation looping internally.
        """
        print(f"\n{'='*60}")
        print(f"TESTING AI BEHAVIOR: {promptName}")
        print(f"{'='*60}")
        print(f"User prompt: {prompt}")
        print(f"Prompt length: {len(prompt)} characters")

        accumulatedContent = []
        try:
            # Use the existing AI service with JSON format - it builds the
            # generation prompt and handles continuation looping internally.
            response = await self.services.ai.coreAi.callAiDocuments(
                prompt=prompt,  # Use the raw user prompt directly
                documents=None,
                outputFormat="json",
                title="Prime Numbers Test",
                loopInstructionFormat="json"  # Use the JSON loop instructions
            )
            if isinstance(response, dict):
                result = json.dumps(response, indent=2)
            else:
                result = str(response)
            print(f"Response length: {len(result)} characters")
            print(f"Response preview: {result[:200]}...")

            # If the service wrapped the response in an error envelope, the
            # model may still have generated content; recover it from the
            # debug files the service writes.
            if isinstance(response, dict) and not response.get("success", True):
                print("⚠️ AI returned error response, but may have generated content")
                debug_content = self._getLatestDebugResponse()
                if debug_content:
                    result = debug_content
                    print(f"📄 Found debug content: {len(result)} characters")
                    print(f"📄 Debug preview: {result[:200]}...")

            # Parse and analyze the response
            parsed_result = self._parseJsonResponse(result)
            if parsed_result:
                # Check whether the response requests a continuation
                if parsed_result.get("continuation") is not None:
                    continuation_text = parsed_result.get("continuation", "")
                    print(f"✅ Continuation detected: {str(continuation_text)[:100]}...")
                    accumulatedContent.append(result)
                    # Analyze continuation quality
                    continuation_quality = self._analyzeContinuationQuality(continuation_text)
                    print(f"   Continuation quality: {continuation_quality['score']}/10")
                    print(f"   Issues: {', '.join(continuation_quality['issues'])}")
                else:
                    print("✅ Final response received")
                    accumulatedContent.append(result)
            else:
                print("❌ Invalid JSON response")
                accumulatedContent.append(result)
        except Exception as e:
            print(f"❌ Error in AI call: {str(e)}")
            accumulatedContent.append("")

        # Analyze results
        analysis = self._analyzeBehaviorResults(promptName, accumulatedContent)
        self.testResults.append(analysis)
        return analysis
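
    # Error envelope assumed in the fallback above (hypothetical sketch): the
    # service is expected to return something like {"success": false, ...};
    # only the "success" key is actually inspected, and the generated content
    # itself is recovered from the debug files.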

    def _extractContinuationInstruction(self, response: str) -> str:
        """Extract the continuation instruction from a JSON response."""
        try:
            parsed = json.loads(response)
            return parsed.get("continuation", "")
        except (json.JSONDecodeError, TypeError):
            return ""

    def _getLatestDebugResponse(self) -> str:
        """Get the latest AI response from the debug files."""
        try:
            import glob
            # Look for the most recent debug response file
            debug_pattern = "local/logs/debug/prompts/*document_generation_response*.txt"
            debug_files = glob.glob(debug_pattern)
            if debug_files:
                # Sort by modification time, pick the most recent
                latest_file = max(debug_files, key=os.path.getmtime)
                with open(latest_file, 'r', encoding='utf-8') as f:
                    return f.read()
            return ""
        except Exception as e:
            print(f"Error reading debug file: {e}")
            return ""

    def _parseJsonResponse(self, response: str) -> Optional[Dict[str, Any]]:
        """Parse a JSON response, tolerating markdown code fences."""
        try:
            # First try direct JSON parsing
            return json.loads(response)
        except (json.JSONDecodeError, TypeError):
            try:
                # Try extracting JSON from markdown code blocks
                if "```json" in response:
                    start = response.find("```json") + 7
                    end = response.find("```", start)
                    if end > start:
                        json_str = response[start:end].strip()
                        return json.loads(json_str)
                elif "```" in response:
                    start = response.find("```") + 3
                    end = response.find("```", start)
                    if end > start:
                        json_str = response[start:end].strip()
                        return json.loads(json_str)
                return None
            except (json.JSONDecodeError, TypeError):
                return None
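
    # Illustrative inputs _parseJsonResponse accepts (values are made up for
    # demonstration, not taken from real AI output):
    #   '{"continuation": null}'                  -> parsed dict
    #   '```json\n{"continuation": {...}}\n```'   -> fence stripped, then parsed
    #   'prose with no JSON at all'               -> None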

    def _analyzeContinuationQuality(self, continuation_text: Any) -> Dict[str, Any]:
        """Analyze the quality of continuation instructions."""
        score = 10
        issues = []
        try:
            # Parse the continuation object if it arrived as a JSON string
            if isinstance(continuation_text, str):
                continuation_obj = json.loads(continuation_text)
            else:
                continuation_obj = continuation_text

            # The continuation must be an object
            if not isinstance(continuation_obj, dict):
                score -= 5
                issues.append("Not a valid object")
                return {"score": max(0, score), "issues": issues}

            # Check for last_data_items
            if "last_data_items" not in continuation_obj:
                score -= 3
                issues.append("Missing last_data_items")
            elif not continuation_obj["last_data_items"]:
                score -= 2
                issues.append("Empty last_data_items")

            # Check for next_instruction
            if "next_instruction" not in continuation_obj:
                score -= 3
                issues.append("Missing next_instruction")
            elif not continuation_obj["next_instruction"]:
                score -= 2
                issues.append("Empty next_instruction")

            # Check for specific data points in last_data_items
            if "last_data_items" in continuation_obj:
                last_items = continuation_obj["last_data_items"]
                if not any(char.isdigit() for char in str(last_items)):
                    score -= 1
                    issues.append("No specific numbers in last_data_items")

            # Check for a clear directive in next_instruction
            if "next_instruction" in continuation_obj:
                instruction = continuation_obj["next_instruction"]
                if "continue" not in str(instruction).lower():
                    score -= 1
                    issues.append("No 'continue' in next_instruction")
        except (json.JSONDecodeError, TypeError):
            score -= 5
            issues.append("Invalid JSON format")
        return {
            "score": max(0, score),
            "issues": issues
        }
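
    # A continuation payload that would pass every check above looks roughly
    # like this (field values are hypothetical, for illustration only):
    # {
    #     "last_data_items": "..., 97, 101, 103",
    #     "next_instruction": "Continue the table from the next prime after 103."
    # }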

    def _analyzeBehaviorResults(self, promptName: str, accumulatedContent: List[str]) -> Dict[str, Any]:
        """Analyze AI behavior results."""
        totalContentLength = 0
        iterations = len(accumulatedContent)
        continuationInstructions = []
        continuationQualities = []
        for content in accumulatedContent:
            parsed = self._parseJsonResponse(content)
            if parsed:
                # Count the content length of the response
                totalContentLength += len(content)
                continuation = parsed.get("continuation")
                if continuation:
                    continuationInstructions.append(continuation)
                    quality = self._analyzeContinuationQuality(continuation)
                    continuationQualities.append(quality)

        # Average continuation quality across iterations
        avgContinuationQuality = (
            sum(q["score"] for q in continuationQualities) / len(continuationQualities)
            if continuationQualities else 0
        )
        return {
            "promptName": promptName,
            "iterations": iterations,
            "totalContentLength": totalContentLength,
            "continuationInstructions": continuationInstructions,
            "avgContinuationQuality": avgContinuationQuality,
            "success": totalContentLength > 0,
            "efficiency": totalContentLength / iterations if iterations > 0 else 0
        }

    def _countPrimesInResponse(self, parsed: Dict[str, Any]) -> int:
        """Count numeric table cells in the parsed response (a proxy for the
        number of primes generated; cells are not re-checked for primality)."""
        count = 0
        if "documents" in parsed:
            for doc in parsed["documents"]:
                for section in doc.get("sections", []):
                    if section.get("content_type") == "table" and "elements" in section:
                        for element in section["elements"]:
                            for row in element.get("rows", []):
                                for cell in row:
                                    if isinstance(cell, (str, int)) and str(cell).isdigit():
                                        count += 1
        return count
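
    # Assumed shape of the parsed response this walker traverses (hypothetical
    # sketch; the actual schema is defined by the AI service's JSON document
    # template):
    # {
    #     "documents": [
    #         {"sections": [
    #             {"content_type": "table",
    #              "elements": [{"rows": [[2, 3, 5], [7, 11, 13]]}]}
    #         ]}
    #     ]
    # }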

    def printBehaviorResults(self):
        """Print AI behavior test results."""
        print(f"\n{'='*80}")
        print("AI BEHAVIOR TEST RESULTS")
        print(f"{'='*80}")
        for result in self.testResults:
            print(f"\n{result['promptName']}:")
            print(f"  Iterations: {result['iterations']}")
            print(f"  Total Content Length: {result['totalContentLength']}")
            print(f"  Efficiency: {result['efficiency']:.1f} chars/iteration")
            print(f"  Avg Continuation Quality: {result['avgContinuationQuality']:.1f}/10")
            print(f"  Success: {'✅' if result['success'] else '❌'}")
            if result['continuationInstructions']:
                print("  Continuation Instructions:")
                for i, instruction in enumerate(result['continuationInstructions']):
                    print(f"    {i+1}: {str(instruction)[:80]}...")

        # Find the best-performing prompts
        if self.testResults:
            bestEfficiency = max(self.testResults, key=lambda x: x['efficiency'])
            bestQuality = max(self.testResults, key=lambda x: x['avgContinuationQuality'])
            print(f"\n{'='*80}")
            print("BEST PERFORMERS")
            print(f"{'='*80}")
            print(f"🏆 Best Efficiency: {bestEfficiency['promptName']} ({bestEfficiency['efficiency']:.1f} chars/iteration)")
            print(f"🎯 Best Continuation Quality: {bestQuality['promptName']} ({bestQuality['avgContinuationQuality']:.1f}/10)")


# Test prompt scenarios for GENERIC continuation behavior.
# These exercise different approaches to handling ANY user prompt and ANY data type.
PROMPT_SCENARIOS = {
    "Prime Numbers Test": """Generate the first 5000 prime numbers in a table with 10 columns per row.""",
    "Text Content": """Generate a comprehensive guide about how to bring a new product to market in 10 sections, each containing detailed explanations and examples."""
}
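
# To probe another data type, a scenario can be added here, e.g. (hypothetical,
# not part of the original test matrix):
# PROMPT_SCENARIOS["Code Content"] = (
#     "Generate 20 small Python utility functions, each with a docstring and a usage example."
# )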


async def main():
    """Run AI behavior testing."""
    tester = AIBehaviorTester()
    print("Starting AI Behavior Testing...")
    print("Initializing AI service...")
    await tester.initialize()
    print(f"Testing {len(PROMPT_SCENARIOS)} different prompt scenarios")
    for promptName, prompt in PROMPT_SCENARIOS.items():
        try:
            await tester.testPromptBehavior(promptName, prompt, maxIterations=2)
        except Exception as e:
            print(f"❌ Failed to test {promptName}: {str(e)}")
    tester.printBehaviorResults()


if __name__ == "__main__":
    asyncio.run(main())
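
# Usage (run from the gateway directory so the module imports resolve):
#   python3 test_ai_behavior.py
# Assumes a configured AI backend is reachable from this environment; when a
# call fails, the tester falls back to debug responses written under
# local/logs/debug/prompts/.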