# gateway/tests/functional/test04_ai_behavior.py
# Last modified: 2025-11-17 23:12:18 +01:00 — 375 lines, 16 KiB, Python
#!/usr/bin/env python3
"""
AI Behavior Test - Tests actual AI responses with different prompt structures
"""
import asyncio
import json
import sys
import os
from typing import Dict, Any, List
# Add the gateway to path (go up 2 levels from tests/functional/)
_gateway_path = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", ".."))
if _gateway_path not in sys.path:
    sys.path.insert(0, _gateway_path)
# Import the service initialization (project-local modules; they resolve via
# the sys.path insertion above)
from modules.services import getInterface as getServices
from modules.datamodels.datamodelAi import AiCallOptions, OperationTypeEnum
from modules.datamodels.datamodelUam import User
from modules.datamodels.datamodelWorkflow import AiResponse
# The test uses the AI service which handles JSON template internally
class AIBehaviorTester:
    """Drives the AI service with different prompt structures and analyzes the
    responses: JSON validity, continuation-instruction quality, and content
    throughput per iteration.
    """

    def __init__(self):
        """Set up a root test user and the service container."""
        # Use root user for testing (has full access to everything).
        from modules.interfaces.interfaceDbAppObjects import getRootInterface
        rootInterface = getRootInterface()
        self.testUser = rootInterface.currentUser
        # Initialize services using the existing system.
        self.services = getServices(self.testUser, None)  # test user, no workflow yet
        # One analysis dict per tested prompt (see _analyzeBehaviorResults).
        self.testResults: List[Dict[str, Any]] = []

    async def initialize(self):
        """Enable debug logging and create/persist a test workflow so the AI
        service's access control finds it in the database."""
        # Set logging level to DEBUG to see debug messages.
        import logging
        logging.getLogger().setLevel(logging.DEBUG)
        # Create and save workflow in database using the interface.
        from modules.datamodels.datamodelChat import ChatWorkflow, WorkflowModeEnum
        import uuid
        import time
        import modules.interfaces.interfaceDbChatObjects as interfaceDbChatObjects
        currentTimestamp = time.time()
        testWorkflow = ChatWorkflow(
            id=str(uuid.uuid4()),
            name="Test Workflow",
            status="running",
            startedAt=currentTimestamp,
            lastActivity=currentTimestamp,
            currentRound=1,
            currentTask=0,
            currentAction=0,
            totalTasks=0,
            totalActions=0,
            mandateId=self.testUser.mandateId,
            messageIds=[],
            workflowMode=WorkflowModeEnum.WORKFLOW_DYNAMIC,
            maxSteps=5
        )
        # SAVE workflow to database so it exists for access control.
        interfaceDbChat = interfaceDbChatObjects.getInterface(self.testUser)
        workflowDict = testWorkflow.model_dump()
        interfaceDbChat.createWorkflow(workflowDict)
        # Set the workflow in services (Services class uses .workflow, not .currentWorkflow).
        self.services.workflow = testWorkflow

    async def testPromptBehavior(self, promptName: str, prompt: str, maxIterations: int = 2) -> Dict[str, Any]:
        """Run one prompt through the AI service, print diagnostics, and return
        the behavior analysis dict.

        Note: maxIterations is currently unused (reserved for multi-round
        testing); kept for interface stability.
        """
        print(f"\n{'='*60}")
        print(f"TESTING AI BEHAVIOR: {promptName}")
        print(f"{'='*60}")
        print(f"User prompt: {prompt}")
        print(f"Prompt length: {len(prompt)} characters")
        accumulatedContent = []
        # Use the AI service directly with the user prompt - it will build the
        # generation prompt internally.
        try:
            # Use callAiContent (replaces deprecated callAiDocuments).
            options = AiCallOptions(
                operationType=OperationTypeEnum.DATA_GENERATE
            )
            aiResponse: AiResponse = await self.services.ai.callAiContent(
                prompt=prompt,  # use the raw user prompt directly
                options=options,
                outputFormat="json",
                title="Prime Numbers Test"
            )
            # Extract content from AiResponse.
            if isinstance(aiResponse, AiResponse):
                result = aiResponse.content if aiResponse.content else json.dumps({})
            elif isinstance(aiResponse, dict):
                result = json.dumps(aiResponse, indent=2)
            else:
                result = str(aiResponse)
            print(f"Response length: {len(result)} characters")
            print(f"Response preview: {result[:200]}...")
            # If we got an error response, try to extract the actual AI content
            # from debug files.
            # NOTE(review): hasattr() only detects attributes; if metadata is a
            # plain dict this branch never fires - confirm metadata's type.
            if isinstance(aiResponse, AiResponse) and aiResponse.metadata and hasattr(aiResponse.metadata, 'error'):
                # The AI service wrapped the response in an error format; the
                # actual AI content may still exist in the debug files.
                print("⚠️ AI returned error response, but may have generated content")
                debug_content = self._getLatestDebugResponse()
                if debug_content:
                    result = debug_content
                    print(f"📄 Found debug content: {len(result)} characters")
                    print(f"📄 Debug preview: {result[:200]}...")
            # Parse and analyze response.
            parsed_result = self._parseJsonResponse(result)
            if parsed_result:
                # Check if continuation.
                if parsed_result.get("continuation") is not None:
                    continuation_text = parsed_result.get("continuation", "")
                    # str() because the continuation may be a dict (unsliceable).
                    print(f"✅ Continuation detected: {str(continuation_text)[:100]}...")
                    accumulatedContent.append(result)
                    # Analyze continuation quality.
                    continuation_quality = self._analyzeContinuationQuality(continuation_text)
                    print(f" Continuation quality: {continuation_quality['score']}/10")
                    print(f" Issues: {', '.join(continuation_quality['issues'])}")
                else:
                    print("✅ Final response received")
                    accumulatedContent.append(result)
            else:
                print("❌ Invalid JSON response")
                accumulatedContent.append(result)
        except Exception as e:
            import traceback
            print(f"❌ Error in AI call: {type(e).__name__}: {str(e)}")
            print(f" Traceback: {traceback.format_exc()}")
            accumulatedContent.append("")
        # Analyze results.
        analysis = self._analyzeBehaviorResults(promptName, accumulatedContent)
        self.testResults.append(analysis)
        return analysis

    def _extractContinuationInstruction(self, response: str) -> str:
        """Extract the continuation instruction from a raw JSON response.

        Returns "" when the response is not parseable JSON or not an object.
        """
        try:
            parsed = json.loads(response)
            return parsed.get("continuation", "")
        # Narrowed from a bare except: only parse/shape failures are expected.
        except (json.JSONDecodeError, TypeError, AttributeError):
            return ""

    def _getLatestDebugResponse(self) -> str:
        """Return the newest AI debug response dump, or "" when none exists or
        it cannot be read."""
        try:
            import glob
            # Debug dumps live under <repo>/local/logs/debug/prompts — one
            # level above the gateway directory (two levels up from here).
            gateway_path = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", ".."))
            gateway_dir = os.path.dirname(gateway_path)
            debug_pattern = os.path.join(gateway_dir, "local", "logs", "debug", "prompts", "*document_generation_response*.txt")
            debug_files = glob.glob(debug_pattern)
            if debug_files:
                # Sort by modification time, get the most recent.
                latest_file = max(debug_files, key=os.path.getmtime)
                with open(latest_file, 'r', encoding='utf-8') as f:
                    return f.read()
            return ""
        except Exception as e:
            # Best-effort helper: report and degrade to "no debug content".
            print(f"Error reading debug file: {e}")
            return ""

    def _parseJsonResponse(self, response: str) -> Dict[str, Any]:
        """Parse a JSON response, tolerating markdown ``` fences.

        Returns the parsed value, or None when nothing parseable is found.
        """
        try:
            # First try direct JSON parsing.
            return json.loads(response)
        # Narrowed from bare excepts: only parse failures are expected here.
        except (json.JSONDecodeError, TypeError):
            try:
                # Try extracting JSON from markdown code blocks.
                if "```json" in response:
                    start = response.find("```json") + 7
                    end = response.find("```", start)
                    if end > start:
                        json_str = response[start:end].strip()
                        return json.loads(json_str)
                elif "```" in response:
                    start = response.find("```") + 3
                    end = response.find("```", start)
                    if end > start:
                        json_str = response[start:end].strip()
                        return json.loads(json_str)
                return None
            except (json.JSONDecodeError, TypeError):
                return None

    def _analyzeContinuationQuality(self, continuation_text: Any) -> Dict[str, Any]:
        """Score (0-10) a continuation object and list its structural issues.

        Accepts either a JSON string or an already-parsed dict.
        """
        score = 10
        issues = []
        try:
            # Parse the continuation object if it arrived as a string.
            if isinstance(continuation_text, str):
                continuation_obj = json.loads(continuation_text)
            else:
                continuation_obj = continuation_text
            # Must be an object (dict) to be scored field-by-field.
            if not isinstance(continuation_obj, dict):
                score -= 5
                issues.append("Not a valid object")
                return {"score": max(0, score), "issues": issues}
            # last_data_items: required and non-empty.
            if "last_data_items" not in continuation_obj:
                score -= 3
                issues.append("Missing last_data_items")
            elif not continuation_obj["last_data_items"]:
                score -= 2
                issues.append("Empty last_data_items")
            # next_instruction: required and non-empty.
            if "next_instruction" not in continuation_obj:
                score -= 3
                issues.append("Missing next_instruction")
            elif not continuation_obj["next_instruction"]:
                score -= 2
                issues.append("Empty next_instruction")
            # last_data_items should reference concrete data points (digits).
            if "last_data_items" in continuation_obj:
                last_items = continuation_obj["last_data_items"]
                if not any(char.isdigit() for char in str(last_items)):
                    score -= 1
                    issues.append("No specific numbers in last_data_items")
            # next_instruction should explicitly say "continue".
            if "next_instruction" in continuation_obj:
                instruction = continuation_obj["next_instruction"]
                # str() guards against non-string values (original raised
                # AttributeError here and the except below did not catch it).
                if "continue" not in str(instruction).lower():
                    score -= 1
                    issues.append("No 'continue' in next_instruction")
        except (json.JSONDecodeError, TypeError):
            score -= 5
            issues.append("Invalid JSON format")
        return {
            "score": max(0, score),
            "issues": issues
        }

    def _analyzeBehaviorResults(self, promptName: str, accumulatedContent: List[str]) -> Dict[str, Any]:
        """Aggregate per-iteration responses into a summary dict: content
        lengths, continuation instructions and their quality, success flag,
        and chars-per-iteration efficiency."""
        totalContentLength = 0
        iterations = len(accumulatedContent)
        continuationInstructions = []
        continuationQualities = []
        for content in accumulatedContent:
            parsed = self._parseJsonResponse(content)
            # isinstance guard: top-level JSON arrays have no .get() and
            # previously crashed this loop.
            if isinstance(parsed, dict):
                # Count raw content length of each parseable response.
                totalContentLength += len(content)
                continuation = parsed.get("continuation")
                if continuation:
                    continuationInstructions.append(continuation)
                    continuationQualities.append(self._analyzeContinuationQuality(continuation))
        # Average quality over iterations that produced a continuation.
        avgContinuationQuality = (
            sum(q["score"] for q in continuationQualities) / len(continuationQualities)
            if continuationQualities else 0
        )
        return {
            "promptName": promptName,
            "iterations": iterations,
            "totalContentLength": totalContentLength,
            "continuationInstructions": continuationInstructions,
            "avgContinuationQuality": avgContinuationQuality,
            "success": totalContentLength > 0,
            "efficiency": totalContentLength / iterations if iterations > 0 else 0
        }

    @staticmethod
    def _isPrime(n: int) -> bool:
        """Return True when n is prime (trial division; fine for test sizes)."""
        if n < 2:
            return False
        if n < 4:  # 2 and 3
            return True
        if n % 2 == 0:
            return False
        i = 3
        while i * i <= n:
            if n % i == 0:
                return False
            i += 2
        return True

    def _countPrimesInResponse(self, parsed: Dict[str, Any]) -> int:
        """Count prime numbers appearing in table cells of the parsed response.

        Fix: the original counted every numeric cell regardless of primality,
        contradicting the method's contract; cells are now primality-checked.
        """
        count = 0
        for doc in parsed.get("documents", []):
            for section in doc.get("sections", []):
                if section.get("content_type") == "table":
                    for element in section.get("elements", []):
                        for row in element.get("rows", []):
                            for cell in row:
                                if isinstance(cell, (str, int)) and str(cell).isdigit():
                                    if self._isPrime(int(cell)):
                                        count += 1
        return count

    def printBehaviorResults(self):
        """Print a summary of all collected results plus the best performers
        by efficiency and by continuation quality."""
        print(f"\n{'='*80}")
        print("AI BEHAVIOR TEST RESULTS")
        print(f"{'='*80}")
        for result in self.testResults:
            print(f"\n{result['promptName']}:")
            print(f" Iterations: {result['iterations']}")
            print(f" Total Content Length: {result['totalContentLength']}")
            print(f" Efficiency: {result['efficiency']:.1f} chars/iteration")
            print(f" Avg Continuation Quality: {result['avgContinuationQuality']:.1f}/10")
            # Fix: both branches of this conditional were empty strings (the
            # emoji were lost); restored to match the file's ✅/❌ convention.
            print(f" Success: {'✅' if result['success'] else '❌'}")
            if result['continuationInstructions']:
                print(f" Continuation Instructions:")
                for i, instruction in enumerate(result['continuationInstructions']):
                    # str() because instructions may be dicts (unsliceable).
                    print(f" {i+1}: {str(instruction)[:80]}...")
        # Find best performing prompt.
        if self.testResults:
            bestEfficiency = max(self.testResults, key=lambda x: x['efficiency'])
            bestQuality = max(self.testResults, key=lambda x: x['avgContinuationQuality'])
            print(f"\n{'='*80}")
            print("BEST PERFORMERS")
            print(f"{'='*80}")
            print(f"🏆 Best Efficiency: {bestEfficiency['promptName']} ({bestEfficiency['efficiency']:.1f} chars/iteration)")
            print(f"🎯 Best Continuation Quality: {bestQuality['promptName']} ({bestQuality['avgContinuationQuality']:.1f}/10)")
# Test prompt scenarios for GENERIC continuation behavior.
# Each scenario exercises a different approach so the handling works for ANY
# user prompt and ANY data type.
PROMPT_SCENARIOS = {
    "Prime Numbers Test": (
        "Generate the first 5000 prime numbers in a table with 10 columns per row."
    ),
    "Text Content": (
        "Generate a comprehensive guide about how to bring a new product to market "
        "in 10 sections, each containing detailed explanations and examples."
    ),
}
async def main():
    """Run AI behavior testing: initialize the tester, feed it every scenario,
    then print the aggregated results."""
    tester = AIBehaviorTester()
    print("Starting AI Behavior Testing...")
    print("Initializing AI service...")
    await tester.initialize()
    scenarioCount = len(PROMPT_SCENARIOS)
    print(f"Testing {scenarioCount} different prompt scenarios")
    # A failure in one scenario must not abort the remaining ones.
    for promptName, prompt in PROMPT_SCENARIOS.items():
        try:
            await tester.testPromptBehavior(promptName, prompt, maxIterations=2)
        except Exception as e:
            print(f"❌ Failed to test {promptName}: {str(e)}")
    tester.printBehaviorResults()
if __name__ == "__main__":
asyncio.run(main())