#!/usr/bin/env python3
"""
AI Behavior Test - Tests actual AI responses with different prompt structures
"""
import asyncio
import json
import sys
import os
from typing import Any, Dict, List, Optional

# Add the gateway to path
sys.path.append(os.path.dirname(__file__))

# Import the service initialization
from modules.features.chatPlayground.mainChatPlayground import getServices
from modules.datamodels.datamodelAi import AiCallOptions, OperationTypeEnum
from modules.datamodels.datamodelUam import User

# The test uses the AI service which handles JSON template internally


class AIBehaviorTester:
    """Drives the AI service with different user prompts and scores the
    JSON responses, in particular the quality of "continuation" objects
    the model emits for long generations."""

    def __init__(self):
        # Create a minimal user context for testing
        testUser = User(
            id="test_user",
            username="test_user",
            email="test@example.com",
            fullName="Test User",
            language="en",
            mandateId="test_mandate"
        )
        # Initialize services using the existing system
        self.services = getServices(testUser, None)  # Test user, no workflow
        # Accumulates one analysis dict per tested prompt (see _analyzeBehaviorResults)
        self.testResults: List[Dict[str, Any]] = []

    async def initialize(self):
        """Initialize the AI service and attach a minimal workflow context."""
        # Set logging level to DEBUG to see debug messages
        import logging
        logging.getLogger().setLevel(logging.DEBUG)

        # The AI service needs to be recreated with proper initialization
        # (the one from getServices() is not fully set up for this test).
        from modules.services.serviceAi.mainServiceAi import AiService
        self.services.ai = await AiService.create(self.services)

        # Create a minimal workflow context
        from modules.datamodels.datamodelChat import ChatWorkflow
        import uuid
        self.services.currentWorkflow = ChatWorkflow(
            id=str(uuid.uuid4()),
            name="Test Workflow",
            status="running",
            startedAt=self.services.utils.timestampGetUtc(),
            lastActivity=self.services.utils.timestampGetUtc(),
            currentRound=1,
            currentTask=0,
            currentAction=0,
            totalTasks=0,
            totalActions=0,
            mandateId="test_mandate",
            messageIds=[],
            workflowMode="React",
            maxSteps=5
        )

    async def testPromptBehavior(self, promptName: str, prompt: str, maxIterations: int = 2) -> Dict[str, Any]:
        """Test actual AI behavior with a specific prompt structure.

        Args:
            promptName: Human-readable label used in the printed report.
            prompt: Raw user prompt passed straight to the AI service.
            maxIterations: Currently unused; the AI service handles its own
                internal looping. Kept for interface compatibility.

        Returns:
            The analysis dict produced by _analyzeBehaviorResults (also
            appended to self.testResults).
        """
        print(f"\n{'='*60}")
        print(f"TESTING AI BEHAVIOR: {promptName}")
        print(f"{'='*60}")
        print(f"User prompt: {prompt}")
        print(f"Prompt length: {len(prompt)} characters")

        accumulatedContent = []

        # Use the AI service directly with the user prompt - it will build the
        # generation prompt internally.
        try:
            # Use the existing AI service with JSON format - it handles looping internally
            response = await self.services.ai.callAiDocuments(
                prompt=prompt,  # Use the raw user prompt directly
                documents=None,
                outputFormat="json",
                title="Prime Numbers Test"
            )

            if isinstance(response, dict):
                result = json.dumps(response, indent=2)
            else:
                result = str(response)

            print(f"Response length: {len(result)} characters")
            print(f"Response preview: {result[:200]}...")

            # If we got an error response, try to extract the actual AI content
            # from debug files.
            if isinstance(response, dict) and not response.get("success", True):
                # The AI service wrapped the response in an error format;
                # we need to get the actual AI content from the debug files.
                print("⚠️ AI returned error response, but may have generated content")
                debug_content = self._getLatestDebugResponse()
                if debug_content:
                    result = debug_content
                    print(f"📄 Found debug content: {len(result)} characters")
                    print(f"📄 Debug preview: {result[:200]}...")

            # Parse and analyze response
            parsed_result = self._parseJsonResponse(result)
            if parsed_result:
                # Check if continuation
                if parsed_result.get("continuation") is not None:
                    continuation_text = parsed_result.get("continuation", "")
                    print(f"✅ Continuation detected: {continuation_text[:100]}...")
                    accumulatedContent.append(result)

                    # Analyze continuation quality
                    continuation_quality = self._analyzeContinuationQuality(continuation_text)
                    print(f" Continuation quality: {continuation_quality['score']}/10")
                    print(f" Issues: {', '.join(continuation_quality['issues'])}")
                else:
                    print("✅ Final response received")
                    accumulatedContent.append(result)
            else:
                print("❌ Invalid JSON response")
                accumulatedContent.append(result)
        except Exception as e:
            # Top-level boundary for the AI call: report and record an empty
            # iteration so the analysis still runs.
            print(f"❌ Error in AI call: {str(e)}")
            accumulatedContent.append("")

        # Analyze results
        result = self._analyzeBehaviorResults(promptName, accumulatedContent)
        self.testResults.append(result)
        return result

    def _extractContinuationInstruction(self, response: str) -> str:
        """Extract continuation instruction from response; "" on any parse failure."""
        try:
            parsed = json.loads(response)
            return parsed.get("continuation", "")
        except (json.JSONDecodeError, TypeError, AttributeError):
            # Narrowed from a bare except: only parse/shape failures are expected here.
            return ""

    def _getLatestDebugResponse(self) -> str:
        """Get the latest AI response from debug files ("" if none found)."""
        try:
            import glob
            import os

            # Look for the most recent debug response file
            debug_pattern = "local/logs/debug/prompts/*document_generation_response*.txt"
            debug_files = glob.glob(debug_pattern)
            if debug_files:
                # Sort by modification time, get the most recent
                latest_file = max(debug_files, key=os.path.getmtime)
                with open(latest_file, 'r', encoding='utf-8') as f:
                    return f.read()
            return ""
        except Exception as e:
            # Best-effort helper: a missing/unreadable debug file must not
            # abort the test run.
            print(f"Error reading debug file: {e}")
            return ""

    def _parseJsonResponse(self, response: str) -> Optional[Dict[str, Any]]:
        """Parse a JSON response, also accepting markdown-fenced JSON.

        Returns the parsed object, or None when nothing parseable is found.
        (Annotation corrected to Optional: the original claimed Dict but
        returned None on failure.)
        """
        try:
            # First try direct JSON parsing
            return json.loads(response)
        except (json.JSONDecodeError, TypeError):
            try:
                # Try extracting JSON from markdown code blocks
                if "```json" in response:
                    start = response.find("```json") + 7
                    end = response.find("```", start)
                    if end > start:
                        json_str = response[start:end].strip()
                        return json.loads(json_str)
                elif "```" in response:
                    start = response.find("```") + 3
                    end = response.find("```", start)
                    if end > start:
                        json_str = response[start:end].strip()
                        return json.loads(json_str)
                return None
            except (json.JSONDecodeError, TypeError):
                return None

    def _analyzeContinuationQuality(self, continuation_text: str) -> Dict[str, Any]:
        """Analyze the quality of continuation instructions.

        Starts from a perfect score of 10 and deducts points for missing or
        weak fields; returns {"score": 0..10, "issues": [str, ...]}.
        """
        score = 10
        issues = []
        try:
            # Parse the continuation object (it may arrive as a JSON string
            # or as an already-parsed dict).
            if isinstance(continuation_text, str):
                continuation_obj = json.loads(continuation_text)
            else:
                continuation_obj = continuation_text

            # Check for required fields
            if not isinstance(continuation_obj, dict):
                score -= 5
                issues.append("Not a valid object")
                return {"score": max(0, score), "issues": issues}

            # Check for last_data_items
            if "last_data_items" not in continuation_obj:
                score -= 3
                issues.append("Missing last_data_items")
            elif not continuation_obj["last_data_items"]:
                score -= 2
                issues.append("Empty last_data_items")

            # Check for next_instruction
            if "next_instruction" not in continuation_obj:
                score -= 3
                issues.append("Missing next_instruction")
            elif not continuation_obj["next_instruction"]:
                score -= 2
                issues.append("Empty next_instruction")

            # Check for specific data points in last_data_items
            if "last_data_items" in continuation_obj:
                last_items = continuation_obj["last_data_items"]
                if not any(char.isdigit() for char in str(last_items)):
                    score -= 1
                    issues.append("No specific numbers in last_data_items")

            # Check for clear instruction in next_instruction
            if "next_instruction" in continuation_obj:
                instruction = continuation_obj["next_instruction"]
                if "continue" not in instruction.lower():
                    score -= 1
                    issues.append("No 'continue' in next_instruction")
        except (json.JSONDecodeError, TypeError):
            score -= 5
            issues.append("Invalid JSON format")

        return {
            "score": max(0, score),
            "issues": issues
        }

    def _analyzeBehaviorResults(self, promptName: str, accumulatedContent: List[str]) -> Dict[str, Any]:
        """Aggregate per-iteration responses into one summary dict."""
        totalContentLength = 0
        iterations = len(accumulatedContent)
        continuationInstructions = []
        continuationQualities = []

        for i, content in enumerate(accumulatedContent):
            parsed = self._parseJsonResponse(content)
            if parsed:
                # Count content length in the response
                contentLength = len(content)
                totalContentLength += contentLength

                continuation = parsed.get("continuation")
                if continuation:
                    continuationInstructions.append(continuation)
                    quality = self._analyzeContinuationQuality(continuation)
                    continuationQualities.append(quality)

        # Calculate averages (guard against empty lists / zero iterations)
        avgContinuationQuality = sum(q["score"] for q in continuationQualities) / len(continuationQualities) if continuationQualities else 0

        return {
            "promptName": promptName,
            "iterations": iterations,
            "totalContentLength": totalContentLength,
            "continuationInstructions": continuationInstructions,
            "avgContinuationQuality": avgContinuationQuality,
            "success": totalContentLength > 0,
            "efficiency": totalContentLength / iterations if iterations > 0 else 0
        }

    def _countPrimesInResponse(self, parsed: Dict[str, Any]) -> int:
        """Count numeric table cells in the parsed response.

        NOTE(review): despite the name, this does NOT test primality — it
        counts every all-digit cell in table sections, as a rough proxy for
        "how many primes were emitted".
        """
        count = 0
        if "documents" in parsed:
            for doc in parsed["documents"]:
                if "sections" in doc:
                    for section in doc["sections"]:
                        if section.get("content_type") == "table" and "elements" in section:
                            for element in section["elements"]:
                                if "rows" in element:
                                    for row in element["rows"]:
                                        for cell in row:
                                            if isinstance(cell, (str, int)) and str(cell).isdigit():
                                                count += 1
        return count

    def printBehaviorResults(self):
        """Print AI behavior test results."""
        print(f"\n{'='*80}")
        print("AI BEHAVIOR TEST RESULTS")
        print(f"{'='*80}")

        for result in self.testResults:
            print(f"\n{result['promptName']}:")
            print(f" Iterations: {result['iterations']}")
            print(f" Total Content Length: {result['totalContentLength']}")
            print(f" Efficiency: {result['efficiency']:.1f} chars/iteration")
            print(f" Avg Continuation Quality: {result['avgContinuationQuality']:.1f}/10")
            print(f" Success: {'✅' if result['success'] else '❌'}")
            if result['continuationInstructions']:
                print(f" Continuation Instructions:")
                for i, instruction in enumerate(result['continuationInstructions']):
                    print(f" {i+1}: {instruction[:80]}...")

        # Find best performing prompt
        if self.testResults:
            bestEfficiency = max(self.testResults, key=lambda x: x['efficiency'])
            bestQuality = max(self.testResults, key=lambda x: x['avgContinuationQuality'])
            print(f"\n{'='*80}")
            print("BEST PERFORMERS")
            print(f"{'='*80}")
            print(f"🏆 Best Efficiency: {bestEfficiency['promptName']} ({bestEfficiency['efficiency']:.1f} chars/iteration)")
            print(f"🎯 Best Continuation Quality: {bestQuality['promptName']} ({bestQuality['avgContinuationQuality']:.1f}/10)")


# Test prompt scenarios for GENERIC continuation behavior
# These test different approaches to handle ANY user prompt and ANY data type
PROMPT_SCENARIOS = {
    "Prime Numbers Test": """Generate the first 5000 prime numbers in a table with 10 columns per row.""",
    "Text Content": """Generate a comprehensive guide about how to bring a new product to market in 10 sections, each containing detailed explanations and examples."""
}


async def main():
    """Run AI behavior testing."""
    tester = AIBehaviorTester()
    print("Starting AI Behavior Testing...")
    print("Initializing AI service...")
    await tester.initialize()
    print(f"Testing {len(PROMPT_SCENARIOS)} different prompt scenarios")

    for promptName, prompt in PROMPT_SCENARIOS.items():
        try:
            await tester.testPromptBehavior(promptName, prompt, maxIterations=2)
        except Exception as e:
            # One failing scenario must not stop the remaining ones.
            print(f"❌ Failed to test {promptName}: {str(e)}")

    tester.printBehaviorResults()


if __name__ == "__main__":
    asyncio.run(main())