#!/usr/bin/env python3
# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
"""
AI Behavior Test - Tests actual AI responses with different prompt structures
"""
import asyncio
import json
import sys
import os
from typing import Dict, Any, List, Optional

# Add the gateway to path (go up 2 levels from tests/functional/)
_gateway_path = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", ".."))
if _gateway_path not in sys.path:
    sys.path.insert(0, _gateway_path)

# Import the service initialization
from modules.services import getInterface as getServices
from modules.datamodels.datamodelAi import AiCallOptions, OperationTypeEnum
from modules.datamodels.datamodelUam import User
from modules.datamodels.datamodelWorkflow import AiResponse

# The test uses the AI service, which handles the JSON template internally


class AIBehaviorTester:
    def __init__(self):
        # Use the root user for testing (has full access to everything)
        from modules.interfaces.interfaceDbAppObjects import getRootInterface
        rootInterface = getRootInterface()
        self.testUser = rootInterface.currentUser

        # Initialize services using the existing system (test user, no workflow yet)
        self.services = getServices(self.testUser, None)
        self.testResults = []

    async def initialize(self):
        """Initialize the AI service and register a test workflow."""
        # Set logging level to DEBUG to see debug messages
        import logging
        logging.getLogger().setLevel(logging.DEBUG)

        # Create and save a workflow in the database using the interface
        from modules.datamodels.datamodelChat import ChatWorkflow, WorkflowModeEnum
        import uuid
        import time
        import modules.interfaces.interfaceDbChatObjects as interfaceDbChatObjects

        currentTimestamp = time.time()
        testWorkflow = ChatWorkflow(
            id=str(uuid.uuid4()),
            name="Test Workflow",
            status="running",
            startedAt=currentTimestamp,
            lastActivity=currentTimestamp,
            currentRound=1,
            currentTask=0,
            currentAction=0,
            totalTasks=0,
            totalActions=0,
            mandateId=self.testUser.mandateId,
            messageIds=[],
            workflowMode=WorkflowModeEnum.WORKFLOW_DYNAMIC,
            maxSteps=5
        )

        # SAVE the workflow to the database so it exists for access control
        interfaceDbChat = interfaceDbChatObjects.getInterface(self.testUser)
        workflowDict = testWorkflow.model_dump()
        interfaceDbChat.createWorkflow(workflowDict)

        # Set the workflow in services (the Services class uses .workflow, not .currentWorkflow)
        self.services.workflow = testWorkflow

    async def testPromptBehavior(self, promptName: str, prompt: str, maxIterations: int = 2) -> Dict[str, Any]:
        """Test actual AI behavior with a specific prompt structure.

        Note: maxIterations is accepted for future multi-iteration runs; the current
        flow makes a single AI call per prompt.
        """
        print(f"\n{'='*60}")
        print(f"TESTING AI BEHAVIOR: {promptName}")
        print(f"{'='*60}")
        print(f"User prompt: {prompt}")
        print(f"Prompt length: {len(prompt)} characters")

        accumulatedContent = []

        # Use the AI service directly with the user prompt - it builds the generation prompt internally
        try:
            # Use callAiContent (replaces the deprecated callAiDocuments)
            options = AiCallOptions(
                operationType=OperationTypeEnum.DATA_GENERATE
            )

            aiResponse: AiResponse = await self.services.ai.callAiContent(
                prompt=prompt,  # Use the raw user prompt directly
                options=options,
                outputFormat="json",
                title=promptName  # Use the scenario name rather than a hardcoded title
            )

            # Extract content from the AiResponse
            if isinstance(aiResponse, AiResponse):
                result = aiResponse.content if aiResponse.content else json.dumps({})
            elif isinstance(aiResponse, dict):
                result = json.dumps(aiResponse, indent=2)
            else:
                result = str(aiResponse)

            print(f"Response length: {len(result)} characters")
            print(f"Response preview: {result[:200]}...")

            # If we got an error response, try to extract the actual AI content from debug files
            if isinstance(aiResponse, AiResponse) and aiResponse.metadata and hasattr(aiResponse.metadata, 'error'):
                # The AI service wrapped the response in an error format.
                # We need to get the actual AI content from the debug files.
                print("⚠️ AI returned error response, but may have generated content")

                # Try to read the actual AI response from debug files
                debug_content = self._getLatestDebugResponse()
                if debug_content:
                    result = debug_content
                    print(f"📄 Found debug content: {len(result)} characters")
                    print(f"📄 Debug preview: {result[:200]}...")

            # Parse and analyze the response
            parsed_result = self._parseJsonResponse(result)

            if parsed_result:
                # Check whether the response asks for a continuation
                if parsed_result.get("continuation") is not None:
                    continuation_text = parsed_result.get("continuation", "")
                    print(f"✅ Continuation detected: {str(continuation_text)[:100]}...")
                    accumulatedContent.append(result)

                    # Analyze continuation quality
                    continuation_quality = self._analyzeContinuationQuality(continuation_text)
                    print(f"   Continuation quality: {continuation_quality['score']}/10")
                    print(f"   Issues: {', '.join(continuation_quality['issues'])}")
                else:
                    print("✅ Final response received")
                    accumulatedContent.append(result)
            else:
                print("❌ Invalid JSON response")
                accumulatedContent.append(result)

        except Exception as e:
            import traceback
            print(f"❌ Error in AI call: {type(e).__name__}: {str(e)}")
            print(f"   Traceback: {traceback.format_exc()}")
            accumulatedContent.append("")

        # Analyze the results
        result = self._analyzeBehaviorResults(promptName, accumulatedContent)
        self.testResults.append(result)
        return result

    def _extractContinuationInstruction(self, response: str) -> str:
        """Extract the continuation instruction from a raw JSON response."""
        try:
            parsed = json.loads(response)
            return parsed.get("continuation", "")
        except (json.JSONDecodeError, TypeError):
            return ""

    def _getLatestDebugResponse(self) -> str:
        """Get the latest AI response from debug files."""
        try:
            import glob
            # Look for the most recent debug response file
            # (go up 2 levels from tests/functional/ to gateway/, then up 1 to poweron/)
            gateway_path = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", ".."))
            gateway_dir = os.path.dirname(gateway_path)
            debug_pattern = os.path.join(gateway_dir, "local", "logs", "debug", "prompts",
                                         "*document_generation_response*.txt")
            debug_files = glob.glob(debug_pattern)
            if debug_files:
                # Sort by modification time and take the most recent file
                latest_file = max(debug_files, key=os.path.getmtime)
                with open(latest_file, 'r', encoding='utf-8') as f:
                    return f.read()
            return ""
        except Exception as e:
            print(f"Error reading debug file: {e}")
            return ""

    def _parseJsonResponse(self, response: str) -> Optional[Dict[str, Any]]:
        """Parse a JSON response, tolerating markdown code fences around the payload."""
        try:
            # First try direct JSON parsing
            return json.loads(response)
        except (json.JSONDecodeError, TypeError):
            try:
                # Try extracting JSON from markdown code blocks
                if "```json" in response:
                    start = response.find("```json") + 7
                    end = response.find("```", start)
                    if end > start:
                        json_str = response[start:end].strip()
                        return json.loads(json_str)
                elif "```" in response:
                    start = response.find("```") + 3
                    end = response.find("```", start)
                    if end > start:
                        json_str = response[start:end].strip()
                        return json.loads(json_str)
                return None
            except (json.JSONDecodeError, TypeError):
                return None
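    # NOTE (assumption): the quality checks below expect the "continuation" payload
    # returned by the AI service to be a JSON object roughly shaped like the example
    # in this comment. The field names "last_data_items" and "next_instruction" are
    # what _analyzeContinuationQuality scores against, not a confirmed service schema.
    #
    #   {
    #       "last_data_items": "83, 89, 97",
    #       "next_instruction": "Continue the table with the next prime numbers."
    #   }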
    def _analyzeContinuationQuality(self, continuation_text: str) -> Dict[str, Any]:
        """Analyze the quality of continuation instructions."""
        score = 10
        issues = []

        try:
            # Parse the continuation object (it may already be a dict)
            if isinstance(continuation_text, str):
                continuation_obj = json.loads(continuation_text)
            else:
                continuation_obj = continuation_text

            # Check for required fields
            if not isinstance(continuation_obj, dict):
                score -= 5
                issues.append("Not a valid object")
                return {"score": max(0, score), "issues": issues}

            # Check for last_data_items
            if "last_data_items" not in continuation_obj:
                score -= 3
                issues.append("Missing last_data_items")
            elif not continuation_obj["last_data_items"]:
                score -= 2
                issues.append("Empty last_data_items")

            # Check for next_instruction
            if "next_instruction" not in continuation_obj:
                score -= 3
                issues.append("Missing next_instruction")
            elif not continuation_obj["next_instruction"]:
                score -= 2
                issues.append("Empty next_instruction")

            # Check for specific data points in last_data_items
            if "last_data_items" in continuation_obj:
                last_items = continuation_obj["last_data_items"]
                if not any(char.isdigit() for char in str(last_items)):
                    score -= 1
                    issues.append("No specific numbers in last_data_items")

            # Check for a clear instruction in next_instruction
            if "next_instruction" in continuation_obj:
                instruction = continuation_obj["next_instruction"]
                if "continue" not in str(instruction).lower():
                    score -= 1
                    issues.append("No 'continue' in next_instruction")

        except (json.JSONDecodeError, TypeError):
            score -= 5
            issues.append("Invalid JSON format")

        return {
            "score": max(0, score),
            "issues": issues
        }

    def _analyzeBehaviorResults(self, promptName: str, accumulatedContent: List[str]) -> Dict[str, Any]:
        """Analyze AI behavior results."""
        totalContentLength = 0
        iterations = len(accumulatedContent)
        continuationInstructions = []
        continuationQualities = []

        for content in accumulatedContent:
            parsed = self._parseJsonResponse(content)
            if parsed:
                # Count the content length of the response
                contentLength = len(content)
                totalContentLength += contentLength

                continuation = parsed.get("continuation")
                if continuation:
                    continuationInstructions.append(continuation)
                    quality = self._analyzeContinuationQuality(continuation)
                    continuationQualities.append(quality)

        # Calculate averages
        avgContinuationQuality = (
            sum(q["score"] for q in continuationQualities) / len(continuationQualities)
            if continuationQualities else 0
        )

        return {
            "promptName": promptName,
            "iterations": iterations,
            "totalContentLength": totalContentLength,
            "continuationInstructions": continuationInstructions,
            "avgContinuationQuality": avgContinuationQuality,
            "success": totalContentLength > 0,
            "efficiency": totalContentLength / iterations if iterations > 0 else 0
        }

    def _countPrimesInResponse(self, parsed: Dict[str, Any]) -> int:
        """Count numeric table cells in the parsed response.

        Assumes a documents -> sections -> elements -> rows layout and counts every
        purely numeric cell as a proxy for a generated prime number.
        """
        count = 0
        if "documents" in parsed:
            for doc in parsed["documents"]:
                if "sections" in doc:
                    for section in doc["sections"]:
                        if section.get("content_type") == "table" and "elements" in section:
                            for element in section["elements"]:
                                if "rows" in element:
                                    for row in element["rows"]:
                                        for cell in row:
                                            if isinstance(cell, (str, int)) and str(cell).isdigit():
                                                count += 1
        return count

    def printBehaviorResults(self):
        """Print AI behavior test results."""
        print(f"\n{'='*80}")
        print("AI BEHAVIOR TEST RESULTS")
        print(f"{'='*80}")

        for result in self.testResults:
            print(f"\n{result['promptName']}:")
            print(f"  Iterations: {result['iterations']}")
            print(f"  Total Content Length: {result['totalContentLength']}")
            print(f"  Efficiency: {result['efficiency']:.1f} chars/iteration")
            print(f"  Avg Continuation Quality: {result['avgContinuationQuality']:.1f}/10")
            print(f"  Success: {'✅' if result['success'] else '❌'}")

            if result['continuationInstructions']:
                print("  Continuation Instructions:")
                for i, instruction in enumerate(result['continuationInstructions']):
                    print(f"    {i+1}: {str(instruction)[:80]}...")
        # Find the best performing prompt
        if self.testResults:
            bestEfficiency = max(self.testResults, key=lambda x: x['efficiency'])
            bestQuality = max(self.testResults, key=lambda x: x['avgContinuationQuality'])

            print(f"\n{'='*80}")
            print("BEST PERFORMERS")
            print(f"{'='*80}")
            print(f"🏆 Best Efficiency: {bestEfficiency['promptName']} ({bestEfficiency['efficiency']:.1f} chars/iteration)")
            print(f"🎯 Best Continuation Quality: {bestQuality['promptName']} ({bestQuality['avgContinuationQuality']:.1f}/10)")


# Test prompt scenarios for GENERIC continuation behavior.
# These test different approaches to handle ANY user prompt and ANY data type.
PROMPT_SCENARIOS = {
    "Prime Numbers Test": """Generate the first 5000 prime numbers in a table with 10 columns per row.""",
    "Text Content": """Generate a comprehensive guide about how to bring a new product to market in 10 sections, each containing detailed explanations and examples."""
}


async def main():
    """Run AI behavior testing."""
    tester = AIBehaviorTester()

    print("Starting AI Behavior Testing...")
    print("Initializing AI service...")
    await tester.initialize()

    print(f"Testing {len(PROMPT_SCENARIOS)} different prompt scenarios")

    for promptName, prompt in PROMPT_SCENARIOS.items():
        try:
            await tester.testPromptBehavior(promptName, prompt, maxIterations=2)
        except Exception as e:
            print(f"❌ Failed to test {promptName}: {str(e)}")

    tester.printBehaviorResults()


if __name__ == "__main__":
    asyncio.run(main())