#!/usr/bin/env python3
"""
AI Behavior Test - Tests actual AI responses with different prompt structures.
"""
import asyncio
import json
import sys
import os
from typing import Any, Dict, List, Optional

# Add the gateway directory to the import path
sys.path.append(os.path.dirname(__file__))

# Import the service initialization
from modules.features.chatPlayground.mainChatPlayground import getServices
from modules.datamodels.datamodelAi import AiCallOptions, OperationType
from modules.datamodels.datamodelUam import User

# The test drives the AI service, which builds the JSON template and handles
# continuation looping internally.

class AIBehaviorTester:
    def __init__(self):
        # Create a minimal user context for testing
        testUser = User(
            id="test_user",
            username="test_user",
            email="test@example.com",
            fullName="Test User",
            language="en",
            mandateId="test_mandate"
        )
        # Initialize services using the existing system
        self.services = getServices(testUser, None)  # Test user, no workflow
        self.testResults = []

    async def initialize(self):
        """Initialize the AI service."""
        # Set logging level to DEBUG to see debug messages
        import logging
        logging.getLogger().setLevel(logging.DEBUG)

        # The AI service needs to be recreated with proper initialization
        from modules.services.serviceAi.mainServiceAi import AiService
        self.services.ai = await AiService.create(self.services)

        # Create a minimal workflow context
        from modules.datamodels.datamodelChat import ChatWorkflow
        import uuid
        self.services.currentWorkflow = ChatWorkflow(
            id=str(uuid.uuid4()),
            name="Test Workflow",
            status="running",
            startedAt=self.services.utils.timestampGetUtc(),
            lastActivity=self.services.utils.timestampGetUtc(),
            currentRound=1,
            currentTask=0,
            currentAction=0,
            totalTasks=0,
            totalActions=0,
            mandateId="test_mandate",
            messageIds=[],
            workflowMode="React",
            maxSteps=5
        )

    async def testPromptBehavior(self, promptName: str, prompt: str, maxIterations: int = 2) -> Dict[str, Any]:
        """Test actual AI behavior with a specific prompt structure.

        Note: maxIterations is currently unused; the AI service performs its
        continuation looping internally.
        """
        print(f"\n{'='*60}")
        print(f"TESTING AI BEHAVIOR: {promptName}")
        print(f"{'='*60}")
        print(f"User prompt: {prompt}")
        print(f"Prompt length: {len(prompt)} characters")

        accumulatedContent = []
        try:
            # Use the existing AI service with JSON format - it builds the
            # generation prompt and handles continuation looping internally.
            response = await self.services.ai.coreAi.callAiDocuments(
                prompt=prompt,  # Use the raw user prompt directly
                documents=None,
                outputFormat="json",
                title="Prime Numbers Test",
                loopInstructionFormat="json"  # Use the JSON loop instructions
            )
            if isinstance(response, dict):
                result = json.dumps(response, indent=2)
            else:
                result = str(response)
            print(f"Response length: {len(result)} characters")
            print(f"Response preview: {result[:200]}...")

            # If the service wrapped the response in an error envelope, the
            # model may still have generated content; recover it from the
            # debug files the service writes.
            if isinstance(response, dict) and not response.get("success", True):
                print("⚠️ AI returned error response, but may have generated content")
                debug_content = self._getLatestDebugResponse()
                if debug_content:
                    result = debug_content
                    print(f"📄 Found debug content: {len(result)} characters")
                    print(f"📄 Debug preview: {result[:200]}...")

            # Parse and analyze the response
            parsed_result = self._parseJsonResponse(result)
            if parsed_result:
                # Check whether the response requests a continuation
                if parsed_result.get("continuation") is not None:
                    continuation_text = parsed_result.get("continuation", "")
                    print(f"✅ Continuation detected: {str(continuation_text)[:100]}...")
                    accumulatedContent.append(result)
                    # Analyze continuation quality
                    continuation_quality = self._analyzeContinuationQuality(continuation_text)
                    print(f"   Continuation quality: {continuation_quality['score']}/10")
                    print(f"   Issues: {', '.join(continuation_quality['issues'])}")
                else:
                    print("✅ Final response received")
                    accumulatedContent.append(result)
            else:
                print("❌ Invalid JSON response")
                accumulatedContent.append(result)
        except Exception as e:
            print(f"❌ Error in AI call: {str(e)}")
            accumulatedContent.append("")

        # Analyze results
        analysis = self._analyzeBehaviorResults(promptName, accumulatedContent)
        self.testResults.append(analysis)
        return analysis
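
    # Error envelope assumed in the fallback above (hypothetical sketch): the
    # service is expected to return something like {"success": false, ...};
    # only the "success" key is actually inspected, and the generated content
    # itself is recovered from the debug files.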

    def _extractContinuationInstruction(self, response: str) -> str:
        """Extract the continuation instruction from a JSON response."""
        try:
            parsed = json.loads(response)
            return parsed.get("continuation", "")
        except (json.JSONDecodeError, TypeError):
            return ""

    def _getLatestDebugResponse(self) -> str:
        """Get the latest AI response from the debug files."""
        try:
            import glob
            # Look for the most recent debug response file
            debug_pattern = "local/logs/debug/prompts/*document_generation_response*.txt"
            debug_files = glob.glob(debug_pattern)
            if debug_files:
                # Sort by modification time, pick the most recent
                latest_file = max(debug_files, key=os.path.getmtime)
                with open(latest_file, 'r', encoding='utf-8') as f:
                    return f.read()
            return ""
        except Exception as e:
            print(f"Error reading debug file: {e}")
            return ""

    def _parseJsonResponse(self, response: str) -> Optional[Dict[str, Any]]:
        """Parse a JSON response, tolerating markdown code fences."""
        try:
            # First try direct JSON parsing
            return json.loads(response)
        except (json.JSONDecodeError, TypeError):
            try:
                # Try extracting JSON from markdown code blocks
                if "```json" in response:
                    start = response.find("```json") + 7
                    end = response.find("```", start)
                    if end > start:
                        json_str = response[start:end].strip()
                        return json.loads(json_str)
                elif "```" in response:
                    start = response.find("```") + 3
                    end = response.find("```", start)
                    if end > start:
                        json_str = response[start:end].strip()
                        return json.loads(json_str)
                return None
            except (json.JSONDecodeError, TypeError):
                return None
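
    # Illustrative inputs _parseJsonResponse accepts (values are made up for
    # demonstration, not taken from real AI output):
    #   '{"continuation": null}'                  -> parsed dict
    #   '```json\n{"continuation": {...}}\n```'   -> fence stripped, then parsed
    #   'prose with no JSON at all'               -> None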

    def _analyzeContinuationQuality(self, continuation_text: Any) -> Dict[str, Any]:
        """Analyze the quality of continuation instructions."""
        score = 10
        issues = []
        try:
            # Parse the continuation object if it arrived as a JSON string
            if isinstance(continuation_text, str):
                continuation_obj = json.loads(continuation_text)
            else:
                continuation_obj = continuation_text

            # The continuation must be an object
            if not isinstance(continuation_obj, dict):
                score -= 5
                issues.append("Not a valid object")
                return {"score": max(0, score), "issues": issues}

            # Check for last_data_items
            if "last_data_items" not in continuation_obj:
                score -= 3
                issues.append("Missing last_data_items")
            elif not continuation_obj["last_data_items"]:
                score -= 2
                issues.append("Empty last_data_items")

            # Check for next_instruction
            if "next_instruction" not in continuation_obj:
                score -= 3
                issues.append("Missing next_instruction")
            elif not continuation_obj["next_instruction"]:
                score -= 2
                issues.append("Empty next_instruction")

            # Check for specific data points in last_data_items
            if "last_data_items" in continuation_obj:
                last_items = continuation_obj["last_data_items"]
                if not any(char.isdigit() for char in str(last_items)):
                    score -= 1
                    issues.append("No specific numbers in last_data_items")

            # Check for a clear directive in next_instruction
            if "next_instruction" in continuation_obj:
                instruction = continuation_obj["next_instruction"]
                if "continue" not in str(instruction).lower():
                    score -= 1
                    issues.append("No 'continue' in next_instruction")
        except (json.JSONDecodeError, TypeError):
            score -= 5
            issues.append("Invalid JSON format")
        return {
            "score": max(0, score),
            "issues": issues
        }
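
    # A continuation payload that would pass every check above looks roughly
    # like this (field values are hypothetical, for illustration only):
    # {
    #     "last_data_items": "..., 97, 101, 103",
    #     "next_instruction": "Continue the table from the next prime after 103."
    # }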

    def _analyzeBehaviorResults(self, promptName: str, accumulatedContent: List[str]) -> Dict[str, Any]:
        """Analyze AI behavior results."""
        totalContentLength = 0
        iterations = len(accumulatedContent)
        continuationInstructions = []
        continuationQualities = []
        for content in accumulatedContent:
            parsed = self._parseJsonResponse(content)
            if parsed:
                # Count the content length of the response
                totalContentLength += len(content)
                continuation = parsed.get("continuation")
                if continuation:
                    continuationInstructions.append(continuation)
                    quality = self._analyzeContinuationQuality(continuation)
                    continuationQualities.append(quality)

        # Average continuation quality across iterations
        avgContinuationQuality = (
            sum(q["score"] for q in continuationQualities) / len(continuationQualities)
            if continuationQualities else 0
        )
        return {
            "promptName": promptName,
            "iterations": iterations,
            "totalContentLength": totalContentLength,
            "continuationInstructions": continuationInstructions,
            "avgContinuationQuality": avgContinuationQuality,
            "success": totalContentLength > 0,
            "efficiency": totalContentLength / iterations if iterations > 0 else 0
        }

    def _countPrimesInResponse(self, parsed: Dict[str, Any]) -> int:
        """Count numeric table cells in the parsed response (a proxy for the
        number of primes generated; cells are not re-checked for primality)."""
        count = 0
        if "documents" in parsed:
            for doc in parsed["documents"]:
                for section in doc.get("sections", []):
                    if section.get("content_type") == "table" and "elements" in section:
                        for element in section["elements"]:
                            for row in element.get("rows", []):
                                for cell in row:
                                    if isinstance(cell, (str, int)) and str(cell).isdigit():
                                        count += 1
        return count
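
    # Assumed shape of the parsed response this walker traverses (hypothetical
    # sketch; the actual schema is defined by the AI service's JSON document
    # template):
    # {
    #     "documents": [
    #         {"sections": [
    #             {"content_type": "table",
    #              "elements": [{"rows": [[2, 3, 5], [7, 11, 13]]}]}
    #         ]}
    #     ]
    # }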

    def printBehaviorResults(self):
        """Print AI behavior test results."""
        print(f"\n{'='*80}")
        print("AI BEHAVIOR TEST RESULTS")
        print(f"{'='*80}")
        for result in self.testResults:
            print(f"\n{result['promptName']}:")
            print(f"  Iterations: {result['iterations']}")
            print(f"  Total Content Length: {result['totalContentLength']}")
            print(f"  Efficiency: {result['efficiency']:.1f} chars/iteration")
            print(f"  Avg Continuation Quality: {result['avgContinuationQuality']:.1f}/10")
            print(f"  Success: {'✅' if result['success'] else '❌'}")
            if result['continuationInstructions']:
                print("  Continuation Instructions:")
                for i, instruction in enumerate(result['continuationInstructions']):
                    print(f"    {i+1}: {str(instruction)[:80]}...")

        # Find the best-performing prompts
        if self.testResults:
            bestEfficiency = max(self.testResults, key=lambda x: x['efficiency'])
            bestQuality = max(self.testResults, key=lambda x: x['avgContinuationQuality'])
            print(f"\n{'='*80}")
            print("BEST PERFORMERS")
            print(f"{'='*80}")
            print(f"🏆 Best Efficiency: {bestEfficiency['promptName']} ({bestEfficiency['efficiency']:.1f} chars/iteration)")
            print(f"🎯 Best Continuation Quality: {bestQuality['promptName']} ({bestQuality['avgContinuationQuality']:.1f}/10)")


# Test prompt scenarios for GENERIC continuation behavior.
# These exercise different approaches to handling ANY user prompt and ANY data type.
PROMPT_SCENARIOS = {
    "Prime Numbers Test": """Generate the first 5000 prime numbers in a table with 10 columns per row.""",
    "Text Content": """Generate a comprehensive guide about how to bring a new product to market in 10 sections, each containing detailed explanations and examples."""
}
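
# To probe another data type, a scenario can be added here, e.g. (hypothetical,
# not part of the original test matrix):
# PROMPT_SCENARIOS["Code Content"] = (
#     "Generate 20 small Python utility functions, each with a docstring and a usage example."
# )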


async def main():
    """Run AI behavior testing."""
    tester = AIBehaviorTester()
    print("Starting AI Behavior Testing...")
    print("Initializing AI service...")
    await tester.initialize()
    print(f"Testing {len(PROMPT_SCENARIOS)} different prompt scenarios")
    for promptName, prompt in PROMPT_SCENARIOS.items():
        try:
            await tester.testPromptBehavior(promptName, prompt, maxIterations=2)
        except Exception as e:
            print(f"❌ Failed to test {promptName}: {str(e)}")
    tester.printBehaviorResults()


if __name__ == "__main__":
    asyncio.run(main())
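
# Usage (run from the gateway directory so the module imports resolve):
#   python3 test_ai_behavior.py
# Assumes a configured AI backend is reachable from this environment; when a
# call fails, the tester falls back to debug responses written under
# local/logs/debug/prompts/.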