serviceCenter = DI container (resolver, registry, context) for service instantiation; serviceHub = consumer-facing aggregation (DB interfaces, runtime state, lazy service resolution via serviceCenter)
- modules/serviceHub/ created: ServiceHub, PublicService, getInterface()
- 22 consumer files migrated (routes, features, tests): imports switched from modules.services to serviceHub or serviceCenter
- resolver.py: legacy fallback to the old services/ removed
- modules/services/ deleted entirely (83 files incl. dead code mainAiChat.py)
- pre-extraction: progress callback propagated through the chunk pipeline, operationType DATA_EXTRACT->DATA_ANALYSE for a cheaper model
#!/usr/bin/env python3
# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
"""
AI Behavior Test - Tests actual AI responses with different prompt structures
"""

import asyncio
import json
import sys
import os
from typing import Dict, Any, List, Optional

# Add the gateway to path (go up 2 levels from tests/functional/)
_gateway_path = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", ".."))
if _gateway_path not in sys.path:
    sys.path.insert(0, _gateway_path)

# Import the service initialization
from modules.serviceHub import getInterface as getServices
from modules.datamodels.datamodelAi import AiCallOptions, OperationTypeEnum
from modules.datamodels.datamodelUam import User
from modules.datamodels.datamodelWorkflow import AiResponse

# The test uses the AI service which handles JSON template internally


class AIBehaviorTester:
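    """Drives test prompts through the AI service and scores the continuation behavior of the responses."""
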
    def __init__(self):
        # Use root user for testing (has full access to everything)
        from modules.interfaces.interfaceDbApp import getRootInterface
        from modules.datamodels.datamodelUam import Mandate

        rootInterface = getRootInterface()
        self.testUser = rootInterface.currentUser
        # Get initial mandate ID for testing (User has no mandateId - use initial mandate)
        self.testMandateId = rootInterface.getInitialId(Mandate)

        # Initialize services using the existing system
        self.services = getServices(self.testUser, None)  # Test user, no workflow
        self.testResults = []

    async def initialize(self):
        """Initialize the AI service."""
        # Set logging level to DEBUG to see debug messages
        import logging
        logging.getLogger().setLevel(logging.DEBUG)

        # Create and save workflow in database using the interface
        from modules.datamodels.datamodelChat import ChatWorkflow, WorkflowModeEnum
        import uuid
        import time
        import modules.interfaces.interfaceDbChat as interfaceFeatureAiChat

        currentTimestamp = time.time()

        testWorkflow = ChatWorkflow(
            id=str(uuid.uuid4()),
            name="Test Workflow",
            status="running",
            startedAt=currentTimestamp,
            lastActivity=currentTimestamp,
            currentRound=1,
            currentTask=0,
            currentAction=0,
            totalTasks=0,
            totalActions=0,
            mandateId=self.testMandateId,
            messageIds=[],
            workflowMode=WorkflowModeEnum.WORKFLOW_DYNAMIC,
            maxSteps=5
        )

        # SAVE workflow to database so it exists for access control
        interfaceDbChat = interfaceFeatureAiChat.getInterface(self.testUser)
        workflowDict = testWorkflow.model_dump()
        interfaceDbChat.createWorkflow(workflowDict)

        # Set the workflow in services (Services class uses .workflow, not .currentWorkflow)
        self.services.workflow = testWorkflow

    async def testPromptBehavior(self, promptName: str, prompt: str, maxIterations: int = 2) -> Dict[str, Any]:
        """Test actual AI behavior with a specific prompt structure."""
        print(f"\n{'='*60}")
        print(f"TESTING AI BEHAVIOR: {promptName}")
        print(f"{'='*60}")

        print(f"User prompt: {prompt}")
        print(f"Prompt length: {len(prompt)} characters")

        accumulatedContent = []

        # Use the AI service directly with the user prompt - it will build the generation prompt internally
        try:
            # Use callAiContent (replaces deprecated callAiDocuments)
            options = AiCallOptions(
                operationType=OperationTypeEnum.DATA_GENERATE
            )
            aiResponse: AiResponse = await self.services.ai.callAiContent(
                prompt=prompt,  # Use the raw user prompt directly
                options=options,
                outputFormat="json",
                title="Prime Numbers Test"
            )

            # Extract content from AiResponse
            if isinstance(aiResponse, AiResponse):
                result = aiResponse.content if aiResponse.content else json.dumps({})
            elif isinstance(aiResponse, dict):
                result = json.dumps(aiResponse, indent=2)
            else:
                result = str(aiResponse)

            print(f"Response length: {len(result)} characters")
            print(f"Response preview: {result[:200]}...")

            # If we got an error response, try to extract the actual AI content from debug files
            if isinstance(aiResponse, AiResponse) and aiResponse.metadata and hasattr(aiResponse.metadata, 'error'):
                # The AI service wrapped the response in an error format
                # We need to get the actual AI content from the debug files
                print("⚠️ AI returned error response, but may have generated content")

                # Try to read the actual AI response from debug files
                debug_content = self._getLatestDebugResponse()
                if debug_content:
                    result = debug_content
                    print(f"📄 Found debug content: {len(result)} characters")
                    print(f"📄 Debug preview: {result[:200]}...")

            # Parse and analyze response
            parsed_result = self._parseJsonResponse(result)
            if parsed_result:
                # Check if continuation
                if parsed_result.get("continuation") is not None:
                    continuation_text = parsed_result.get("continuation", "")
                    print(f"✅ Continuation detected: {continuation_text[:100]}...")
                    accumulatedContent.append(result)

                    # Analyze continuation quality
                    continuation_quality = self._analyzeContinuationQuality(continuation_text)
                    print(f"   Continuation quality: {continuation_quality['score']}/10")
                    print(f"   Issues: {', '.join(continuation_quality['issues'])}")
                else:
                    print("✅ Final response received")
                    accumulatedContent.append(result)
            else:
                print("❌ Invalid JSON response")
                accumulatedContent.append(result)

        except Exception as e:
            import traceback
            print(f"❌ Error in AI call: {type(e).__name__}: {str(e)}")
            print(f"   Traceback: {traceback.format_exc()}")
            accumulatedContent.append("")

        # Analyze results
        result = self._analyzeBehaviorResults(promptName, accumulatedContent)
        self.testResults.append(result)
        return result

    def _extractContinuationInstruction(self, response: str) -> str:
        """Extract continuation instruction from response."""
        try:
            parsed = json.loads(response)
            return parsed.get("continuation", "")
        except Exception:
            return ""

    def _getLatestDebugResponse(self) -> str:
        """Get the latest AI response from debug files."""
        try:
            import glob

            # Look for the most recent debug response file (go up 2 levels from tests/functional/ to gateway/, then up 1 to poweron/)
            gateway_path = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", ".."))
            gateway_dir = os.path.dirname(gateway_path)
            debug_pattern = os.path.join(gateway_dir, "local", "logs", "debug", "prompts", "*document_generation_response*.txt")
            debug_files = glob.glob(debug_pattern)

            if debug_files:
                # Sort by modification time, get the most recent
                latest_file = max(debug_files, key=os.path.getmtime)
                with open(latest_file, 'r', encoding='utf-8') as f:
                    return f.read()
            return ""
        except Exception as e:
            print(f"Error reading debug file: {e}")
            return ""

    def _parseJsonResponse(self, response: str) -> Optional[Dict[str, Any]]:
        """Parse JSON response."""
        try:
            # First try direct JSON parsing
            return json.loads(response)
        except Exception:
            try:
                # Try extracting JSON from markdown code blocks
                if "```json" in response:
                    start = response.find("```json") + 7
                    end = response.find("```", start)
                    if end > start:
                        json_str = response[start:end].strip()
                        return json.loads(json_str)
                elif "```" in response:
                    start = response.find("```") + 3
                    end = response.find("```", start)
                    if end > start:
                        json_str = response[start:end].strip()
                        return json.loads(json_str)
                return None
            except Exception:
                return None

    def _analyzeContinuationQuality(self, continuation_text: str) -> Dict[str, Any]:
        """Analyze the quality of continuation instructions."""
        score = 10
        issues = []

        try:
            # Parse the continuation object
            if isinstance(continuation_text, str):
                continuation_obj = json.loads(continuation_text)
            else:
                continuation_obj = continuation_text

            # Check for required fields
            if not isinstance(continuation_obj, dict):
                score -= 5
                issues.append("Not a valid object")
                return {"score": max(0, score), "issues": issues}

            # Check for last_data_items
            if "last_data_items" not in continuation_obj:
                score -= 3
                issues.append("Missing last_data_items")
            elif not continuation_obj["last_data_items"]:
                score -= 2
                issues.append("Empty last_data_items")

            # Check for next_instruction
            if "next_instruction" not in continuation_obj:
                score -= 3
                issues.append("Missing next_instruction")
            elif not continuation_obj["next_instruction"]:
                score -= 2
                issues.append("Empty next_instruction")

            # Check for specific data points in last_data_items
            if "last_data_items" in continuation_obj:
                last_items = continuation_obj["last_data_items"]
                if not any(char.isdigit() for char in str(last_items)):
                    score -= 1
                    issues.append("No specific numbers in last_data_items")

            # Check for clear instruction in next_instruction
            if "next_instruction" in continuation_obj:
                instruction = continuation_obj["next_instruction"]
                if "continue" not in instruction.lower():
                    score -= 1
                    issues.append("No 'continue' in next_instruction")

        except (json.JSONDecodeError, TypeError):
            score -= 5
            issues.append("Invalid JSON format")

        return {
            "score": max(0, score),
            "issues": issues
        }

    def _analyzeBehaviorResults(self, promptName: str, accumulatedContent: List[str]) -> Dict[str, Any]:
        """Analyze AI behavior results."""
        totalContentLength = 0
        iterations = len(accumulatedContent)
        continuationInstructions = []
        continuationQualities = []

        for i, content in enumerate(accumulatedContent):
            parsed = self._parseJsonResponse(content)
            if parsed:
                # Count content length in the response
                contentLength = len(content)
                totalContentLength += contentLength

                continuation = parsed.get("continuation")
                if continuation:
                    continuationInstructions.append(continuation)
                    quality = self._analyzeContinuationQuality(continuation)
                    continuationQualities.append(quality)

        # Calculate averages
        avgContinuationQuality = sum(q["score"] for q in continuationQualities) / len(continuationQualities) if continuationQualities else 0

        return {
            "promptName": promptName,
            "iterations": iterations,
            "totalContentLength": totalContentLength,
            "continuationInstructions": continuationInstructions,
            "avgContinuationQuality": avgContinuationQuality,
            "success": totalContentLength > 0,
            "efficiency": totalContentLength / iterations if iterations > 0 else 0
        }

    def _countPrimesInResponse(self, parsed: Dict[str, Any]) -> int:
        """Count prime numbers in the parsed response."""
        count = 0

        if "documents" in parsed:
            for doc in parsed["documents"]:
                if "sections" in doc:
                    for section in doc["sections"]:
                        if section.get("content_type") == "table" and "elements" in section:
                            for element in section["elements"]:
                                if "rows" in element:
                                    for row in element["rows"]:
                                        for cell in row:
                                            if isinstance(cell, (str, int)) and str(cell).isdigit():
                                                count += 1

        return count

    def printBehaviorResults(self):
        """Print AI behavior test results."""
        print(f"\n{'='*80}")
        print("AI BEHAVIOR TEST RESULTS")
        print(f"{'='*80}")

        for result in self.testResults:
            print(f"\n{result['promptName']}:")
            print(f"  Iterations: {result['iterations']}")
            print(f"  Total Content Length: {result['totalContentLength']}")
            print(f"  Efficiency: {result['efficiency']:.1f} chars/iteration")
            print(f"  Avg Continuation Quality: {result['avgContinuationQuality']:.1f}/10")
            print(f"  Success: {'✅' if result['success'] else '❌'}")

            if result['continuationInstructions']:
                print("  Continuation Instructions:")
                for i, instruction in enumerate(result['continuationInstructions']):
                    print(f"    {i+1}: {instruction[:80]}...")

        # Find best performing prompt
        if self.testResults:
            bestEfficiency = max(self.testResults, key=lambda x: x['efficiency'])
            bestQuality = max(self.testResults, key=lambda x: x['avgContinuationQuality'])

            print(f"\n{'='*80}")
            print("BEST PERFORMERS")
            print(f"{'='*80}")
            print(f"🏆 Best Efficiency: {bestEfficiency['promptName']} ({bestEfficiency['efficiency']:.1f} chars/iteration)")
            print(f"🎯 Best Continuation Quality: {bestQuality['promptName']} ({bestQuality['avgContinuationQuality']:.1f}/10)")

# Test prompt scenarios for GENERIC continuation behavior
# These test different approaches to handle ANY user prompt and ANY data type
PROMPT_SCENARIOS = {
    "Prime Numbers Test": """Generate the first 5000 prime numbers in a table with 10 columns per row.""",

    "Text Content": """Generate a comprehensive guide about how to bring a new product to market in 10 sections, each containing detailed explanations and examples."""
}


async def main():
    """Run AI behavior testing."""
    tester = AIBehaviorTester()

    print("Starting AI Behavior Testing...")
    print("Initializing AI service...")
    await tester.initialize()

    print(f"Testing {len(PROMPT_SCENARIOS)} different prompt scenarios")

    for promptName, prompt in PROMPT_SCENARIOS.items():
        try:
            await tester.testPromptBehavior(promptName, prompt, maxIterations=2)
        except Exception as e:
            print(f"❌ Failed to test {promptName}: {str(e)}")

    tester.printBehaviorResults()


if __name__ == "__main__":
    asyncio.run(main())