# gateway/test_web_integration.py
# Snapshot metadata: 2025-10-03 01:41:50 +02:00 — 229 lines, 9.5 KiB, Python
#!/usr/bin/env python3
"""
Test the integrated web research functionality.
This tests the complete workflow from methodAi.py through to the services.
"""
import asyncio
import logging
import sys
from pathlib import Path

# Make the gateway package importable no matter where the script is launched from.
gateway_dir = Path(__file__).parent
sys.path.insert(0, str(gateway_dir))

# Project modules under test (resolvable only after the sys.path tweak above).
from modules.services.serviceAi.mainServiceAi import AiService
from modules.datamodels.datamodelWeb import WebResearchRequest, WebResearchOptions

# Verbose root logging, routed to stdout so it interleaves with the prints below.
logging.basicConfig(
    level=logging.DEBUG,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    handlers=[logging.StreamHandler(sys.stdout)],
)
logger = logging.getLogger(__name__)

# Force DEBUG on the service and connector loggers we want to observe.
main_service_logger = logging.getLogger('modules.services.serviceAi.mainServiceAi')
tavily_logger = logging.getLogger('modules.connectors.connectorAiTavily')
interface_logger = logging.getLogger('modules.interfaces.interfaceAiObjects')
for _dbg_logger in (main_service_logger, tavily_logger, interface_logger):
    _dbg_logger.setLevel(logging.DEBUG)
def _safe_filename(url: str) -> str:
    """Reduce *url* to a filesystem-safe fragment: scheme stripped, path
    separators and colons mapped to '_', non [alnum ._-] chars dropped,
    truncated to 100 chars."""
    name = url.replace('https://', '').replace('http://', '').replace('/', '_').replace(':', '_')
    name = ''.join(c for c in name if c.isalnum() or c in '._-')
    return name[:100]  # Limit length


def _write_content_file(filename, doc, url, header):
    """Write one per-URL content file: *header* metadata lines, an extraction
    timestamp, a separator rule, then the extracted content for *url* — or a
    placeholder when the document carries no per-URL content."""
    from datetime import datetime
    with open(filename, 'w', encoding='utf-8') as f:
        f.write(header)
        f.write(f"Extracted: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")
        f.write("="*80 + "\n\n")
        # Write actual content if available
        if hasattr(doc, 'individual_content') and doc.individual_content and url in doc.individual_content:
            f.write(doc.individual_content[url])
        else:
            f.write("Content not available in individual_content field.\n")
            f.write("The combined analysis is available in the main result file.\n")


def _save_main_report(output_file, request, doc):
    """Write the combined markdown report: query, stats, analysis text,
    source list, and additional links."""
    with open(output_file, 'w', encoding='utf-8') as f:
        f.write("# Web Research Integration Test Result\n\n")
        f.write(f"**Query:** {request.search_query}\n\n")
        f.write(f"**Websites Analyzed:** {doc.websites_analyzed}\n")
        f.write(f"**Additional Links Found:** {doc.additional_links_found}\n\n")
        f.write("## Analysis Result\n\n")
        f.write(doc.analysis_result)
        f.write("\n\n## Sources\n\n")
        for source in doc.sources:
            # Sources may be rich objects (title/url) or plain strings.
            try:
                f.write(f"- [{source.title}]({source.url})\n")
            except Exception:
                f.write(f"- {source}\n")
        f.write("\n## Additional Links\n\n")
        for link in doc.additional_links:
            f.write(f"- {link}\n")


async def test_web_research_integration():
    """Run the integrated web research workflow end to end.

    Initializes the AI service (30 s cap), issues one web-research request,
    and — on success — saves the combined markdown report plus one content
    file per analyzed URL. Any stage failure is printed and aborts the test.
    """
    try:
        print("=" * 60)
        print("WEB RESEARCH INTEGRATION TEST")
        print("=" * 60)

        # Config sanity check (non-verbose)
        try:
            from modules.shared.configuration import APP_CONFIG
            env_type = APP_CONFIG.get('APP_ENV_TYPE')
            print(f"Environment: {env_type}")
        except Exception as e:
            print(f"Configuration loading failed: {e}")
            return

        # Initialize the AI service with a hard timeout so a misconfigured
        # backend cannot hang the whole run.
        print("Initializing AI service...")
        try:
            ai_service = await asyncio.wait_for(AiService.create(), timeout=30.0)
            print("AI service initialized")
        except asyncio.TimeoutError:
            print("AI service initialization timed out after 30 seconds")
            return
        except Exception as e:
            print(f"AI service initialization failed: {e}")
            import traceback
            traceback.print_exc()
            return

        # Test 1: Basic web research
        print("\n" + "="*60)
        print("TEST 1: Basic Web Research")
        print("="*60)
        request = WebResearchRequest(
            search_query="Kannst Du mir eine Liste machen, welche Grundstücke aktuell im Kanton Zürich verkauft werden?",
            # search_query="Erstelle mir ein Firmenprofil von ValueOn AG in der Schweiz",
            max_results=10,
            options=WebResearchOptions(
                max_pages=10,
                search_depth="basic",
                extract_depth="advanced",
                format="markdown",
                return_report=True,
                pages_search_depth=2  # Test with depth 2: main pages + sub-pages
            )
        )

        print("Starting web research call...")
        try:
            result = await ai_service.webResearch(request)
        except Exception as e:
            print(f"Web research failed: {e}")
            import traceback
            traceback.print_exc()
            return

        if result.success:
            print("Web research completed successfully")
            doc = result.documents[0].documentData
            print(f"Websites analyzed: {doc.websites_analyzed}")
            print(f"Additional links found: {doc.additional_links_found}")

            # Show main URLs and sub-URLs for visibility
            print("Main URLs (sources):")
            for i, src in enumerate(doc.sources, 1):
                try:
                    print(f" {i}. {src.title} - {src.url}")
                except Exception:
                    print(f" {i}. {src}")
            print("Sub-URLs (additional links):")
            for i, link in enumerate(doc.additional_links, 1):
                print(f" {i}. {link}")
            print(f"Analysis result length: {len(doc.analysis_result)} characters")

            # Save result to file
            output_file = gateway_dir / "test_web_integration_result.md"
            _save_main_report(output_file, request, doc)
            print(f"Result saved to: {output_file}")

            # Save individual website content files into a timestamped directory.
            print(f"\nSaving individual website content files...")
            from datetime import datetime
            import os
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            test_dir = f"test_web_content_{timestamp}"
            os.makedirs(test_dir, exist_ok=True)
            website_files = []

            # Save main URLs
            for i, src in enumerate(doc.sources, 1):
                try:
                    url = src.url
                    title = src.title
                except Exception:
                    url = str(src)
                    title = "Unknown"
                filename = f"{test_dir}/main_url_{i:03d}_{_safe_filename(url)}.txt"
                _write_content_file(
                    filename, doc, url,
                    f"Title: {title}\nURL: {url}\nType: Main URL\n",
                )
                website_files.append(filename)
                print(f" Created: {filename}")

            # Save additional links
            for i, link in enumerate(doc.additional_links, 1):
                filename = f"{test_dir}/additional_link_{i:03d}_{_safe_filename(link)}.txt"
                _write_content_file(
                    filename, doc, link,
                    f"URL: {link}\nType: Additional Link\n",
                )
                website_files.append(filename)
                print(f" Created: {filename}")

            print(f"\nAll files saved to directory: {test_dir}")
            print(f"Main result file: {output_file}")
            print(f"Individual website files: {len(website_files)} files")
        else:
            print(f"Web research failed: {result.error}")

        print("\nIntegration test completed!")
    except Exception as e:
        print(f"Integration test failed: {str(e)}")
        import traceback
        traceback.print_exc()
if __name__ == "__main__":
# Add timeout to the entire test
try:
asyncio.run(asyncio.wait_for(test_web_research_integration(), timeout=600.0))
except asyncio.TimeoutError:
print("Test timed out after 600 seconds - likely hanging during processing")
except Exception as e:
print(f"Test failed with error: {e}")
import traceback
traceback.print_exc()