#!/usr/bin/env python3
"""
Test the integrated web research functionality.

This tests the complete workflow from methodAi.py through to the services.
"""

import asyncio
import logging
import os
import sys
import traceback
from datetime import datetime
from pathlib import Path

# Add the gateway directory to the Python path
gateway_dir = Path(__file__).parent
sys.path.insert(0, str(gateway_dir))

# Import the required modules
from modules.services.serviceAi.mainServiceAi import AiService
from modules.datamodels.datamodelWeb import WebResearchRequest, WebResearchOptions

# Configure logging with detailed output on stdout
logging.basicConfig(
    level=logging.DEBUG,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    handlers=[
        logging.StreamHandler(sys.stdout)
    ]
)
logger = logging.getLogger(__name__)

# Raise the main service and connector loggers to DEBUG as well
main_service_logger = logging.getLogger('modules.services.serviceAi.mainServiceAi')
main_service_logger.setLevel(logging.DEBUG)

tavily_logger = logging.getLogger('modules.connectors.connectorAiTavily')
tavily_logger.setLevel(logging.DEBUG)

interface_logger = logging.getLogger('modules.interfaces.interfaceAiObjects')
interface_logger.setLevel(logging.DEBUG)
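

# Helper mirroring the inline sanitization used twice below: build a
# filesystem-safe file name from a URL (scheme stripped, path separators and
# colons replaced, anything outside [alnum._-] dropped, length capped at 100).
def make_safe_filename(url: str) -> str:
    name = url.replace('https://', '').replace('http://', '')
    name = name.replace('/', '_').replace(':', '_')
    name = ''.join(c for c in name if c.isalnum() or c in '._-')
    return name[:100]  # limit length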


async def test_web_research_integration():
    """Test the integrated web research functionality."""
    try:
        print("=" * 60)
        print("WEB RESEARCH INTEGRATION TEST")
        print("=" * 60)

        # Config sanity check (non-verbose)
        try:
            from modules.shared.configuration import APP_CONFIG
            env_type = APP_CONFIG.get('APP_ENV_TYPE')
            print(f"Environment: {env_type}")
        except Exception as e:
            print(f"Configuration loading failed: {e}")
            return

        # Initialize the AI service
        print("Initializing AI service...")
        try:
            ai_service = await asyncio.wait_for(AiService.create(), timeout=30.0)
            print("AI service initialized")
        except asyncio.TimeoutError:
            print("AI service initialization timed out after 30 seconds")
            return
        except Exception as e:
            print(f"AI service initialization failed: {e}")
            traceback.print_exc()
            return

        # Test 1: Basic web research
        print("\n" + "=" * 60)
        print("TEST 1: Basic Web Research")
        print("=" * 60)

        request = WebResearchRequest(
            # Alternative query (German): "Can you make me a list of which
            # plots of land are currently for sale in the canton of Zurich?"
            # search_query="Kannst Du mir eine Liste machen, welche Grundstücke aktuell im Kanton Zürich verkauft werden?",
            # Query (German): "Create a company profile of ValueOn AG in Switzerland"
            search_query="Erstelle mir ein Firmenprofil von ValueOn AG in der Schweiz",
            max_results=10,
            options=WebResearchOptions(
                max_pages=10,
                search_depth="basic",
                extract_depth="advanced",
                format="markdown",
                return_report=True,
                pages_search_depth=2  # Test with depth 2: main pages + sub-pages
            )
        )
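        # NOTE (assumption): search_depth/extract_depth presumably map to the
        # Tavily search and extract API parameters of the same names, and
        # pages_search_depth to how many link levels are followed per result
        # page; the authoritative semantics live in WebResearchOptions itself.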

        print("Starting web research call...")
        try:
            result = await ai_service.webResearch(request)
        except Exception as e:
            print(f"Web research failed: {e}")
            traceback.print_exc()
            return

        if result.success:
            print("Web research completed successfully")
            doc = result.documents[0].documentData
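            # Response-shape assumption: at least one document comes back and
            # its documentData carries the aggregated fields used below
            # (websites_analyzed, additional_links_found, sources,
            # additional_links, analysis_result, individual_content).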
            print(f"Websites analyzed: {doc.websites_analyzed}")
            print(f"Additional links found: {doc.additional_links_found}")

            # Show main URLs and sub-URLs for visibility
            print("Main URLs (sources):")
            for i, src in enumerate(doc.sources, 1):
                try:
                    print(f" {i}. {src.title} - {src.url}")
                except Exception:
                    print(f" {i}. {src}")
            print("Sub-URLs (additional links):")
            for i, link in enumerate(doc.additional_links, 1):
                print(f" {i}. {link}")
            print(f"Analysis result length: {len(doc.analysis_result)} characters")

            # Save the combined result to a Markdown file
            output_file = gateway_dir / "test_web_integration_result.md"
            with open(output_file, 'w', encoding='utf-8') as f:
                f.write("# Web Research Integration Test Result\n\n")
                f.write(f"**Query:** {request.search_query}\n\n")
                f.write(f"**Websites Analyzed:** {doc.websites_analyzed}\n")
                f.write(f"**Additional Links Found:** {doc.additional_links_found}\n\n")
                f.write("## Analysis Result\n\n")
                f.write(doc.analysis_result)
                f.write("\n\n## Sources\n\n")
                for source in doc.sources:
                    try:
                        f.write(f"- [{source.title}]({source.url})\n")
                    except Exception:
                        f.write(f"- {source}\n")
                f.write("\n## Additional Links\n\n")
                for link in doc.additional_links:
                    f.write(f"- {link}\n")

            print(f"Result saved to: {output_file}")

            # Save individual website content files into a timestamped
            # directory so repeated runs do not overwrite each other
            print("\nSaving individual website content files...")
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            test_dir = f"test_web_content_{timestamp}"
            os.makedirs(test_dir, exist_ok=True)

            website_files = []
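
            # doc.sources may hold either source objects (.title/.url) or
            # plain strings, hence the defensive try/except in the loop below.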

            # Save main URLs
            for i, src in enumerate(doc.sources, 1):
                try:
                    url = src.url
                    title = src.title
                except Exception:
                    url = str(src)
                    title = "Unknown"

                # Create safe filename from URL
                filename = f"{test_dir}/main_url_{i:03d}_{make_safe_filename(url)}.txt"

                with open(filename, 'w', encoding='utf-8') as f:
                    f.write(f"Title: {title}\n")
                    f.write(f"URL: {url}\n")
                    f.write("Type: Main URL\n")
                    f.write(f"Extracted: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")
                    f.write("=" * 80 + "\n\n")

                    # Write actual content if available
                    if hasattr(doc, 'individual_content') and doc.individual_content and url in doc.individual_content:
                        f.write(doc.individual_content[url])
                    else:
                        f.write("Content not available in individual_content field.\n")
                        f.write("The combined analysis is available in the main result file.\n")

                website_files.append(filename)
                print(f" Created: {filename}")

            # Save additional links
            for i, link in enumerate(doc.additional_links, 1):
                # Create safe filename from URL
                filename = f"{test_dir}/additional_link_{i:03d}_{make_safe_filename(link)}.txt"

                with open(filename, 'w', encoding='utf-8') as f:
                    f.write(f"URL: {link}\n")
                    f.write("Type: Additional Link\n")
                    f.write(f"Extracted: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")
                    f.write("=" * 80 + "\n\n")

                    # Write actual content if available
                    if hasattr(doc, 'individual_content') and doc.individual_content and link in doc.individual_content:
                        f.write(doc.individual_content[link])
                    else:
                        f.write("Content not available in individual_content field.\n")
                        f.write("The combined analysis is available in the main result file.\n")

                website_files.append(filename)
                print(f" Created: {filename}")

            print(f"\nAll files saved to directory: {test_dir}")
            print(f"Main result file: {output_file}")
            print(f"Individual website files: {len(website_files)} files")

        else:
            print(f"Web research failed: {result.error}")

        print("\nIntegration test completed!")

    except Exception as e:
        print(f"Integration test failed: {e}")
        traceback.print_exc()


if __name__ == "__main__":
    # Guard the whole test with a hard timeout so a hang cannot block forever
    try:
        asyncio.run(asyncio.wait_for(test_web_research_integration(), timeout=600.0))
    except asyncio.TimeoutError:
        print("Test timed out after 600 seconds - likely hanging during processing")
    except Exception as e:
        print(f"Test failed with error: {e}")
        traceback.print_exc()
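
# To run: execute this file directly from the gateway directory. It exercises
# the live services, so the required configuration (e.g. the Tavily
# credentials the connector presumably needs) must be in place beforehand.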