#!/usr/bin/env python3
"""
Test the integrated web research functionality.

This tests the complete workflow from methodAi.py through to the services.
"""

import asyncio
import logging
import os
import sys
import traceback
from datetime import datetime
from pathlib import Path

# Add the gateway directory to the Python path
gateway_dir = Path(__file__).parent
sys.path.insert(0, str(gateway_dir))

# Import the required modules
from modules.services.serviceAi.mainServiceAi import AiService
from modules.datamodels.datamodelWeb import WebResearchRequest, WebResearchOptions

# Configure logging with detailed output on stdout
logging.basicConfig(
    level=logging.DEBUG,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    handlers=[
        logging.StreamHandler(sys.stdout)
    ]
)
logger = logging.getLogger(__name__)

# Raise the main service and connector loggers to DEBUG as well
main_service_logger = logging.getLogger('modules.services.serviceAi.mainServiceAi')
main_service_logger.setLevel(logging.DEBUG)

tavily_logger = logging.getLogger('modules.connectors.connectorAiTavily')
tavily_logger.setLevel(logging.DEBUG)

interface_logger = logging.getLogger('modules.interfaces.interfaceAiObjects')
interface_logger.setLevel(logging.DEBUG)
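

# Helper mirroring the inline sanitization used twice below: build a
# filesystem-safe file name from a URL (scheme stripped, path separators and
# colons replaced, anything outside [alnum._-] dropped, length capped at 100).
def make_safe_filename(url: str) -> str:
    name = url.replace('https://', '').replace('http://', '')
    name = name.replace('/', '_').replace(':', '_')
    name = ''.join(c for c in name if c.isalnum() or c in '._-')
    return name[:100]  # limit length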


async def test_web_research_integration():
    """Test the integrated web research functionality."""
    try:
        print("=" * 60)
        print("WEB RESEARCH INTEGRATION TEST")
        print("=" * 60)

        # Config sanity check (non-verbose)
        try:
            from modules.shared.configuration import APP_CONFIG
            env_type = APP_CONFIG.get('APP_ENV_TYPE')
            print(f"Environment: {env_type}")
        except Exception as e:
            print(f"Configuration loading failed: {e}")
            return

        # Initialize the AI service
        print("Initializing AI service...")
        try:
            ai_service = await asyncio.wait_for(AiService.create(), timeout=30.0)
            print("AI service initialized")
        except asyncio.TimeoutError:
            print("AI service initialization timed out after 30 seconds")
            return
        except Exception as e:
            print(f"AI service initialization failed: {e}")
            traceback.print_exc()
            return

        # Test 1: Basic web research
        print("\n" + "=" * 60)
        print("TEST 1: Basic Web Research")
        print("=" * 60)

        request = WebResearchRequest(
            # Alternative query (German): "Can you make me a list of which
            # plots of land are currently for sale in the canton of Zurich?"
            # search_query="Kannst Du mir eine Liste machen, welche Grundstücke aktuell im Kanton Zürich verkauft werden?",
            # Query (German): "Create a company profile of ValueOn AG in Switzerland"
            search_query="Erstelle mir ein Firmenprofil von ValueOn AG in der Schweiz",
            max_results=10,
            options=WebResearchOptions(
                max_pages=10,
                search_depth="basic",
                extract_depth="advanced",
                format="markdown",
                return_report=True,
                pages_search_depth=2  # Test with depth 2: main pages + sub-pages
            )
        )
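        # NOTE (assumption): search_depth/extract_depth presumably map to the
        # Tavily search and extract API parameters of the same names, and
        # pages_search_depth to how many link levels are followed per result
        # page; the authoritative semantics live in WebResearchOptions itself.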

        print("Starting web research call...")
        try:
            result = await ai_service.webResearch(request)
        except Exception as e:
            print(f"Web research failed: {e}")
            traceback.print_exc()
            return

        if result.success:
            print("Web research completed successfully")
            doc = result.documents[0].documentData
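            # Response-shape assumption: at least one document comes back and
            # its documentData carries the aggregated fields used below
            # (websites_analyzed, additional_links_found, sources,
            # additional_links, analysis_result, individual_content).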
            print(f"Websites analyzed: {doc.websites_analyzed}")
            print(f"Additional links found: {doc.additional_links_found}")

            # Show main URLs and sub-URLs for visibility
            print("Main URLs (sources):")
            for i, src in enumerate(doc.sources, 1):
                try:
                    print(f" {i}. {src.title} - {src.url}")
                except Exception:
                    print(f" {i}. {src}")
            print("Sub-URLs (additional links):")
            for i, link in enumerate(doc.additional_links, 1):
                print(f" {i}. {link}")
            print(f"Analysis result length: {len(doc.analysis_result)} characters")

            # Save the combined result to a Markdown file
            output_file = gateway_dir / "test_web_integration_result.md"
            with open(output_file, 'w', encoding='utf-8') as f:
                f.write("# Web Research Integration Test Result\n\n")
                f.write(f"**Query:** {request.search_query}\n\n")
                f.write(f"**Websites Analyzed:** {doc.websites_analyzed}\n")
                f.write(f"**Additional Links Found:** {doc.additional_links_found}\n\n")
                f.write("## Analysis Result\n\n")
                f.write(doc.analysis_result)
                f.write("\n\n## Sources\n\n")
                for source in doc.sources:
                    try:
                        f.write(f"- [{source.title}]({source.url})\n")
                    except Exception:
                        f.write(f"- {source}\n")
                f.write("\n## Additional Links\n\n")
                for link in doc.additional_links:
                    f.write(f"- {link}\n")

            print(f"Result saved to: {output_file}")

            # Save individual website content files into a timestamped
            # directory so repeated runs do not overwrite each other
            print("\nSaving individual website content files...")
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            test_dir = f"test_web_content_{timestamp}"
            os.makedirs(test_dir, exist_ok=True)

            website_files = []
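
            # doc.sources may hold either source objects (.title/.url) or
            # plain strings, hence the defensive try/except in the loop below.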

            # Save main URLs
            for i, src in enumerate(doc.sources, 1):
                try:
                    url = src.url
                    title = src.title
                except Exception:
                    url = str(src)
                    title = "Unknown"

                # Create safe filename from URL
                filename = f"{test_dir}/main_url_{i:03d}_{make_safe_filename(url)}.txt"

                with open(filename, 'w', encoding='utf-8') as f:
                    f.write(f"Title: {title}\n")
                    f.write(f"URL: {url}\n")
                    f.write("Type: Main URL\n")
                    f.write(f"Extracted: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")
                    f.write("=" * 80 + "\n\n")

                    # Write actual content if available
                    if hasattr(doc, 'individual_content') and doc.individual_content and url in doc.individual_content:
                        f.write(doc.individual_content[url])
                    else:
                        f.write("Content not available in individual_content field.\n")
                        f.write("The combined analysis is available in the main result file.\n")

                website_files.append(filename)
                print(f" Created: {filename}")

            # Save additional links
            for i, link in enumerate(doc.additional_links, 1):
                # Create safe filename from URL
                filename = f"{test_dir}/additional_link_{i:03d}_{make_safe_filename(link)}.txt"

                with open(filename, 'w', encoding='utf-8') as f:
                    f.write(f"URL: {link}\n")
                    f.write("Type: Additional Link\n")
                    f.write(f"Extracted: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")
                    f.write("=" * 80 + "\n\n")

                    # Write actual content if available
                    if hasattr(doc, 'individual_content') and doc.individual_content and link in doc.individual_content:
                        f.write(doc.individual_content[link])
                    else:
                        f.write("Content not available in individual_content field.\n")
                        f.write("The combined analysis is available in the main result file.\n")

                website_files.append(filename)
                print(f" Created: {filename}")

            print(f"\nAll files saved to directory: {test_dir}")
            print(f"Main result file: {output_file}")
            print(f"Individual website files: {len(website_files)} files")

        else:
            print(f"Web research failed: {result.error}")

        print("\nIntegration test completed!")

    except Exception as e:
        print(f"Integration test failed: {e}")
        traceback.print_exc()


if __name__ == "__main__":
    # Guard the whole test with a hard timeout so a hang cannot block forever
    try:
        asyncio.run(asyncio.wait_for(test_web_research_integration(), timeout=600.0))
    except asyncio.TimeoutError:
        print("Test timed out after 600 seconds - likely hanging during processing")
    except Exception as e:
        print(f"Test failed with error: {e}")
        traceback.print_exc()
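
# To run: execute this file directly from the gateway directory. It exercises
# the live services, so the required configuration (e.g. the Tavily
# credentials the connector presumably needs) must be in place beforehand.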