gateway/modules/services/serviceAi/subWebResearch.py

import logging
from typing import Dict, Any, List, Optional, Tuple, Union
from modules.aicore.aicorePluginTavily import WebResearchRequest, WebResearchResult
from modules.interfaces.interfaceAiObjects import AiObjects
from modules.shared.configuration import APP_CONFIG

logger = logging.getLogger(__name__)


class SubWebResearch:
    """Web research operations including search, crawling, and analysis."""

    def __init__(self, services, aiObjects):
        """Initialize web research service.

        Args:
            services: Service center instance for accessing other services
            aiObjects: Initialized AiObjects instance
        """
        self.services = services
        self.aiObjects = aiObjects

    async def webResearch(self, request: WebResearchRequest) -> WebResearchResult:
        """Perform web research using interface functions."""
        try:
            logger.info(f"WEB RESEARCH STARTED")
            logger.info(f"User Query: {request.user_prompt}")
            logger.info(f"Max Results: {request.max_results}, Max Pages: {request.options.max_pages}")

            # Global URL index to track all processed URLs across the entire research session
            global_processed_urls = set()

            # Step 1: Find relevant websites - either provided URLs or AI-determined main URLs
            logger.info(f"=== STEP 1: INITIAL MAIN URLS LIST ===")

            if request.urls:
                # Use provided URLs as initial main URLs
                websites = request.urls
                logger.info(f"Using provided URLs ({len(websites)}):")
                for i, url in enumerate(websites, 1):
                    logger.info(f"   {i}. {url}")
            else:
                # Use AI to determine main URLs based on user's intention
                logger.info(f"AI analyzing user intent: '{request.user_prompt}'")

                # Use AI to generate optimized Tavily search query and search parameters
                query_optimizer_prompt = f"""You are a search query optimizer.

        USER QUERY: {request.user_prompt}

        Your task: Create a search query and parameters for the USER QUERY given.

        RULES:
        1. The search query MUST be related to the user query above
        2. Extract key terms from the user query
        3. Determine appropriate country/language based on the query context
        4. Keep search query short (2-6 words)

        Return ONLY this JSON format:
        {{
        "user_prompt": "search query based on user query above",
        "country": "Full English country name (ISO-3166; map codes via pycountry/i18n-iso-countries)",
        "language": "language_code_or_null",
        "topic": "general|news|academic_or_null",
        "time_range": "d|w|m|y_or_null",
        "selection_strategy": "single|multiple|specific_page",
        "selection_criteria": "what URLs to prioritize",
        "expected_url_patterns": ["pattern1", "pattern2"],
        "estimated_result_count": number
        }}"""

                # Get AI response for query optimization
                from modules.datamodels.datamodelAi import AiCallRequest, AiCallOptions
                ai_request = AiCallRequest(
                    prompt=query_optimizer_prompt,
                    options=AiCallOptions()
                )

                # Write web research query optimization prompt to debug file
                self.services.utils.writeDebugFile(query_optimizer_prompt, "web_research_query_optimizer_prompt")

                ai_response_obj = await self.aiObjects.call(ai_request)
                ai_response = ai_response_obj.content

                # Write web research query optimization response to debug file
                self.services.utils.writeDebugFile(ai_response, "web_research_query_optimizer_response")
                logger.debug(f"AI query optimizer response: {ai_response}")

                # Parse AI response to extract search query
                import json
                try:
                    # Clean the response by removing markdown code blocks
                    cleaned_response = ai_response.strip()
                    if cleaned_response.startswith('```json'):
                        cleaned_response = cleaned_response[7:]  # Remove ```json
                    if cleaned_response.endswith('```'):
                        cleaned_response = cleaned_response[:-3]  # Remove ```
                    cleaned_response = cleaned_response.strip()

                    query_data = json.loads(cleaned_response)
                    search_query = query_data.get("user_prompt", request.user_prompt)
                    ai_country = query_data.get("country")
                    ai_language = query_data.get("language")
                    ai_topic = query_data.get("topic")
                    ai_time_range = query_data.get("time_range")
                    selection_strategy = query_data.get("selection_strategy", "multiple")
                    selection_criteria = query_data.get("selection_criteria", "relevant URLs")
                    expected_patterns = query_data.get("expected_url_patterns", [])
                    estimated_count = query_data.get("estimated_result_count", request.max_results)

                    logger.info(f"AI optimized search query: '{search_query}'")
                    logger.info(f"Selection strategy: {selection_strategy}")
                    logger.info(f"Selection criteria: {selection_criteria}")
                    logger.info(f"Expected URL patterns: {expected_patterns}")
                    logger.info(f"Estimated result count: {estimated_count}")

                except json.JSONDecodeError:
                    logger.warning("Failed to parse AI response as JSON, using original query")
                    search_query = request.user_prompt
                    ai_country = None
                    ai_language = None
                    ai_topic = None
                    ai_time_range = None
                    selection_strategy = "multiple"

                # Perform the web search with AI-determined parameters
                search_kwargs = {
                    "query": search_query,
                    "max_results": request.max_results,
                    "search_depth": request.options.search_depth,
                    "auto_parameters": False  # Use explicit parameters
                }

                # Add parameters only if they have valid values
                def _normalizeCountry(c: Optional[str]) -> Optional[str]:
                    if not c:
                        return None
                    s = str(c).strip()
                    if not s or s.lower() in ['null', 'none', 'undefined']:
                        return None
                    # Map common codes to full English names when easy to do without extra deps
                    mapping = {
                        'ch': 'Switzerland', 'che': 'Switzerland',
                        'de': 'Germany', 'ger': 'Germany', 'deu': 'Germany',
                        'at': 'Austria', 'aut': 'Austria',
                        'us': 'United States', 'usa': 'United States', 'uni ted states': 'United States',
                        'uk': 'United Kingdom', 'gb': 'United Kingdom', 'gbr': 'United Kingdom'
                    }
                    key = s.lower()
                    if key in mapping:
                        return mapping[key]
                    # If looks like full name, capitalize first letter only (Tavily accepts English names)
                    return s

                norm_ai_country = _normalizeCountry(ai_country)
                norm_req_country = _normalizeCountry(request.options.country)
                if norm_ai_country:
                    search_kwargs["country"] = norm_ai_country
                elif norm_req_country:
                    search_kwargs["country"] = norm_req_country

                if ai_language and ai_language not in ['null', '', 'none', 'undefined']:
                    search_kwargs["language"] = ai_language
                elif request.options.language and request.options.language not in ['null', '', 'none', 'undefined']:
                    search_kwargs["language"] = request.options.language

                if ai_topic and ai_topic in ['general', 'news', 'academic']:
                    search_kwargs["topic"] = ai_topic
                elif request.options.topic and request.options.topic in ['general', 'news', 'academic']:
                    search_kwargs["topic"] = request.options.topic

                if ai_time_range and ai_time_range in ['d', 'w', 'm', 'y']:
                    search_kwargs["time_range"] = ai_time_range
                elif request.options.time_range and request.options.time_range in ['d', 'w', 'm', 'y']:
                    search_kwargs["time_range"] = request.options.time_range

                # Constrain by expected domains if provided by AI
                try:
                    include_domains = []
                    for p in expected_patterns or []:
                        if not isinstance(p, str):
                            continue
                        # Extract bare domain from pattern or URL
                        import re
                        m = re.search(r"(?:https?://)?([^/\s]+)", p.strip())
                        if m:
                            domain = m.group(1).lower()
                            # strip leading www.
                            if domain.startswith('www.'):
                                domain = domain[4:]
                            include_domains.append(domain)
                    # Deduplicate
                    if include_domains:
                        seen = set()
                        uniq = []
                        for d in include_domains:
                            if d not in seen:
                                seen.add(d)
                                uniq.append(d)
                        search_kwargs["include_domains"] = uniq
                except Exception:
                    pass

                # Log the parameters being used
                logger.info(f"Search parameters: country={search_kwargs.get('country', 'not_set')}, language={search_kwargs.get('language', 'not_set')}, topic={search_kwargs.get('topic', 'not_set')}, time_range={search_kwargs.get('time_range', 'not_set')}, include_domains={search_kwargs.get('include_domains', [])}")

                search_results = await self.aiObjects.search_websites(**search_kwargs)

                logger.debug(f"Web search returned {len(search_results)} results:")
                for i, result in enumerate(search_results, 1):
                    logger.debug(f"   {i}. {result.url} - {result.title}")

                # Deduplicate while preserving order
                seen = set()
                search_urls = []
                for r in search_results:
                    u = str(r.url)
                    if u not in seen:
                        seen.add(u)
                        search_urls.append(u)

                logger.info(f"After initial deduplication: {len(search_urls)} unique URLs from {len(search_results)} search results")

                if not search_urls:
                    logger.error("No relevant websites found")
                    return WebResearchResult(success=False, error="No relevant websites found")

                # Now use AI to determine the main URLs based on user's intention
                logger.info(f"AI selecting main URLs from {len(search_urls)} search results based on user intent")

                # Create a prompt for AI to identify main URLs based on user's intention
                ai_prompt = f"""
        Select the most relevant URLs from these search results:

        {chr(10).join([f"{i+1}. {url}" for i, url in enumerate(search_urls)])}

        Return only the URLs that are most relevant for the user's query.
        One URL per line.
        """
                # Create AI call request
                ai_request = AiCallRequest(
                    prompt=ai_prompt,
                    options=AiCallOptions()
                )

                # Write web research URL selection prompt to debug file
                self.services.utils.writeDebugFile(ai_prompt, "web_research_url_selection_prompt")

                ai_response_obj = await self.aiObjects.call(ai_request)
                ai_response = ai_response_obj.content

                # Write web research URL selection response to debug file
                self.services.utils.writeDebugFile(ai_response, "web_research_url_selection_response")
                logger.debug(f"AI response for main URL selection: {ai_response}")

                # Parse AI response to extract URLs
                websites = []
                for line in ai_response.strip().split('\n'):
                    line = line.strip()
                    if line and ('http://' in line or 'https://' in line):
                        # Extract URL from the line
                        for word in line.split():
                            if word.startswith('http://') or word.startswith('https://'):
                                websites.append(word.rstrip('.,;'))
                                break

                if not websites:
                    logger.warning("AI did not identify any main URLs, using first few search results")
                    websites = search_urls[:3]  # Fallback to first 3 search results

                # Deduplicate while preserving order
                seen = set()
                unique_websites = []
                for url in websites:
                    if url not in seen:
                        seen.add(url)
                        unique_websites.append(url)

                websites = unique_websites
                logger.info(f"After AI selection deduplication: {len(websites)} unique URLs from {len(websites)} AI-selected URLs")

                logger.info(f"AI selected {len(websites)} main URLs (after deduplication):")
                for i, url in enumerate(websites, 1):
                    logger.info(f"   {i}. {url}")

            # Step 2: Smart website selection using AI interface
            logger.info(f"=== STEP 2: FILTERED URL LIST BY USER PROMPT'S INTENTION ===")
            logger.info(f"AI analyzing {len(websites)} URLs for relevance to: '{request.user_prompt}'")

            selectedWebsites, aiResponse = await self.aiObjects.selectRelevantWebsites(websites, request.user_prompt)

            logger.debug(f"AI Response: {aiResponse}")
            logger.debug(f"AI selected {len(selectedWebsites)} most relevant URLs:")
            for i, url in enumerate(selectedWebsites, 1):
                logger.debug(f"   {i}. {url}")

            # Show which were filtered out
            filtered_out = [url for url in websites if url not in selectedWebsites]
            if filtered_out:
                logger.debug(f"Filtered out {len(filtered_out)} less relevant URLs:")
                for i, url in enumerate(filtered_out, 1):
                    logger.debug(f"   {i}. {url}")

            # Step 3+4+5: Recursive crawling with configurable depth
            # Get configuration parameters
            max_depth = int(APP_CONFIG.get("Web_Research_MAX_DEPTH", "2"))
            max_links_per_domain = int(APP_CONFIG.get("Web_Research_MAX_LINKS_PER_DOMAIN", "4"))
            crawl_timeout_minutes = int(APP_CONFIG.get("Web_Research_CRAWL_TIMEOUT_MINUTES", "10"))
            crawl_timeout_seconds = crawl_timeout_minutes * 60

            # Use the configured max_depth or the request's pages_search_depth, whichever is smaller
            effective_depth = min(max_depth, request.options.pages_search_depth)

            logger.info(f"=== STEP 3+4+5: RECURSIVE CRAWLING (DEPTH {effective_depth}) ===")
            logger.info(f"Starting recursive crawl of {len(selectedWebsites)} main websites...")
            logger.info(f"Search depth: {effective_depth} levels (max configured: {max_depth})")
            logger.info(f"Max links per domain: {max_links_per_domain}")
            logger.info(f"Crawl timeout: {crawl_timeout_minutes} minutes")

            # Use recursive crawling with URL index to avoid duplicates
            import asyncio
            try:
                allContent = await asyncio.wait_for(
                    self.aiObjects.crawlRecursively(
                        urls=selectedWebsites,
                        max_depth=effective_depth,
                        extract_depth=request.options.extract_depth,
                        max_per_domain=max_links_per_domain,
                        global_processed_urls=global_processed_urls
                    ),
                    timeout=crawl_timeout_seconds
                )
                logger.info(f"Crawling completed within timeout: {len(allContent)} pages crawled")
            except asyncio.TimeoutError:
                logger.warning(f"Crawling timed out after {crawl_timeout_minutes} minutes, using partial results")
                # crawlRecursively now handles timeouts gracefully and returns partial results
                # Try to get the partial results that were collected
                allContent = {}

            if not allContent:
                logger.error("Could not extract content from any websites")
                return WebResearchResult(success=False, error="Could not extract content from any websites")

            logger.info(f"=== WEB RESEARCH COMPLETED ===")
            logger.info(f"Successfully crawled {len(allContent)} URLs total")
            logger.info(f"Crawl depth: {effective_depth} levels")

            # Create simple result with raw content
            sources = [{"title": url, "url": url} for url in selectedWebsites]

            # Get all additional links (all URLs except main ones)
            additional_links = [url for url in allContent.keys() if url not in selectedWebsites]

            # Combine all content into a single result
            combinedContent = ""
            for url, content in allContent.items():
                combinedContent += f"\n\n=== {url} ===\n{content}\n"

            # Create simplified document structure
            document = {
                "documentName": f"webResearch_{request.user_prompt[:50]}.json",
                "documentData": {
                    "user_prompt": request.user_prompt,
                    "analysis_result": combinedContent,
                    "sources": sources,
                    "additional_links": additional_links,
                    "metadata": {
                        "websites_analyzed": len(allContent),
                        "additional_links_found": len(additional_links),
                        "crawl_depth": effective_depth,
                        "max_configured_depth": max_depth,
                        "max_links_per_domain": max_links_per_domain,
                        "crawl_timeout_minutes": crawl_timeout_minutes,
                        "total_urls_crawled": len(allContent),
                        "main_urls": len(selectedWebsites),
                        "additional_urls": len(additional_links)
                    }
                },
                "mimeType": "application/json"
            }

            return WebResearchResult(
                success=True,
                documents=[document]
            )

        except Exception as e:
            logger.error(f"Error in web research: {str(e)}")
            return WebResearchResult(success=False, error=str(e))