import logging from typing import Dict, Any, List, Optional, Tuple, Union from modules.datamodels.datamodelWeb import ( WebResearchRequest, WebResearchActionResult, WebResearchDocumentData, WebResearchActionDocument, WebSearchResultItem, ) from modules.interfaces.interfaceAiObjects import AiObjects from modules.shared.configuration import APP_CONFIG logger = logging.getLogger(__name__) class SubWebResearch: """Web research operations including search, crawling, and analysis.""" def __init__(self, services, aiObjects): """Initialize web research service. Args: services: Service center instance for accessing other services aiObjects: Initialized AiObjects instance """ self.services = services self.aiObjects = aiObjects async def webResearch(self, request: WebResearchRequest) -> WebResearchActionResult: """Perform web research using interface functions.""" try: logger.info(f"WEB RESEARCH STARTED") logger.info(f"User Query: {request.user_prompt}") logger.info(f"Max Results: {request.max_results}, Max Pages: {request.options.max_pages}") # Global URL index to track all processed URLs across the entire research session global_processed_urls = set() # Step 1: Find relevant websites - either provided URLs or AI-determined main URLs logger.info(f"=== STEP 1: INITIAL MAIN URLS LIST ===") if request.urls: # Use provided URLs as initial main URLs websites = request.urls logger.info(f"Using provided URLs ({len(websites)}):") for i, url in enumerate(websites, 1): logger.info(f" {i}. {url}") else: # Use AI to determine main URLs based on user's intention logger.info(f"AI analyzing user intent: '{request.user_prompt}'") # Use AI to generate optimized Tavily search query and search parameters query_optimizer_prompt = f"""You are a search query optimizer. USER QUERY: {request.user_prompt} Your task: Create a search query and parameters for the USER QUERY given. RULES: 1. The search query MUST be related to the user query above 2. Extract key terms from the user query 3. Determine appropriate country/language based on the query context 4. Keep search query short (2-6 words) Return ONLY this JSON format: {{ "user_prompt": "search query based on user query above", "country": "Full English country name (ISO-3166; map codes via pycountry/i18n-iso-countries)", "language": "language_code_or_null", "topic": "general|news|academic_or_null", "time_range": "d|w|m|y_or_null", "selection_strategy": "single|multiple|specific_page", "selection_criteria": "what URLs to prioritize", "expected_url_patterns": ["pattern1", "pattern2"], "estimated_result_count": number }}""" # Get AI response for query optimization from modules.datamodels.datamodelAi import AiCallRequest, AiCallOptions ai_request = AiCallRequest( prompt=query_optimizer_prompt, options=AiCallOptions() ) ai_response_obj = await self.aiObjects.call(ai_request) ai_response = ai_response_obj.content logger.debug(f"AI query optimizer response: {ai_response}") # Parse AI response to extract search query import json try: # Clean the response by removing markdown code blocks cleaned_response = ai_response.strip() if cleaned_response.startswith('```json'): cleaned_response = cleaned_response[7:] # Remove ```json if cleaned_response.endswith('```'): cleaned_response = cleaned_response[:-3] # Remove ``` cleaned_response = cleaned_response.strip() query_data = json.loads(cleaned_response) search_query = query_data.get("user_prompt", request.user_prompt) ai_country = query_data.get("country") ai_language = query_data.get("language") ai_topic = query_data.get("topic") ai_time_range = query_data.get("time_range") selection_strategy = query_data.get("selection_strategy", "multiple") selection_criteria = query_data.get("selection_criteria", "relevant URLs") expected_patterns = query_data.get("expected_url_patterns", []) estimated_count = query_data.get("estimated_result_count", request.max_results) logger.info(f"AI optimized search query: '{search_query}'") logger.info(f"Selection strategy: {selection_strategy}") logger.info(f"Selection criteria: {selection_criteria}") logger.info(f"Expected URL patterns: {expected_patterns}") logger.info(f"Estimated result count: {estimated_count}") except json.JSONDecodeError: logger.warning("Failed to parse AI response as JSON, using original query") search_query = request.user_prompt ai_country = None ai_language = None ai_topic = None ai_time_range = None selection_strategy = "multiple" # Perform the web search with AI-determined parameters search_kwargs = { "query": search_query, "max_results": request.max_results, "search_depth": request.options.search_depth, "auto_parameters": False # Use explicit parameters } # Add parameters only if they have valid values def _normalizeCountry(c: Optional[str]) -> Optional[str]: if not c: return None s = str(c).strip() if not s or s.lower() in ['null', 'none', 'undefined']: return None # Map common codes to full English names when easy to do without extra deps mapping = { 'ch': 'Switzerland', 'che': 'Switzerland', 'de': 'Germany', 'ger': 'Germany', 'deu': 'Germany', 'at': 'Austria', 'aut': 'Austria', 'us': 'United States', 'usa': 'United States', 'uni ted states': 'United States', 'uk': 'United Kingdom', 'gb': 'United Kingdom', 'gbr': 'United Kingdom' } key = s.lower() if key in mapping: return mapping[key] # If looks like full name, capitalize first letter only (Tavily accepts English names) return s norm_ai_country = _normalizeCountry(ai_country) norm_req_country = _normalizeCountry(request.options.country) if norm_ai_country: search_kwargs["country"] = norm_ai_country elif norm_req_country: search_kwargs["country"] = norm_req_country if ai_language and ai_language not in ['null', '', 'none', 'undefined']: search_kwargs["language"] = ai_language elif request.options.language and request.options.language not in ['null', '', 'none', 'undefined']: search_kwargs["language"] = request.options.language if ai_topic and ai_topic in ['general', 'news', 'academic']: search_kwargs["topic"] = ai_topic elif request.options.topic and request.options.topic in ['general', 'news', 'academic']: search_kwargs["topic"] = request.options.topic if ai_time_range and ai_time_range in ['d', 'w', 'm', 'y']: search_kwargs["time_range"] = ai_time_range elif request.options.time_range and request.options.time_range in ['d', 'w', 'm', 'y']: search_kwargs["time_range"] = request.options.time_range # Constrain by expected domains if provided by AI try: include_domains = [] for p in expected_patterns or []: if not isinstance(p, str): continue # Extract bare domain from pattern or URL import re m = re.search(r"(?:https?://)?([^/\s]+)", p.strip()) if m: domain = m.group(1).lower() # strip leading www. if domain.startswith('www.'): domain = domain[4:] include_domains.append(domain) # Deduplicate if include_domains: seen = set() uniq = [] for d in include_domains: if d not in seen: seen.add(d) uniq.append(d) search_kwargs["include_domains"] = uniq except Exception: pass # Log the parameters being used logger.info(f"Search parameters: country={search_kwargs.get('country', 'not_set')}, language={search_kwargs.get('language', 'not_set')}, topic={search_kwargs.get('topic', 'not_set')}, time_range={search_kwargs.get('time_range', 'not_set')}, include_domains={search_kwargs.get('include_domains', [])}") search_results = await self.aiObjects.search_websites(**search_kwargs) logger.debug(f"Web search returned {len(search_results)} results:") for i, result in enumerate(search_results, 1): logger.debug(f" {i}. {result.url} - {result.title}") # Deduplicate while preserving order seen = set() search_urls = [] for r in search_results: u = str(r.url) if u not in seen: seen.add(u) search_urls.append(u) logger.info(f"After initial deduplication: {len(search_urls)} unique URLs from {len(search_results)} search results") if not search_urls: logger.error("No relevant websites found") return WebResearchActionResult(success=False, error="No relevant websites found") # Now use AI to determine the main URLs based on user's intention logger.info(f"AI selecting main URLs from {len(search_urls)} search results based on user intent") # Create a prompt for AI to identify main URLs based on user's intention ai_prompt = f""" Select the most relevant URLs from these search results: {chr(10).join([f"{i+1}. {url}" for i, url in enumerate(search_urls)])} Return only the URLs that are most relevant for the user's query. One URL per line. """ # Create AI call request ai_request = AiCallRequest( prompt=ai_prompt, options=AiCallOptions() ) ai_response_obj = await self.aiObjects.call(ai_request) ai_response = ai_response_obj.content logger.debug(f"AI response for main URL selection: {ai_response}") # Parse AI response to extract URLs websites = [] for line in ai_response.strip().split('\n'): line = line.strip() if line and ('http://' in line or 'https://' in line): # Extract URL from the line for word in line.split(): if word.startswith('http://') or word.startswith('https://'): websites.append(word.rstrip('.,;')) break if not websites: logger.warning("AI did not identify any main URLs, using first few search results") websites = search_urls[:3] # Fallback to first 3 search results # Deduplicate while preserving order seen = set() unique_websites = [] for url in websites: if url not in seen: seen.add(url) unique_websites.append(url) websites = unique_websites logger.info(f"After AI selection deduplication: {len(websites)} unique URLs from {len(websites)} AI-selected URLs") logger.info(f"AI selected {len(websites)} main URLs (after deduplication):") for i, url in enumerate(websites, 1): logger.info(f" {i}. {url}") # Step 2: Smart website selection using AI interface logger.info(f"=== STEP 2: FILTERED URL LIST BY USER PROMPT'S INTENTION ===") logger.info(f"AI analyzing {len(websites)} URLs for relevance to: '{request.user_prompt}'") selectedWebsites, aiResponse = await self.aiObjects.selectRelevantWebsites(websites, request.user_prompt) logger.debug(f"AI Response: {aiResponse}") logger.debug(f"AI selected {len(selectedWebsites)} most relevant URLs:") for i, url in enumerate(selectedWebsites, 1): logger.debug(f" {i}. {url}") # Show which were filtered out filtered_out = [url for url in websites if url not in selectedWebsites] if filtered_out: logger.debug(f"Filtered out {len(filtered_out)} less relevant URLs:") for i, url in enumerate(filtered_out, 1): logger.debug(f" {i}. {url}") # Step 3+4+5: Recursive crawling with configurable depth # Get configuration parameters max_depth = int(APP_CONFIG.get("Web_Research_MAX_DEPTH", "2")) max_links_per_domain = int(APP_CONFIG.get("Web_Research_MAX_LINKS_PER_DOMAIN", "4")) crawl_timeout_minutes = int(APP_CONFIG.get("Web_Research_CRAWL_TIMEOUT_MINUTES", "10")) crawl_timeout_seconds = crawl_timeout_minutes * 60 # Use the configured max_depth or the request's pages_search_depth, whichever is smaller effective_depth = min(max_depth, request.options.pages_search_depth) logger.info(f"=== STEP 3+4+5: RECURSIVE CRAWLING (DEPTH {effective_depth}) ===") logger.info(f"Starting recursive crawl of {len(selectedWebsites)} main websites...") logger.info(f"Search depth: {effective_depth} levels (max configured: {max_depth})") logger.info(f"Max links per domain: {max_links_per_domain}") logger.info(f"Crawl timeout: {crawl_timeout_minutes} minutes") # Use recursive crawling with URL index to avoid duplicates import asyncio try: allContent = await asyncio.wait_for( self.aiObjects.crawlRecursively( urls=selectedWebsites, max_depth=effective_depth, extract_depth=request.options.extract_depth, max_per_domain=max_links_per_domain, global_processed_urls=global_processed_urls ), timeout=crawl_timeout_seconds ) logger.info(f"Crawling completed within timeout: {len(allContent)} pages crawled") except asyncio.TimeoutError: logger.warning(f"Crawling timed out after {crawl_timeout_minutes} minutes, using partial results") # crawlRecursively now handles timeouts gracefully and returns partial results # Try to get the partial results that were collected allContent = {} if not allContent: logger.error("Could not extract content from any websites") return WebResearchActionResult(success=False, error="Could not extract content from any websites") logger.info(f"=== WEB RESEARCH COMPLETED ===") logger.info(f"Successfully crawled {len(allContent)} URLs total") logger.info(f"Crawl depth: {effective_depth} levels") # Create simple result with raw content sources = [WebSearchResultItem(title=url, url=url) for url in selectedWebsites] # Get all additional links (all URLs except main ones) additional_links = [url for url in allContent.keys() if url not in selectedWebsites] # Combine all content into a single result combinedContent = "" for url, content in allContent.items(): combinedContent += f"\n\n=== {url} ===\n{content}\n" documentData = WebResearchDocumentData( user_prompt=request.user_prompt, websites_analyzed=len(allContent), additional_links_found=len(additional_links), analysis_result=combinedContent, # Raw content, no analysis sources=sources, additional_links=additional_links, individual_content=allContent, # Individual URL -> content mapping debug_info={ "crawl_depth": effective_depth, "max_configured_depth": max_depth, "max_links_per_domain": max_links_per_domain, "crawl_timeout_minutes": crawl_timeout_minutes, "total_urls_crawled": len(allContent), "main_urls": len(selectedWebsites), "additional_urls": len(additional_links) } ) document = WebResearchActionDocument( documentName=f"web_research_{request.user_prompt[:50]}.json", documentData=documentData, mimeType="application/json" ) return WebResearchActionResult( success=True, documents=[document], resultLabel="web_research_results" ) except Exception as e: logger.error(f"Error in web research: {str(e)}") return WebResearchActionResult(success=False, error=str(e))