# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
"""Tavily web search class."""
import logging
import asyncio
import json
import re
import urllib.parse
from dataclasses import dataclass
from typing import Optional, List, Dict

from tavily import AsyncTavilyClient

from modules.shared.configuration import APP_CONFIG
from modules.aicore.aicoreBase import BaseConnectorAi
from modules.datamodels.datamodelAi import AiModel, PriorityEnum, ProcessingModeEnum, OperationTypeEnum, AiModelCall, AiModelResponse, createOperationTypeRatings, AiCallPromptWebSearch, AiCallPromptWebCrawl
from modules.datamodels.datamodelTools import CountryCodes

logger = logging.getLogger(__name__)


@dataclass
class WebSearchResult:
    # One search hit as returned by the Tavily search API.
    title: str
    url: str
    rawContent: Optional[str] = None  # populated only when include_raw_content is requested


@dataclass
class WebCrawlResult:
    # One crawled page as returned by the Tavily crawl API.
    url: str
    content: str
    title: Optional[str] = None


class AiTavily(BaseConnectorAi):
    """Tavily web search connector."""

    def __init__(
        self,
        client: Optional[AsyncTavilyClient] = None,
        crawlTimeout: int = 30,
        crawlMaxRetries: int = 3,
        crawlRetryDelay: int = 2,
        webSearchMinResults: int = 1,
        webSearchMaxResults: int = 20,
    ):
        """
        Initialize the connector.

        All parameters are optional so that plain ``AiTavily()`` keeps its
        previous behavior (settings fall back to the same defaults and the
        client is lazily built from APP_CONFIG). ``create()`` passes explicit
        values read from configuration.

        Args:
            client: Pre-built Tavily client; when None, one is initialized
                from the configured API key.
            crawlTimeout: Per-attempt crawl timeout in seconds.
            crawlMaxRetries: Number of crawl retries after the first attempt.
            crawlRetryDelay: Delay in seconds between crawl retries.
            webSearchMinResults: Lower bound accepted for maxResults in _search.
            webSearchMaxResults: Upper bound accepted for maxResults in _search.
        """
        super().__init__()
        self.client: Optional[AsyncTavilyClient] = client
        # Cached settings loaded at initialization time
        self.crawlTimeout: int = crawlTimeout
        self.crawlMaxRetries: int = crawlMaxRetries
        self.crawlRetryDelay: int = crawlRetryDelay
        # Cached web search constraints (camelCase per project style)
        self.webSearchMinResults: int = webSearchMinResults
        self.webSearchMaxResults: int = webSearchMaxResults
        # Initialize client if API key is available (skip when one was injected)
        if self.client is None:
            self._initializeClient()

    def getModels(self) -> List[AiModel]:
        """Get all available Tavily models."""
        return [
            AiModel(
                name="tavily-search",
                displayName="Tavily Search & Research",
                connectorType="tavily",
                apiUrl="https://api.tavily.com",
                temperature=0.0,  # Web search doesn't use temperature
                maxTokens=0,  # Web search doesn't use tokens
                contextLength=0,
                costPer1kTokensInput=0.0,
                costPer1kTokensOutput=0.0,
                speedRating=8,  # Good speed for search and extract
                qualityRating=9,  # Excellent quality for web research
                # capabilities removed (not used in business logic)
                functionCall=self._routeWebOperation,
                priority=PriorityEnum.BALANCED,
                processingMode=ProcessingModeEnum.BASIC,
                operationTypes=createOperationTypeRatings(
                    (OperationTypeEnum.WEB_SEARCH, 9),
                    (OperationTypeEnum.WEB_CRAWL, 10)
                ),
                version="tavily-search",
                calculatePriceUsd=lambda processingTime, bytesSent, bytesReceived: 0.008  # Simple flat rate
            )
        ]

    def _initializeClient(self):
        """Initialize the Tavily client if API key is available."""
        try:
            apiKey = APP_CONFIG.get("Connector_AiTavily_API_SECRET")
            if apiKey:
                self.client = AsyncTavilyClient(api_key=apiKey)
                logger.info("Tavily client initialized successfully")
            else:
                logger.warning("Tavily API key not found, client not initialized")
        except Exception as e:
            logger.error(f"Failed to initialize Tavily client: {str(e)}")

    def getConnectorType(self) -> str:
        """Get the connector type identifier."""
        return "tavily"

    def _convertIsoCodeToCountryName(self, isoCode: str) -> str:
        """
        Convert ISO-2 country code to Tavily country name.
        Uses centralized CountryCodes mapping.
        """
        return CountryCodes.getForTavily(isoCode)

    def _extractUrlsFromPrompt(self, prompt: str) -> List[str]:
        """Extract URLs from a text prompt using regex."""
        if not prompt:
            return []
        # URL regex pattern - matches http/https URLs
        urlPattern = r'https?://(?:[-\w.])+(?:[:\d]+)?(?:/(?:[\w/_.])*(?:\?(?:[\w&=%.])*)?(?:#(?:[\w.])*)?)?'
        urls = re.findall(urlPattern, prompt)
        # Remove duplicates while preserving order
        seen = set()
        uniqueUrls = []
        for url in urls:
            if url not in seen:
                seen.add(url)
                uniqueUrls.append(url)
        return uniqueUrls

    def _normalizeUrl(self, url: str) -> str:
        """
        Normalize URL for better deduplication.
        Removes common variations that represent the same content.
        """
        if not url:
            return url
        # Remove trailing slashes
        url = url.rstrip('/')
        # Remove common query parameters that don't affect content
        parsed = urllib.parse.urlparse(url)
        # Remove common tracking parameters
        queryParams = urllib.parse.parse_qs(parsed.query)
        filteredParams = {}
        for key, values in queryParams.items():
            # Keep important parameters, remove tracking ones
            if key.lower() not in ['utm_source', 'utm_medium', 'utm_campaign', 'utm_term', 'utm_content', 'fbclid', 'gclid', 'ref', 'source', 'campaign']:
                filteredParams[key] = values
        # Rebuild query string
        filteredQuery = urllib.parse.urlencode(filteredParams, doseq=True)
        # Reconstruct URL
        normalized = urllib.parse.urlunparse((
            parsed.scheme,
            parsed.netloc,
            parsed.path,
            parsed.params,
            filteredQuery,
            parsed.fragment
        ))
        return normalized

    def _calculateRelevanceScore(self, result: WebSearchResult, queryWords: set) -> float:
        """
        Calculate relevance score for a search result.
        Higher score means more relevant to the query.
        """
        score = 0.0
        # Title relevance (most important)
        titleWords = set(result.title.lower().split())
        titleMatches = len(queryWords.intersection(titleWords))
        score += titleMatches * 3.0  # Weight title matches heavily
        # URL relevance
        urlWords = set(result.url.lower().split('/'))
        urlMatches = len(queryWords.intersection(urlWords))
        score += urlMatches * 1.5
        # Content relevance (if available)
        if result.rawContent:
            contentWords = set(result.rawContent.lower().split())
            contentMatches = len(queryWords.intersection(contentWords))
            score += contentMatches * 0.1  # Lower weight for content matches
        # Domain authority bonus (simple heuristic)
        # Guard against scheme-less URLs where split('/') has fewer than 3 parts
        urlParts = result.url.split('/')
        domain = urlParts[2] if len(urlParts) > 2 else result.url
        if any(authDomain in domain.lower() for authDomain in ['wikipedia.org', 'github.com', 'stackoverflow.com', 'reddit.com', 'medium.com']):
            score += 1.0
        # Penalty for very long URLs (often less relevant)
        if len(result.url) > 100:
            score -= 0.5
        return score

    def _intelligentUrlFiltering(self, searchResults: List[WebSearchResult], query: str, maxResults: int) -> List[WebSearchResult]:
        """
        Intelligent URL filtering with de-duplication and relevance scoring.

        Args:
            searchResults: Raw search results from Tavily
            query: Original search query for relevance scoring
            maxResults: Maximum number of results to return

        Returns:
            Filtered and deduplicated list of search results
        """
        if not searchResults:
            return []
        # Step 1: Basic de-duplication by URL
        seenUrls = set()
        uniqueResults = []
        for result in searchResults:
            # Normalize URL for better deduplication
            normalizedUrl = self._normalizeUrl(result.url)
            if normalizedUrl not in seenUrls:
                seenUrls.add(normalizedUrl)
                uniqueResults.append(result)
        logger.info(f"After basic deduplication: {len(uniqueResults)} unique URLs from {len(searchResults)} original")
        # Step 2: Relevance scoring and filtering
        scoredResults = []
        queryWords = set(query.lower().split())
        for result in uniqueResults:
            score = self._calculateRelevanceScore(result, queryWords)
            scoredResults.append((score, result))
        # Step 3: Sort by relevance score (higher is better)
        scoredResults.sort(key=lambda x: x[0], reverse=True)
        # Step 4: Take top results
        filteredResults = [result for score, result in scoredResults[:maxResults]]
        logger.info(f"After intelligent filtering: {len(filteredResults)} results selected from {len(uniqueResults)} unique")
        return filteredResults

    @classmethod
    async def create(cls):
        """
        Async factory: build a fully configured connector from APP_CONFIG.

        Raises:
            ValueError: If the Tavily API key is not configured.
        """
        apiKey = APP_CONFIG.get("Connector_AiTavily_API_SECRET")
        if not apiKey:
            raise ValueError("Tavily API key not configured. Please set Connector_AiTavily_API_SECRET in config.ini")
        # Load and cache web crawl related configuration
        crawlTimeout = int(APP_CONFIG.get("Web_Crawl_TIMEOUT", "30"))
        crawlMaxRetries = int(APP_CONFIG.get("Web_Crawl_MAX_RETRIES", "3"))
        crawlRetryDelay = int(APP_CONFIG.get("Web_Crawl_RETRY_DELAY", "2"))
        # NOTE: __init__ accepts these keyword arguments (previously it took
        # none, which made this call raise TypeError).
        return cls(
            client=AsyncTavilyClient(api_key=apiKey),
            crawlTimeout=crawlTimeout,
            crawlMaxRetries=crawlMaxRetries,
            crawlRetryDelay=crawlRetryDelay,
            webSearchMinResults=int(APP_CONFIG.get("Web_Search_MIN_RESULTS", "1")),
            webSearchMaxResults=int(APP_CONFIG.get("Web_Search_MAX_RESULTS", "20")),
        )

    # Standardized method using AiModelCall/AiModelResponse pattern
    def _cleanUrl(self, url: str) -> str:
        """Clean URL by removing extra text that might be appended."""
        # Extract just the URL part, removing any extra text after it
        urlMatch = re.match(r'(https?://[^\s,]+)', url)
        if urlMatch:
            return urlMatch.group(1)
        return url

    async def _search(
        self,
        query: str,
        maxResults: int,
        searchDepth: str | None = None,
        timeRange: str | None = None,
        topic: str | None = None,
        includeDomains: list[str] | None = None,
        excludeDomains: list[str] | None = None,
        country: str | None = None,
        includeAnswer: str | None = None,
        includeRawContent: str | None = None,
    ) -> list[WebSearchResult]:
        """Calls the Tavily API to perform a web search."""
        # Make sure maxResults is within the allowed range (use cached values)
        minResults = self.webSearchMinResults
        maxAllowedResults = self.webSearchMaxResults
        if maxResults < minResults or maxResults > maxAllowedResults:
            raise ValueError(f"maxResults must be between {minResults} and {maxAllowedResults}")
        # Perform actual API call
        # Build kwargs only for provided options to avoid API rejections
        kwargs: dict = {"query": query, "max_results": maxResults}
        if searchDepth is not None:
            kwargs["search_depth"] = searchDepth
        if timeRange is not None:
            kwargs["time_range"] = timeRange
        if topic is not None:
            kwargs["topic"] = topic
        if includeDomains is not None and len(includeDomains) > 0:
            kwargs["include_domains"] = includeDomains
        if excludeDomains is not None:
            kwargs["exclude_domains"] = excludeDomains
        if country is not None:
            kwargs["country"] = country
        if includeAnswer is not None:
            kwargs["include_answer"] = includeAnswer
        if includeRawContent is not None:
            kwargs["include_raw_content"] = includeRawContent
        # Log the final API call parameters for comparison
        logger.info(f"Tavily API call parameters: {kwargs}")
        # Ensure client is initialized
        if self.client is None:
            self._initializeClient()
            if self.client is None:
                raise ValueError("Tavily client not initialized. Please check API key configuration.")
        response = await self.client.search(**kwargs)
        # Return all results without score filtering
        # Tavily's scoring is already applied by the API
        logger.info(f"Tavily returned {len(response.get('results', []))} results")
        return [
            WebSearchResult(
                title=result["title"],
                url=self._cleanUrl(result["url"]),
                rawContent=result.get("raw_content")
            )
            for result in response["results"]
        ]

    async def _crawl(
        self,
        url: str,
        instructions: str | None = None,
        limit: int = 20,
        maxDepth: int = 2,
        maxBreadth: int = 40,
    ) -> list[WebCrawlResult]:
        """Calls the Tavily API to crawl ONE URL with link following and retry logic."""
        maxRetries = self.crawlMaxRetries
        retryDelay = self.crawlRetryDelay
        timeout = self.crawlTimeout
        logger.debug(f"Starting crawl of URL: {url}")
        logger.debug(f"Crawl settings: instructions={instructions}, limit={limit}, maxDepth={maxDepth}, maxBreadth={maxBreadth}, timeout={timeout}s")
        for attempt in range(maxRetries + 1):
            try:
                logger.debug(f"Crawl attempt {attempt + 1}/{maxRetries + 1}")
                # Ensure client is initialized
                if self.client is None:
                    self._initializeClient()
                    if self.client is None:
                        raise ValueError("Tavily client not initialized. Please check API key configuration.")
                logger.debug(f"Crawling URL: {url}")
                # Build kwargs for crawl
                kwargsCrawl: dict = {"url": url}
                if instructions:
                    kwargsCrawl["instructions"] = instructions
                if limit:
                    kwargsCrawl["limit"] = limit
                if maxDepth:
                    kwargsCrawl["max_depth"] = maxDepth
                if maxBreadth:
                    kwargsCrawl["max_breadth"] = maxBreadth
                logger.debug(f"Sending request to Tavily with kwargs: {kwargsCrawl}")
                response = await asyncio.wait_for(
                    self.client.crawl(**kwargsCrawl),
                    timeout=timeout
                )
                logger.debug(f"Tavily response received: {type(response)}")
                # Parse response - could be dict with results or list
                if isinstance(response, dict) and "results" in response:
                    pageResults = response["results"]
                elif isinstance(response, list):
                    pageResults = response
                else:
                    logger.warning(f"Unexpected response format: {type(response)}")
                    pageResults = []
                logger.debug(f"Got {len(pageResults)} pages from crawl")
                # Convert to WebCrawlResult format
                results = []
                for result in pageResults:
                    results.append(WebCrawlResult(
                        url=result.get("url", url),
                        content=result.get("raw_content", result.get("content", "")),
                        title=result.get("title", "")
                    ))
                logger.debug(f"Crawl successful: extracted {len(results)} pages from URL")
                return results
            except asyncio.TimeoutError:
                logger.warning(f"Crawl attempt {attempt + 1} timed out after {timeout} seconds for URL: {url}")
                if attempt < maxRetries:
                    logger.info(f"Retrying in {retryDelay} seconds...")
                    await asyncio.sleep(retryDelay)
                else:
                    raise Exception(f"Crawl failed after {maxRetries + 1} attempts due to timeout")
            except Exception as e:
                logger.warning(f"Crawl attempt {attempt + 1} failed for URL {url}: {str(e)}")
                logger.debug(f"Full error details: {type(e).__name__}: {str(e)}")
                # Check if it's a validation error and log more details
                if "validation" in str(e).lower():
                    logger.debug(f"URL validation failed. Checking URL format:")
                    logger.debug(f"  URL: '{url}' (length: {len(url)})")
                    # Check for common URL issues
                    if ' ' in url:
                        logger.debug(f"  WARNING: URL contains spaces!")
                    if not url.startswith(('http://', 'https://')):
                        logger.debug(f"  WARNING: URL doesn't start with http/https!")
                    if len(url) > 2000:
                        logger.debug(f"  WARNING: URL is very long ({len(url)} chars)")
                if attempt < maxRetries:
                    logger.info(f"Retrying in {retryDelay} seconds...")
                    await asyncio.sleep(retryDelay)
                else:
                    raise Exception(f"Crawl failed after {maxRetries + 1} attempts: {str(e)}")

    async def _routeWebOperation(self, modelCall: AiModelCall) -> "AiModelResponse":
        """
        Route web operation based on operation type.

        Args:
            modelCall: AiModelCall with messages and options

        Returns:
            AiModelResponse based on operation type
        """
        operationType = modelCall.options.operationType
        if operationType == OperationTypeEnum.WEB_SEARCH:
            return await self.webSearch(modelCall)
        elif operationType == OperationTypeEnum.WEB_CRAWL:
            return await self.webCrawl(modelCall)
        else:
            # Unsupported operation type
            return AiModelResponse(
                content="",
                success=False,
                error=f"Unsupported operation type: {operationType}"
            )

    async def webSearch(self, modelCall: AiModelCall) -> "AiModelResponse":
        """
        WEB_SEARCH operation - returns list of URLs using Tavily search.

        Args:
            modelCall: AiModelCall with AiCallPromptWebSearch as prompt

        Returns:
            AiModelResponse with JSON list of URLs
        """
        try:
            # Extract parameters - find user message (not system message)
            promptContent = ""
            if modelCall.messages:
                for msg in modelCall.messages:
                    if msg.get("role") == "user":
                        promptContent = msg.get("content", "")
                        break
                # Fallback to first message if no user message found
                if not promptContent and len(modelCall.messages) > 0:
                    promptContent = modelCall.messages[0].get("content", "")
            if not promptContent or not promptContent.strip():
                raise ValueError("Empty prompt content received for web search")
            try:
                promptData = json.loads(promptContent)
            except json.JSONDecodeError as e:
                logger.error(f"Failed to parse prompt content as JSON: {promptContent[:200]}")
                raise ValueError(f"Invalid JSON in prompt content: {str(e)}")
            # Create Pydantic model
            webSearchPrompt = AiCallPromptWebSearch(**promptData)
            # Convert ISO country code to country name for Tavily
            countryName = webSearchPrompt.country
            if countryName:
                countryName = self._convertIsoCodeToCountryName(countryName)
            # Perform search - use exact parameters from prompt
            # NOTE: timeRange parameter causes generic results, so we don't use it
            searchResults = await self._search(
                query=webSearchPrompt.instruction,
                maxResults=webSearchPrompt.maxNumberPages,
                timeRange=None,  # Not used - causes generic results
                country=countryName,
                includeAnswer="basic",
                includeRawContent="text"
            )
            # Extract URLs from results
            urls = [result.url for result in searchResults]
            # Return as JSON array
            return AiModelResponse(
                content=json.dumps(urls, indent=2),
                success=True,
                metadata={"total_urls": len(urls), "operation": "WEB_SEARCH"}
            )
        except Exception as e:
            logger.error(f"Error in Tavily web search: {str(e)}")
            return AiModelResponse(
                content="[]",
                success=False,
                error=str(e)
            )

    async def webCrawl(self, modelCall: AiModelCall) -> "AiModelResponse":
        """
        WEB_CRAWL operation - crawls one URL using Tavily with link following.

        Args:
            modelCall: AiModelCall with AiCallPromptWebCrawl as prompt

        Returns:
            AiModelResponse with crawl results as JSON (may include multiple pages)
        """
        try:
            # Extract parameters - find user message (not system message)
            promptContent = ""
            if modelCall.messages:
                for msg in modelCall.messages:
                    if msg.get("role") == "user":
                        promptContent = msg.get("content", "")
                        break
                # Fallback to first message if no user message found
                if not promptContent and len(modelCall.messages) > 0:
                    promptContent = modelCall.messages[0].get("content", "")
            if not promptContent or not promptContent.strip():
                raise ValueError("Empty prompt content received for web crawl")
            try:
                promptData = json.loads(promptContent)
            except json.JSONDecodeError as e:
                logger.error(f"Failed to parse prompt content as JSON: {promptContent[:200]}")
                raise ValueError(f"Invalid JSON in prompt content: {str(e)}")
            # Create Pydantic model
            webCrawlPrompt = AiCallPromptWebCrawl(**promptData)
            # Perform crawl for ONE URL with link following
            # Use maxWidth as limit, maxDepth as maxDepth, and calculate maxBreadth
            crawlResults = await self._crawl(
                url=webCrawlPrompt.url,
                instructions=webCrawlPrompt.instruction,
                limit=webCrawlPrompt.maxWidth or 20,  # maxWidth controls number of pages
                maxDepth=webCrawlPrompt.maxDepth or 2,
                maxBreadth=webCrawlPrompt.maxWidth or 40  # Use same as limit for breadth
            )
            # If we got multiple pages from the crawl, we need to format them differently
            # Return the first result for backwards compatibility, but include total page count
            if crawlResults and len(crawlResults) > 0:
                # Get all pages content
                allContent = ""
                for i, result in enumerate(crawlResults, 1):
                    pageHeader = f"\n{'='*60}\nPAGE {i}: {result.url}\n{'='*60}\n"
                    if result.title:
                        allContent += f"{pageHeader}Title: {result.title}\n\n"
                    allContent += f"{result.content}\n"
                resultData = {
                    "url": webCrawlPrompt.url,
                    "title": crawlResults[0].title if crawlResults[0].title else "Content",
                    "content": allContent,
                    "pagesCrawled": len(crawlResults),
                    "pageUrls": [result.url for result in crawlResults]
                }
            else:
                resultData = {"url": webCrawlPrompt.url, "title": "", "content": "", "error": "No content extracted", "pagesCrawled": 0}
            # Return as JSON - same format as Perplexity but with multiple pages content
            return AiModelResponse(
                content=json.dumps(resultData, indent=2),
                success=True,
                metadata={"operation": "WEB_CRAWL", "url": webCrawlPrompt.url, "pagesCrawled": len(crawlResults) if crawlResults else 0}
            )
        except Exception as e:
            logger.error(f"Error in Tavily web crawl: {str(e)}")
            errorResult = {"error": str(e), "url": webCrawlPrompt.url if 'webCrawlPrompt' in locals() else ""}
            return AiModelResponse(
                content=json.dumps(errorResult, indent=2),
                success=False,
                error=str(e)
            )