gateway/modules/aicore/aicorePluginTavily.py


"""Tavily web search class.
"""
import logging
import asyncio
import json
import re
import urllib.parse
from dataclasses import dataclass
from typing import Optional, List
from tavily import AsyncTavilyClient
from modules.shared.configuration import APP_CONFIG
from modules.aicore.aicoreBase import BaseConnectorAi
from modules.datamodels.datamodelAi import AiModel, PriorityEnum, ProcessingModeEnum, OperationTypeEnum, AiModelCall, AiModelResponse, createOperationTypeRatings, AiCallPromptWebSearch, AiCallPromptWebCrawl
from modules.datamodels.datamodelTools import CountryCodes
logger = logging.getLogger(__name__)
@dataclass
class WebSearchResult:
title: str
url: str
rawContent: Optional[str] = None
@dataclass
class WebCrawlResult:
url: str
content: str
title: Optional[str] = None
class ConnectorWeb(BaseConnectorAi):
"""Tavily web search connector."""
def __init__(self):
super().__init__()
self.client: Optional[AsyncTavilyClient] = None
# Cached settings loaded at initialization time
self.crawlTimeout: int = 30
self.crawlMaxRetries: int = 3
self.crawlRetryDelay: int = 2
# Cached web search constraints (camelCase per project style)
self.webSearchMinResults: int = 1
self.webSearchMaxResults: int = 20
# Initialize client if API key is available
self._initializeClient()
def _initializeClient(self):
"""Initialize the Tavily client if API key is available."""
try:
apiKey = APP_CONFIG.get("Connector_AiTavily_API_SECRET")
if apiKey:
self.client = AsyncTavilyClient(api_key=apiKey)
logger.info("Tavily client initialized successfully")
else:
logger.warning("Tavily API key not found, client not initialized")
except Exception as e:
logger.error(f"Failed to initialize Tavily client: {str(e)}")
def getConnectorType(self) -> str:
"""Get the connector type identifier."""
return "tavily"
def _convertIsoCodeToCountryName(self, isoCode: str) -> str:
"""
Convert ISO-2 country code to Tavily country name.
Uses centralized CountryCodes mapping.
"""
return CountryCodes.getForTavily(isoCode)
def _extractUrlsFromPrompt(self, prompt: str) -> List[str]:
"""Extract URLs from a text prompt using regex."""
if not prompt:
return []
        # URL regex pattern - matches http/https URLs (hyphens allowed in path, query, and fragment)
        urlPattern = r'https?://(?:[-\w.])+(?:[:\d]+)?(?:/(?:[-\w/.])*(?:\?(?:[-\w&=%.])*)?(?:#(?:[-\w.])*)?)?'
urls = re.findall(urlPattern, prompt)
# Remove duplicates while preserving order
seen = set()
uniqueUrls = []
for url in urls:
if url not in seen:
seen.add(url)
uniqueUrls.append(url)
return uniqueUrls
def _normalizeUrl(self, url: str) -> str:
"""
Normalize URL for better deduplication.
Removes common variations that represent the same content.
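        Examples (illustrative):
            "https://example.com/page?utm_source=x&id=5" -> "https://example.com/page?id=5"
            "https://example.com/" -> "https://example.com"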
"""
if not url:
return url
# Remove trailing slashes
url = url.rstrip('/')
# Remove common query parameters that don't affect content
parsed = urllib.parse.urlparse(url)
# Remove common tracking parameters
queryParams = urllib.parse.parse_qs(parsed.query)
filteredParams = {}
for key, values in queryParams.items():
# Keep important parameters, remove tracking ones
if key.lower() not in ['utm_source', 'utm_medium', 'utm_campaign', 'utm_term', 'utm_content',
'fbclid', 'gclid', 'ref', 'source', 'campaign']:
filteredParams[key] = values
# Rebuild query string
filteredQuery = urllib.parse.urlencode(filteredParams, doseq=True)
# Reconstruct URL
normalized = urllib.parse.urlunparse((
parsed.scheme,
parsed.netloc,
parsed.path,
parsed.params,
filteredQuery,
parsed.fragment
))
return normalized
def _calculateRelevanceScore(self, result: WebSearchResult, queryWords: set) -> float:
"""
Calculate relevance score for a search result.
Higher score means more relevant to the query.
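        Worked example (illustrative): the query "python asyncio tutorial" against a
        result titled "Python Asyncio Tutorial" at https://docs.python.org/3/library/asyncio.html
        scores 3 title matches * 3.0 = 9.0 (no URL-segment or content matches, no
        domain bonus, no long-URL penalty).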
"""
score = 0.0
# Title relevance (most important)
titleWords = set(result.title.lower().split())
titleMatches = len(queryWords.intersection(titleWords))
score += titleMatches * 3.0 # Weight title matches heavily
# URL relevance
urlWords = set(result.url.lower().split('/'))
urlMatches = len(queryWords.intersection(urlWords))
score += urlMatches * 1.5
        # Content relevance (if available); rawContent is a declared dataclass field, so a truthiness check suffices
        if result.rawContent:
contentWords = set(result.rawContent.lower().split())
contentMatches = len(queryWords.intersection(contentWords))
score += contentMatches * 0.1 # Lower weight for content matches
        # Domain authority bonus (simple heuristic); urlparse avoids IndexError on schemeless URLs
        domain = urllib.parse.urlparse(result.url).netloc
if any(authDomain in domain.lower() for authDomain in
['wikipedia.org', 'github.com', 'stackoverflow.com', 'reddit.com', 'medium.com']):
score += 1.0
# Penalty for very long URLs (often less relevant)
if len(result.url) > 100:
score -= 0.5
return score
def _intelligentUrlFiltering(self, searchResults: List[WebSearchResult], query: str, maxResults: int) -> List[WebSearchResult]:
"""
Intelligent URL filtering with de-duplication and relevance scoring.
Args:
searchResults: Raw search results from Tavily
query: Original search query for relevance scoring
maxResults: Maximum number of results to return
Returns:
Filtered and deduplicated list of search results
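        Example (illustrative numbers): 20 raw results -> 14 unique after URL
        normalization -> top 5 by relevance score when maxResults=5.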
"""
if not searchResults:
return []
# Step 1: Basic de-duplication by URL
seenUrls = set()
uniqueResults = []
for result in searchResults:
# Normalize URL for better deduplication
normalizedUrl = self._normalizeUrl(result.url)
if normalizedUrl not in seenUrls:
seenUrls.add(normalizedUrl)
uniqueResults.append(result)
logger.info(f"After basic deduplication: {len(uniqueResults)} unique URLs from {len(searchResults)} original")
# Step 2: Relevance scoring and filtering
scoredResults = []
queryWords = set(query.lower().split())
for result in uniqueResults:
score = self._calculateRelevanceScore(result, queryWords)
scoredResults.append((score, result))
# Step 3: Sort by relevance score (higher is better)
scoredResults.sort(key=lambda x: x[0], reverse=True)
# Step 4: Take top results
filteredResults = [result for score, result in scoredResults[:maxResults]]
logger.info(f"After intelligent filtering: {len(filteredResults)} results selected from {len(uniqueResults)} unique")
return filteredResults
def getModels(self) -> List[AiModel]:
"""Get all available Tavily models."""
return [
AiModel(
name="tavily-search",
displayName="Tavily Search & Research",
connectorType="tavily",
apiUrl="https://api.tavily.com",
temperature=0.0, # Web search doesn't use temperature
maxTokens=0, # Web search doesn't use tokens
contextLength=0,
costPer1kTokensInput=0.0,
costPer1kTokensOutput=0.0,
speedRating=8, # Good speed for search and extract
qualityRating=9, # Excellent quality for web research
# capabilities removed (not used in business logic)
functionCall=self._routeWebOperation,
priority=PriorityEnum.BALANCED,
processingMode=ProcessingModeEnum.BASIC,
operationTypes=createOperationTypeRatings(
(OperationTypeEnum.WEB_SEARCH, 9),
(OperationTypeEnum.WEB_CRAWL, 8)
),
version="tavily-search",
                calculatePriceUsd=lambda processingTime, bytesSent, bytesReceived: 0.008  # Flat rate of $0.008 per call, independent of payload size
)
]
@classmethod
async def create(cls):
apiKey = APP_CONFIG.get("Connector_AiTavily_API_SECRET")
if not apiKey:
raise ValueError("Tavily API key not configured. Please set Connector_AiTavily_API_SECRET in config.ini")
# Load and cache web crawl related configuration
crawlTimeout = int(APP_CONFIG.get("Web_Crawl_TIMEOUT", "30"))
crawlMaxRetries = int(APP_CONFIG.get("Web_Crawl_MAX_RETRIES", "3"))
crawlRetryDelay = int(APP_CONFIG.get("Web_Crawl_RETRY_DELAY", "2"))
        # __init__ takes no parameters, so configure the instance after construction
        instance = cls()
        instance.client = AsyncTavilyClient(api_key=apiKey)
        instance.crawlTimeout = crawlTimeout
        instance.crawlMaxRetries = crawlMaxRetries
        instance.crawlRetryDelay = crawlRetryDelay
        instance.webSearchMinResults = int(APP_CONFIG.get("Web_Search_MIN_RESULTS", "1"))
        instance.webSearchMaxResults = int(APP_CONFIG.get("Web_Search_MAX_RESULTS", "20"))
        return instance
def _cleanUrl(self, url: str) -> str:
"""Clean URL by removing extra text that might be appended."""
# Extract just the URL part, removing any extra text after it
urlMatch = re.match(r'(https?://[^\s,]+)', url)
if urlMatch:
return urlMatch.group(1)
return url
async def _search(
self,
query: str,
maxResults: int,
searchDepth: str | None = None,
timeRange: str | None = None,
topic: str | None = None,
includeDomains: list[str] | None = None,
excludeDomains: list[str] | None = None,
language: str | None = None,
country: str | None = None,
includeAnswer: bool | None = None,
includeRawContent: bool | None = None,
) -> list[WebSearchResult]:
"""Calls the Tavily API to perform a web search."""
# Make sure maxResults is within the allowed range (use cached values)
minResults = self.webSearchMinResults
maxAllowedResults = self.webSearchMaxResults
if maxResults < minResults or maxResults > maxAllowedResults:
raise ValueError(f"maxResults must be between {minResults} and {maxAllowedResults}")
# Perform actual API call
# Build kwargs only for provided options to avoid API rejections
kwargs: dict = {"query": query, "max_results": maxResults}
if searchDepth is not None:
kwargs["search_depth"] = searchDepth
if timeRange is not None:
kwargs["time_range"] = timeRange
if topic is not None:
kwargs["topic"] = topic
        if includeDomains:
            kwargs["include_domains"] = includeDomains
        if excludeDomains:
            kwargs["exclude_domains"] = excludeDomains
if language is not None:
kwargs["language"] = language
if country is not None:
kwargs["country"] = country
if includeAnswer is not None:
kwargs["include_answer"] = includeAnswer
if includeRawContent is not None:
kwargs["include_raw_content"] = includeRawContent
logger.debug(f"Tavily.search kwargs: {kwargs}")
# Ensure client is initialized
if self.client is None:
self._initializeClient()
if self.client is None:
raise ValueError("Tavily client not initialized. Please check API key configuration.")
response = await self.client.search(**kwargs)
return [
WebSearchResult(
title=result["title"],
url=self._cleanUrl(result["url"]),
rawContent=result.get("raw_content")
)
for result in response["results"]
]
async def _crawl(
self,
        urls: list[str],
extractDepth: str | None = None,
format: str | None = None,
) -> list[WebCrawlResult]:
"""Calls the Tavily API to extract text content from URLs with retry logic."""
maxRetries = self.crawlMaxRetries
retryDelay = self.crawlRetryDelay
timeout = self.crawlTimeout
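        # With the default settings (maxRetries=3, retryDelay=2), up to 4 attempts are
        # made with a fixed 2 s pause between them (constant delay, not exponential backoff).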
logger.debug(f"Starting crawl of {len(urls)} URLs: {urls}")
logger.debug(f"Crawl settings: extractDepth={extractDepth}, format={format}, timeout={timeout}s")
for attempt in range(maxRetries + 1):
try:
logger.debug(f"Crawl attempt {attempt + 1}/{maxRetries + 1}")
# Use asyncio.wait_for for timeout
# Build kwargs for extract
kwargsExtract: dict = {"urls": urls}
kwargsExtract["extract_depth"] = extractDepth or "advanced"
kwargsExtract["format"] = format or "markdown" # Use markdown to get HTML structure
logger.debug(f"Sending request to Tavily with kwargs: {kwargsExtract}")
# Ensure client is initialized
if self.client is None:
self._initializeClient()
if self.client is None:
raise ValueError("Tavily client not initialized. Please check API key configuration.")
response = await asyncio.wait_for(
self.client.extract(**kwargsExtract),
timeout=timeout
)
logger.debug(f"Tavily response received: {list(response.keys())}")
# Debug: Log what Tavily actually returns
if "results" in response and response["results"]:
logger.debug(f"Tavily returned {len(response['results'])} results")
logger.debug(f"First result keys: {list(response['results'][0].keys())}")
logger.debug(f"First result has raw_content: {'raw_content' in response['results'][0]}")
# Log each result
for i, result in enumerate(response["results"]):
logger.debug(f"Result {i+1}: URL={result.get('url', 'N/A')}, content_length={len(result.get('raw_content', result.get('content', '')))}")
else:
logger.warning(f"Tavily returned no results in response: {response}")
results = [
WebCrawlResult(
url=result["url"],
content=result.get("raw_content", result.get("content", "")), # Try raw_content first, fallback to content
title=result.get("title", "") # Extract title if available
)
for result in response["results"]
]
logger.debug(f"Crawl successful: extracted {len(results)} results")
return results
except asyncio.TimeoutError:
logger.warning(f"Crawl attempt {attempt + 1} timed out after {timeout} seconds for URLs: {urls}")
if attempt < maxRetries:
logger.info(f"Retrying in {retryDelay} seconds...")
await asyncio.sleep(retryDelay)
else:
raise Exception(f"Crawl failed after {maxRetries + 1} attempts due to timeout")
except Exception as e:
logger.warning(f"Crawl attempt {attempt + 1} failed for URLs {urls}: {str(e)}")
logger.debug(f"Full error details: {type(e).__name__}: {str(e)}")
# Check if it's a validation error and log more details
if "validation" in str(e).lower():
logger.debug(f"URL validation failed. Checking URL format:")
for i, url in enumerate(urls):
logger.debug(f" URL {i+1}: '{url}' (length: {len(url)})")
# Check for common URL issues
if ' ' in url:
logger.debug(f" WARNING: URL contains spaces!")
if not url.startswith(('http://', 'https://')):
logger.debug(f" WARNING: URL doesn't start with http/https!")
if len(url) > 2000:
logger.debug(f" WARNING: URL is very long ({len(url)} chars)")
if attempt < maxRetries:
logger.info(f"Retrying in {retryDelay} seconds...")
await asyncio.sleep(retryDelay)
else:
raise Exception(f"Crawl failed after {maxRetries + 1} attempts: {str(e)}")
async def _routeWebOperation(self, modelCall: AiModelCall) -> "AiModelResponse":
"""
Route web operation based on operation type.
Args:
modelCall: AiModelCall with messages and options
Returns:
AiModelResponse based on operation type
"""
operationType = modelCall.options.operationType
if operationType == OperationTypeEnum.WEB_SEARCH:
return await self.webSearch(modelCall)
elif operationType == OperationTypeEnum.WEB_CRAWL:
return await self.webCrawl(modelCall)
else:
# Unsupported operation type
return AiModelResponse(
content="",
success=False,
error=f"Unsupported operation type: {operationType}"
)
async def webSearch(self, modelCall: AiModelCall) -> "AiModelResponse":
"""
WEB_SEARCH operation - returns list of URLs using Tavily search.
Args:
modelCall: AiModelCall with AiCallPromptWebSearch as prompt
Returns:
AiModelResponse with JSON list of URLs
"""
try:
# Extract parameters
promptContent = modelCall.messages[0]["content"] if modelCall.messages else ""
            promptData = json.loads(promptContent)
# Create Pydantic model
webSearchPrompt = AiCallPromptWebSearch(**promptData)
# Convert ISO country code to country name for Tavily
countryName = webSearchPrompt.country
if countryName:
countryName = self._convertIsoCodeToCountryName(countryName)
# Perform search
searchResults = await self._search(
query=webSearchPrompt.instruction,
maxResults=webSearchPrompt.maxNumberPages,
timeRange=webSearchPrompt.timeRange,
country=countryName,
language=webSearchPrompt.language,
includeAnswer=False,
includeRawContent=False
)
# Extract URLs from results
urls = [result.url for result in searchResults]
# Return as JSON array
return AiModelResponse(
content=json.dumps(urls, indent=2),
success=True,
metadata={"total_urls": len(urls), "operation": "WEB_SEARCH"}
)
except Exception as e:
logger.error(f"Error in Tavily web search: {str(e)}")
return AiModelResponse(
content="[]",
success=False,
error=str(e)
)
async def webCrawl(self, modelCall: AiModelCall) -> "AiModelResponse":
"""
WEB_CRAWL operation - crawls one URL using Tavily.
Args:
modelCall: AiModelCall with AiCallPromptWebCrawl as prompt
Returns:
AiModelResponse with crawl results as JSON
"""
try:
# Extract parameters
promptContent = modelCall.messages[0]["content"] if modelCall.messages else ""
promptData = json.loads(promptContent)
# Create Pydantic model
webCrawlPrompt = AiCallPromptWebCrawl(**promptData)
# Perform crawl for ONE URL
# Note: _crawl expects a list, so we wrap the single URL in a list
crawlResults = await self._crawl(
urls=[webCrawlPrompt.url],
extractDepth="advanced" if webCrawlPrompt.maxDepth > 2 else "basic",
format="markdown"
)
# Format result for single URL - consistent with Perplexity format
if crawlResults and len(crawlResults) > 0:
firstResult = crawlResults[0]
resultData = {
"url": firstResult.url,
"title": firstResult.title if firstResult.title else "Content",
"content": firstResult.content
}
else:
resultData = {"url": webCrawlPrompt.url, "title": "", "content": "", "error": "No content extracted"}
# Return as JSON - same format as Perplexity
return AiModelResponse(
content=json.dumps(resultData, indent=2),
success=True,
metadata={"operation": "WEB_CRAWL", "url": webCrawlPrompt.url}
)
except Exception as e:
logger.error(f"Error in Tavily web crawl: {str(e)}")
errorResult = {"error": str(e), "url": ""}
return AiModelResponse(
content=json.dumps(errorResult, indent=2),
success=False,
error=str(e)
)
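

# Minimal manual smoke test - a sketch, assuming Connector_AiTavily_API_SECRET is
# configured; the gateway normally invokes this connector via _routeWebOperation.
if __name__ == "__main__":
    async def _demo():
        connector = await ConnectorWeb.create()
        results = await connector._search(query="tavily api", maxResults=3)
        for r in results:
            print(f"{r.title} -> {r.url}")
    asyncio.run(_demo())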