# gateway/modules/aicore/aicorePluginTavily.py
"""Tavily web search class.
"""
import logging
import asyncio
import json
import re
import time
import urllib.parse
from dataclasses import dataclass
from typing import Optional, List, Dict
from tavily import AsyncTavilyClient
from modules.shared.configuration import APP_CONFIG
from modules.aicore.aicoreBase import BaseConnectorAi
from modules.datamodels.datamodelAi import AiModel, PriorityEnum, ProcessingModeEnum, OperationTypeEnum, AiModelResponse, createOperationTypeRatings
logger = logging.getLogger(__name__)
@dataclass
class WebSearchResult:
title: str
url: str
raw_content: Optional[str] = None
@dataclass
class WebCrawlResult:
url: str
content: str
@dataclass
class WebResearchRequest:
"""Ultra-simplified web research request"""
user_prompt: str
urls: Optional[List[str]] = None
max_results: int = 5
max_pages: int = 10
search_depth: str = "basic"
extract_depth: str = "advanced"
format: str = "markdown"
country: Optional[str] = None
time_range: Optional[str] = None
topic: Optional[str] = None
language: Optional[str] = None
@dataclass
class WebResearchResult:
"""Ultra-simplified web research result - just success/error + documents"""
success: bool = True
error: Optional[str] = None
    documents: Optional[List[dict]] = None  # plain dicts instead of ActionDocument
def __post_init__(self):
if self.documents is None:
self.documents = []
class ConnectorWeb(BaseConnectorAi):
"""Tavily web search connector."""
def __init__(self):
super().__init__()
self.client: Optional[AsyncTavilyClient] = None
# Cached settings loaded at initialization time
self.crawlTimeout: int = 30
self.crawlMaxRetries: int = 3
self.crawlRetryDelay: int = 2
# Cached web search constraints (camelCase per project style)
self.webSearchMinResults: int = 1
self.webSearchMaxResults: int = 20
# Initialize client if API key is available
self._initializeClient()
def _initializeClient(self):
"""Initialize the Tavily client if API key is available."""
try:
api_key = APP_CONFIG.get("Connector_AiTavily_API_SECRET")
if api_key:
self.client = AsyncTavilyClient(api_key=api_key)
logger.info("Tavily client initialized successfully")
else:
logger.warning("Tavily API key not found, client not initialized")
except Exception as e:
logger.error(f"Failed to initialize Tavily client: {str(e)}")
def getConnectorType(self) -> str:
"""Get the connector type identifier."""
return "tavily"
def _extractUrlsFromPrompt(self, prompt: str) -> List[str]:
"""Extract URLs from a text prompt using regex."""
if not prompt:
return []
# URL regex pattern - matches http/https URLs
url_pattern = r'https?://(?:[-\w.])+(?:[:\d]+)?(?:/(?:[\w/_.])*(?:\?(?:[\w&=%.])*)?(?:#(?:[\w.])*)?)?'
urls = re.findall(url_pattern, prompt)
# Remove duplicates while preserving order
seen = set()
unique_urls = []
for url in urls:
if url not in seen:
seen.add(url)
unique_urls.append(url)
return unique_urls
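    # Illustrative example (not executed): given the prompt
    #   "compare https://docs.python.org/3/ and https://docs.python.org/3/ plus https://peps.python.org/"
    # this returns the two distinct URLs in first-seen order:
    #   ["https://docs.python.org/3/", "https://peps.python.org/"]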
def _intelligentUrlFiltering(self, searchResults: List[WebSearchResult], query: str, maxResults: int) -> List[WebSearchResult]:
"""
Intelligent URL filtering with de-duplication and relevance scoring.
Args:
searchResults: Raw search results from Tavily
query: Original search query for relevance scoring
maxResults: Maximum number of results to return
Returns:
Filtered and deduplicated list of search results
"""
if not searchResults:
return []
# Step 1: Basic de-duplication by URL
seenUrls = set()
uniqueResults = []
for result in searchResults:
# Normalize URL for better deduplication
normalizedUrl = self._normalizeUrl(result.url)
if normalizedUrl not in seenUrls:
seenUrls.add(normalizedUrl)
uniqueResults.append(result)
logger.info(f"After basic deduplication: {len(uniqueResults)} unique URLs from {len(searchResults)} original")
# Step 2: Relevance scoring and filtering
scoredResults = []
queryWords = set(query.lower().split())
for result in uniqueResults:
score = self._calculateRelevanceScore(result, queryWords)
scoredResults.append((score, result))
# Step 3: Sort by relevance score (higher is better)
scoredResults.sort(key=lambda x: x[0], reverse=True)
# Step 4: Take top results
filteredResults = [result for score, result in scoredResults[:maxResults]]
logger.info(f"After intelligent filtering: {len(filteredResults)} results selected from {len(uniqueResults)} unique")
return filteredResults
def _normalizeUrl(self, url: str) -> str:
"""
Normalize URL for better deduplication.
Removes common variations that represent the same content.
"""
if not url:
return url
# Remove trailing slashes
url = url.rstrip('/')
# Remove common query parameters that don't affect content
parsed = urllib.parse.urlparse(url)
# Remove common tracking parameters
queryParams = urllib.parse.parse_qs(parsed.query)
filteredParams = {}
for key, values in queryParams.items():
# Keep important parameters, remove tracking ones
if key.lower() not in ['utm_source', 'utm_medium', 'utm_campaign', 'utm_term', 'utm_content',
'fbclid', 'gclid', 'ref', 'source', 'campaign']:
filteredParams[key] = values
# Rebuild query string
filteredQuery = urllib.parse.urlencode(filteredParams, doseq=True)
        # Reconstruct URL, dropping the fragment (it points into the same page content)
        normalized = urllib.parse.urlunparse((
            parsed.scheme,
            parsed.netloc,
            parsed.path,
            parsed.params,
            filteredQuery,
            ""
        ))
        return normalized
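    # Illustrative examples (not executed):
    #   "https://example.com/post?utm_source=x&page=2" -> "https://example.com/post?page=2"
    #   "https://example.com/post/"                    -> "https://example.com/post"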
def _calculateRelevanceScore(self, result: WebSearchResult, queryWords: set) -> float:
"""
Calculate relevance score for a search result.
Higher score means more relevant to the query.
"""
score = 0.0
# Title relevance (most important)
titleWords = set(result.title.lower().split())
titleMatches = len(queryWords.intersection(titleWords))
score += titleMatches * 3.0 # Weight title matches heavily
# URL relevance
urlWords = set(result.url.lower().split('/'))
urlMatches = len(queryWords.intersection(urlWords))
score += urlMatches * 1.5
# Content relevance (if available)
if hasattr(result, 'raw_content') and result.raw_content:
contentWords = set(result.raw_content.lower().split())
contentMatches = len(queryWords.intersection(contentWords))
score += contentMatches * 0.1 # Lower weight for content matches
# Domain authority bonus (simple heuristic)
domain = result.url.split('/')[2] if '/' in result.url else result.url
if any(auth_domain in domain.lower() for auth_domain in
['wikipedia.org', 'github.com', 'stackoverflow.com', 'reddit.com', 'medium.com']):
score += 1.0
# Penalty for very long URLs (often less relevant)
if len(result.url) > 100:
score -= 0.5
return score
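    # Worked example (not executed): query "python asyncio tutorial" against a result
    # titled "Python asyncio tutorial" hosted on stackoverflow.com:
    #   3 title matches * 3.0 = 9.0, plus the 1.0 domain-authority bonus = 10.0,
    #   plus 0.1 per matching content word; a URL over 100 chars would subtract 0.5.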
async def _optimizeSearchQuery(self, query: str, timeRange: str = None, country: str = None, language: str = None) -> tuple[str, dict]:
"""
Use AI to optimize search query and parameters (from old SubWebResearch).
Args:
query: Original search query
timeRange: Time range filter
country: Country filter
language: Language filter
Returns:
Tuple of (optimized_query, optimized_parameters)
"""
try:
# Create AI prompt for query optimization (from old code)
queryOptimizerPrompt = f"""You are a search query optimizer.
USER QUERY: {query}
Your task: Create a search query and parameters for the USER QUERY given.
RULES:
1. The search query MUST be related to the user query above
2. Extract key terms from the user query
3. Determine appropriate country/language based on the query context
4. Keep search query short (2-6 words)
Return ONLY this JSON format:
{{
"user_prompt": "search query based on user query above",
"country": "Full English country name (ISO-3166; map codes via pycountry/i18n-iso-countries)",
"language": "language_code_or_null",
"topic": "general|news|academic_or_null",
"time_range": "d|w|m|y_or_null",
"selection_strategy": "single|multiple|specific_page",
"selection_criteria": "what URLs to prioritize",
"expected_url_patterns": ["pattern1", "pattern2"],
"estimated_result_count": number
}}"""
            # The AI request below is prepared but not yet dispatched: wiring it through
            # the AI interface is still pending, so we fall back to basic optimization.
            from modules.datamodels.datamodelAi import AiCallRequest, AiCallOptions
            aiRequest = AiCallRequest(  # noqa: F841 - kept for the pending AI integration
                prompt=queryOptimizerPrompt,
                options=AiCallOptions()
            )
            logger.info(f"AI query optimization requested for: '{query}'")
            # Basic optimization fallback
optimizedQuery = query
optimizedParams = {
"time_range": timeRange,
"country": country,
"language": language,
"topic": "general"
}
return optimizedQuery, optimizedParams
except Exception as e:
logger.warning(f"Query optimization failed: {str(e)}, using original query")
return query, {"time_range": timeRange, "country": country, "language": language}
async def _aiBasedUrlSelection(self, searchResults: List[WebSearchResult], originalQuery: str, maxResults: int) -> List[WebSearchResult]:
"""
Use AI to select the most relevant URLs from search results (from old SubWebResearch).
Args:
searchResults: Raw search results from Tavily
originalQuery: Original user query for context
maxResults: Maximum number of results to return
Returns:
AI-selected and filtered list of search results
"""
try:
if not searchResults:
return []
# Step 1: Basic de-duplication
seenUrls = set()
uniqueResults = []
for result in searchResults:
normalizedUrl = self._normalizeUrl(result.url)
if normalizedUrl not in seenUrls:
seenUrls.add(normalizedUrl)
uniqueResults.append(result)
logger.info(f"After basic deduplication: {len(uniqueResults)} unique URLs from {len(searchResults)} original")
if len(uniqueResults) <= maxResults:
return uniqueResults
# Step 2: AI-based URL selection (from old code)
logger.info(f"AI selecting most relevant {maxResults} URLs from {len(uniqueResults)} unique results")
# Create AI prompt for URL selection (from old code)
urlList = "\n".join([f"{i+1}. {result.url}" for i, result in enumerate(uniqueResults)])
aiPrompt = f"""Select the most relevant URLs from these search results:
{urlList}
Return only the URLs that are most relevant for the user's query: "{originalQuery}"
One URL per line.
"""
            # The prompt above is prepared for the AI interface; until that call is
            # wired in, intelligent filtering serves as the selection fallback.
            logger.info("Using intelligent filtering as AI selection fallback")
# Use the existing intelligent filtering
filteredResults = self._intelligentUrlFiltering(uniqueResults, originalQuery, maxResults)
logger.info(f"AI-based selection completed: {len(filteredResults)} results selected")
return filteredResults
except Exception as e:
logger.warning(f"AI-based URL selection failed: {str(e)}, using intelligent filtering")
return self._intelligentUrlFiltering(searchResults, originalQuery, maxResults)
def getModels(self) -> List[AiModel]:
"""Get all available Tavily models."""
return [
AiModel(
name="tavily-search",
displayName="Tavily Search",
connectorType="tavily",
apiUrl="https://api.tavily.com/search",
temperature=0.0, # Web search doesn't use temperature
maxTokens=0, # Web search doesn't use tokens
contextLength=0,
costPer1kTokensInput=0.0,
costPer1kTokensOutput=0.0,
speedRating=9, # Very fast for URL discovery
qualityRating=9, # Excellent URL discovery quality
# capabilities removed (not used in business logic)
functionCall=self.callWebOperation,
priority=PriorityEnum.BALANCED,
processingMode=ProcessingModeEnum.BASIC,
operationTypes=createOperationTypeRatings(
(OperationTypeEnum.WEB_SEARCH, 10),
(OperationTypeEnum.WEB_RESEARCH, 3),
(OperationTypeEnum.WEB_CRAWL, 2),
(OperationTypeEnum.WEB_NEWS, 3),
(OperationTypeEnum.WEB_QUESTIONS, 2)
),
version="tavily-search",
calculatePriceUsd=lambda processingTime, bytesSent, bytesReceived, searchDepth="basic", numRequests=1: numRequests * (1 if searchDepth == "basic" else 2) * 0.008
),
AiModel(
name="tavily-extract",
displayName="Tavily Extract",
connectorType="tavily",
apiUrl="https://api.tavily.com/extract",
temperature=0.0, # Web crawling doesn't use temperature
maxTokens=0, # Web crawling doesn't use tokens
contextLength=0,
costPer1kTokensInput=0.0,
costPer1kTokensOutput=0.0,
speedRating=7, # Good for content extraction
qualityRating=9, # Excellent content extraction quality
# capabilities removed (not used in business logic)
functionCall=self.callWebOperation,
priority=PriorityEnum.BALANCED,
processingMode=ProcessingModeEnum.BASIC,
operationTypes=createOperationTypeRatings(
(OperationTypeEnum.WEB_CRAWL, 10),
(OperationTypeEnum.WEB_RESEARCH, 3),
(OperationTypeEnum.WEB_NEWS, 3),
(OperationTypeEnum.WEB_QUESTIONS, 2)
),
version="tavily-extract",
calculatePriceUsd=lambda processingTime, bytesSent, bytesReceived, numPages=10, extractionDepth="basic", withInstructions=False, numSuccessfulExtractions=10: ((numPages / 10) * (2 if withInstructions else 1) + (numSuccessfulExtractions / 5) * (1 if extractionDepth == "basic" else 2)) * 0.008
),
AiModel(
name="tavily-search-extract",
displayName="Tavily Search & Extract",
connectorType="tavily",
apiUrl="https://api.tavily.com/search",
temperature=0.0, # Web scraping doesn't use temperature
maxTokens=0, # Web scraping doesn't use tokens
contextLength=0,
costPer1kTokensInput=0.0,
costPer1kTokensOutput=0.0,
speedRating=7, # Good for combined search+extract
qualityRating=8, # Good quality for structured data
# capabilities removed (not used in business logic)
functionCall=self.callWebOperation,
priority=PriorityEnum.BALANCED,
processingMode=ProcessingModeEnum.BASIC,
operationTypes=createOperationTypeRatings(
(OperationTypeEnum.WEB_RESEARCH, 8),
(OperationTypeEnum.WEB_SEARCH, 6),
(OperationTypeEnum.WEB_CRAWL, 6),
(OperationTypeEnum.WEB_NEWS, 5),
(OperationTypeEnum.WEB_QUESTIONS, 5)
),
version="tavily-search-extract",
calculatePriceUsd=lambda processingTime, bytesSent, bytesReceived, searchDepth="basic", numSuccessfulUrls=1, extractionDepth="basic": ((1 if searchDepth == "basic" else 2) + (numSuccessfulUrls / 5) * (1 if extractionDepth == "basic" else 2)) * 0.008
)
]
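    # Worked pricing examples (not executed), based on the calculatePriceUsd lambdas above:
    #   tavily-search, basic depth, 1 request:            1 * 1 * 0.008       = $0.008
    #   tavily-search, advanced depth, 1 request:         1 * 2 * 0.008       = $0.016
    #   tavily-extract, 10 pages, advanced, 10 extracted: (1*1 + 2*2) * 0.008 = $0.040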
    @classmethod
    async def create(cls):
        """Create a connector instance, failing fast if the API key is missing."""
        api_key = APP_CONFIG.get("Connector_AiTavily_API_SECRET")
        if not api_key:
            raise ValueError("Tavily API key not configured. Please set Connector_AiTavily_API_SECRET in config.ini")
        instance = cls()
        instance.client = AsyncTavilyClient(api_key=api_key)
        # Load and cache web crawl / search related configuration
        instance.crawlTimeout = int(APP_CONFIG.get("Web_Crawl_TIMEOUT", "30"))
        instance.crawlMaxRetries = int(APP_CONFIG.get("Web_Crawl_MAX_RETRIES", "3"))
        instance.crawlRetryDelay = int(APP_CONFIG.get("Web_Crawl_RETRY_DELAY", "2"))
        instance.webSearchMinResults = int(APP_CONFIG.get("Web_Search_MIN_RESULTS", "1"))
        instance.webSearchMaxResults = int(APP_CONFIG.get("Web_Search_MAX_RESULTS", "20"))
        return instance
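    # Illustrative usage (not executed); assumes APP_CONFIG is populated:
    #   connector = await ConnectorWeb.create()
    #   models = connector.getModels()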
# Standardized method using AiModelCall/AiModelResponse pattern
async def callWebOperation(self, modelCall) -> "AiModelResponse":
"""
Universal web operation handler that distributes to the correct method
based on the operationType from AiCallOptions.
"""
try:
options = modelCall.options
operationType = getattr(options, "operationType", None)
if operationType == OperationTypeEnum.WEB_SEARCH:
return await self.search(modelCall)
elif operationType == OperationTypeEnum.WEB_CRAWL:
return await self.crawl(modelCall)
elif operationType in [OperationTypeEnum.WEB_RESEARCH, OperationTypeEnum.WEB_QUESTIONS, OperationTypeEnum.WEB_NEWS]:
return await self.research(modelCall)
else:
# Fallback to search for unknown operation types
return await self.search(modelCall)
except Exception as e:
return AiModelResponse(
content="",
success=False,
error=str(e)
)
async def search(self, modelCall) -> "AiModelResponse":
"""Search using standardized AiModelCall/AiModelResponse pattern"""
try:
            # Extract parameters from modelCall
            promptContent = modelCall.messages[0]["content"] if modelCall.messages else ""
            options = modelCall.options
            # Parse unified prompt JSON format; fall back to treating the prompt
            # as a plain search query if it is not valid JSON
            try:
                promptData = json.loads(promptContent)
            except (json.JSONDecodeError, TypeError):
                promptData = {}
            # Extract parameters from unified prompt JSON
            query = promptData.get("searchPrompt", promptContent)
maxResults = promptData.get("maxResults", 5)
timeRange = promptData.get("timeRange")
country = promptData.get("country")
language = promptData.get("language")
# Use basic search depth for web search operations
searchDepth = "basic"
# Step 1: AI Query Optimization (from old SubWebResearch)
optimizedQuery, optimizedParams = await self._optimizeSearchQuery(query, timeRange, country, language)
# Step 2: Get more results than requested to allow for intelligent filtering
searchResults = await self._search(
query=optimizedQuery,
max_results=min(maxResults * 3, 30), # Get more results for better AI selection
search_depth=searchDepth,
time_range=optimizedParams.get("time_range", timeRange),
country=optimizedParams.get("country", country),
language=optimizedParams.get("language", language),
include_answer=getattr(options, "include_answer", True),
include_raw_content=getattr(options, "include_raw_content", True),
)
# Step 3: AI-based URL selection and intelligent filtering
filteredResults = await self._aiBasedUrlSelection(searchResults, query, maxResults)
# Convert to JSON string
resultsJson = {
"query": query,
"results": [
{
"title": result.title,
"url": result.url,
"content": getattr(result, 'raw_content', None)
}
for result in filteredResults
],
"total_count": len(filteredResults),
"original_count": len(searchResults),
"filtered_count": len(searchResults) - len(filteredResults)
}
            content = json.dumps(resultsJson, indent=2)
return AiModelResponse(
content=content,
success=True,
metadata={
"total_count": len(filteredResults),
"search_depth": searchDepth
}
)
except Exception as e:
return AiModelResponse(
content="",
success=False,
error=str(e)
)
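    # Illustrative prompt payload consumed by search() (keys as read above):
    #   {"searchPrompt": "latest python release", "maxResults": 5,
    #    "timeRange": "m", "country": "Germany", "language": "en"}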
async def crawl(self, modelCall) -> "AiModelResponse":
"""Crawl using standardized AiModelCall/AiModelResponse pattern"""
try:
# Extract parameters from modelCall
promptContent = modelCall.messages[0]["content"] if modelCall.messages else ""
options = modelCall.options
            # Parse unified prompt JSON format; if the prompt is plain text,
            # fall back to extracting URLs directly from it
            try:
                promptData = json.loads(promptContent)
            except (json.JSONDecodeError, TypeError):
                promptData = {"urls": self._extractUrlsFromPrompt(promptContent)}
            # Extract parameters from unified prompt JSON
            urls = promptData.get("urls", [])
extractDepth = promptData.get("extractDepth", "advanced")
formatType = promptData.get("format", "markdown")
if not urls:
return AiModelResponse(
content="No URLs provided for crawling",
success=False,
error="No URLs found in prompt data"
)
rawResults = await self._crawl(
urls,
extract_depth=extractDepth,
format=formatType,
)
# Convert to JSON string
resultsJson = {
"urls": urls,
"results": [
{
"url": result.url,
"title": getattr(result, 'title', ''),
"content": result.content,
"extractedAt": getattr(result, 'extracted_at', '')
}
for result in rawResults
],
"total_count": len(rawResults)
}
            content = json.dumps(resultsJson, indent=2)
return AiModelResponse(
content=content,
success=True,
metadata={
"total_count": len(rawResults),
"urls_processed": len(urls)
}
)
except Exception as e:
return AiModelResponse(
content="",
success=False,
error=str(e)
)
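    # Illustrative prompt payload consumed by crawl():
    #   {"urls": ["https://example.com/docs"], "extractDepth": "advanced", "format": "markdown"}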
async def research(self, modelCall) -> "AiModelResponse":
"""
Handle WEB_RESEARCH, WEB_QUESTIONS, WEB_NEWS operations using search + crawl combination.
Single method for all three operation types with different standard settings.
"""
try:
# Extract parameters from modelCall
promptContent = modelCall.messages[0]["content"] if modelCall.messages else ""
options = modelCall.options
operationType = getattr(options, "operationType", None)
            # Parse unified prompt JSON format; fall back to treating the prompt
            # as a plain query if it is not valid JSON
            try:
                promptData = json.loads(promptContent)
            except (json.JSONDecodeError, TypeError):
                promptData = {}
# Extract parameters based on operation type
if operationType == OperationTypeEnum.WEB_RESEARCH:
query = promptData.get("researchPrompt", promptContent)
maxResults = promptData.get("maxResults", 8)
searchDepth = "basic"
timeRange = promptData.get("timeRange")
country = promptData.get("country")
language = promptData.get("language")
topic = "general"
elif operationType == OperationTypeEnum.WEB_QUESTIONS:
query = promptData.get("question", promptContent)
maxResults = promptData.get("maxResults", 6)
searchDepth = "basic"
timeRange = promptData.get("timeRange")
country = promptData.get("country")
language = promptData.get("language")
topic = "general"
elif operationType == OperationTypeEnum.WEB_NEWS:
query = promptData.get("newsPrompt", promptContent)
maxResults = promptData.get("maxResults", 10)
searchDepth = "basic"
timeRange = promptData.get("timeRange", "w") # Default to week for news
country = promptData.get("country")
language = promptData.get("language")
topic = "news"
else:
# Fallback to research settings
query = promptData.get("researchPrompt", promptContent)
maxResults = promptData.get("maxResults", 5)
searchDepth = "basic"
timeRange = promptData.get("timeRange")
country = promptData.get("country")
language = promptData.get("language")
topic = "general"
logger.info(f"Tavily {operationType} operation: query='{query}', maxResults={maxResults}, topic={topic}")
# Step 1: Search for relevant URLs
searchResults = await self._search(
query=query,
max_results=maxResults * 2, # Get more for better selection
search_depth=searchDepth,
time_range=timeRange,
country=country,
language=language,
topic=topic,
include_answer=True,
include_raw_content=True
)
if not searchResults:
return AiModelResponse(
content="No search results found",
success=False,
error="No relevant URLs found for the query"
)
# Step 2: AI-based URL selection
selectedResults = await self._aiBasedUrlSelection(searchResults, query, maxResults)
if not selectedResults:
return AiModelResponse(
content="No relevant URLs selected",
success=False,
error="AI could not select relevant URLs"
)
# Step 3: Crawl selected URLs for content
urlsToCrawl = [result.url for result in selectedResults]
crawlResults = await self._crawl(
urls=urlsToCrawl,
extract_depth="advanced",
format="markdown"
)
# Step 4: Combine search and crawl results
combinedResults = []
for searchResult in selectedResults:
# Find corresponding crawl result
crawlResult = next((cr for cr in crawlResults if cr.url == searchResult.url), None)
combinedResult = {
"title": searchResult.title,
"url": searchResult.url,
"summary": getattr(searchResult, 'raw_content', ''),
"content": crawlResult.content if crawlResult else '',
"extractedAt": getattr(crawlResult, 'extracted_at', '') if crawlResult else ''
}
combinedResults.append(combinedResult)
            # Step 5: Format response based on operation type (compare against the
            # enum, matching the dispatch checks above; string comparison never matched)
            if operationType == OperationTypeEnum.WEB_RESEARCH:
                responseData = {
                    "query": query,
                    "research_results": combinedResults,
                    "total_count": len(combinedResults),
                    "operation_type": "research"
                }
            elif operationType == OperationTypeEnum.WEB_QUESTIONS:
                responseData = {
                    "question": query,
                    "answer_sources": combinedResults,
                    "total_count": len(combinedResults),
                    "operation_type": "questions"
                }
            elif operationType == OperationTypeEnum.WEB_NEWS:
                responseData = {
                    "news_query": query,
                    "articles": combinedResults,
                    "total_count": len(combinedResults),
                    "operation_type": "news"
                }
            else:
                responseData = {
                    "query": query,
                    "results": combinedResults,
                    "total_count": len(combinedResults),
                    "operation_type": str(operationType)
                }
            content = json.dumps(responseData, indent=2)
return AiModelResponse(
content=content,
success=True,
metadata={
"total_count": len(combinedResults),
"urls_searched": len(searchResults),
"urls_crawled": len(crawlResults),
"operation_type": operationType
}
)
except Exception as e:
return AiModelResponse(
content="",
success=False,
error=str(e)
)
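    # Illustrative prompt payloads consumed by research(), one per operation type:
    #   WEB_RESEARCH:  {"researchPrompt": "state of RISC-V adoption", "maxResults": 8}
    #   WEB_QUESTIONS: {"question": "when was Python 3.12 released?", "maxResults": 6}
    #   WEB_NEWS:      {"newsPrompt": "chip industry", "maxResults": 10, "timeRange": "w"}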
async def scrape(self, modelCall) -> "AiModelResponse":
"""Scrape using standardized AiModelCall/AiModelResponse pattern"""
try:
# Extract parameters from modelCall
query = modelCall.messages[0]["content"] if modelCall.messages else ""
options = modelCall.options
search_results = await self._search(
query=query,
max_results=getattr(options, "max_results", 5),
search_depth=getattr(options, "search_depth", None),
time_range=getattr(options, "time_range", None),
topic=getattr(options, "topic", None),
include_domains=getattr(options, "include_domains", None),
exclude_domains=getattr(options, "exclude_domains", None),
language=getattr(options, "language", None),
include_answer=getattr(options, "include_answer", None),
include_raw_content=getattr(options, "include_raw_content", None),
)
urls = [result.url for result in search_results]
crawl_results = await self._crawl(
urls,
extract_depth=getattr(options, "extract_depth", None),
format=getattr(options, "format", None),
)
# Convert to JSON string
results_json = {
"query": query,
"results": [
{
"url": result.url,
"content": result.content
}
for result in crawl_results
],
"total_count": len(crawl_results)
}
            content = json.dumps(results_json, indent=2)
return AiModelResponse(
content=content,
success=True,
metadata={
"total_count": len(crawl_results),
"search_depth": getattr(options, "search_depth", "basic"),
"extract_depth": getattr(options, "extract_depth", "basic")
}
)
except Exception as e:
return AiModelResponse(
content="",
success=False,
error=str(e)
)
# Helper Functions
async def _search_urls_raw(self,
*,
query: str,
max_results: int,
search_depth: str | None = None,
time_range: str | None = None,
topic: str | None = None,
include_domains: list[str] | None = None,
exclude_domains: list[str] | None = None,
language: str | None = None,
include_answer: bool | None = None,
include_raw_content: bool | None = None,
) -> list["WebSearchResult"]:
return await self._search(
query=query,
max_results=max_results,
search_depth=search_depth,
time_range=time_range,
topic=topic,
include_domains=include_domains,
exclude_domains=exclude_domains,
language=language,
include_answer=include_answer,
include_raw_content=include_raw_content,
)
async def _crawl_urls_raw(self,
*,
urls: list[str],
extract_depth: str | None = None,
format: str | None = None,
) -> list["WebCrawlResult"]:
return await self._crawl(urls, extract_depth=extract_depth, format=format)
async def _scrape_raw(self,
*,
query: str,
max_results: int,
search_depth: str | None = None,
time_range: str | None = None,
topic: str | None = None,
include_domains: list[str] | None = None,
exclude_domains: list[str] | None = None,
language: str | None = None,
include_answer: bool | None = None,
include_raw_content: bool | None = None,
extract_depth: str | None = None,
format: str | None = None,
) -> list["WebCrawlResult"]:
search_results = await self._search(
query=query,
max_results=max_results,
search_depth=search_depth,
time_range=time_range,
topic=topic,
include_domains=include_domains,
exclude_domains=exclude_domains,
language=language,
include_answer=include_answer,
include_raw_content=include_raw_content,
)
urls = [result.url for result in search_results]
return await self._crawl(urls, extract_depth=extract_depth, format=format)
def _clean_url(self, url: str) -> str:
"""Clean URL by removing extra text that might be appended."""
# Extract just the URL part, removing any extra text after it
url_match = re.match(r'(https?://[^\s,]+)', url)
if url_match:
return url_match.group(1)
return url
async def _search(
self,
query: str,
max_results: int,
search_depth: str | None = None,
time_range: str | None = None,
topic: str | None = None,
include_domains: list[str] | None = None,
exclude_domains: list[str] | None = None,
language: str | None = None,
country: str | None = None,
include_answer: bool | None = None,
include_raw_content: bool | None = None,
) -> list[WebSearchResult]:
"""Calls the Tavily API to perform a web search."""
# Make sure max_results is within the allowed range (use cached values)
minResults = self.webSearchMinResults
maxAllowedResults = self.webSearchMaxResults
if max_results < minResults or max_results > maxAllowedResults:
raise ValueError(f"max_results must be between {minResults} and {maxAllowedResults}")
# Perform actual API call
# Build kwargs only for provided options to avoid API rejections
kwargs: dict = {"query": query, "max_results": max_results}
if search_depth is not None:
kwargs["search_depth"] = search_depth
if time_range is not None:
kwargs["time_range"] = time_range
if topic is not None:
kwargs["topic"] = topic
if include_domains is not None and len(include_domains) > 0:
kwargs["include_domains"] = include_domains
if exclude_domains is not None:
kwargs["exclude_domains"] = exclude_domains
if language is not None:
kwargs["language"] = language
if country is not None:
kwargs["country"] = country
if include_answer is not None:
kwargs["include_answer"] = include_answer
if include_raw_content is not None:
kwargs["include_raw_content"] = include_raw_content
logger.debug(f"Tavily.search kwargs: {kwargs}")
# Ensure client is initialized
if self.client is None:
self._initializeClient()
if self.client is None:
raise ValueError("Tavily client not initialized. Please check API key configuration.")
response = await self.client.search(**kwargs)
return [
WebSearchResult(
title=result["title"],
url=self._clean_url(result["url"]),
raw_content=result.get("raw_content")
)
for result in response["results"]
]
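    # Illustrative direct call (not executed); max_results must fall within the
    # cached Web_Search_MIN_RESULTS/Web_Search_MAX_RESULTS bounds:
    #   results = await connector._search(query="python asyncio", max_results=5,
    #                                     search_depth="basic", include_answer=True)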
async def _crawl(
self,
urls: list,
extract_depth: str | None = None,
format: str | None = None,
) -> list[WebCrawlResult]:
"""Calls the Tavily API to extract text content from URLs with retry logic."""
maxRetries = self.crawlMaxRetries
retryDelay = self.crawlRetryDelay
timeout = self.crawlTimeout
logger.debug(f"Starting crawl of {len(urls)} URLs: {urls}")
logger.debug(f"Crawl settings: extract_depth={extract_depth}, format={format}, timeout={timeout}s")
for attempt in range(maxRetries + 1):
try:
logger.debug(f"Crawl attempt {attempt + 1}/{maxRetries + 1}")
# Use asyncio.wait_for for timeout
# Build kwargs for extract
kwargs_extract: dict = {"urls": urls}
kwargs_extract["extract_depth"] = extract_depth or "advanced"
kwargs_extract["format"] = format or "markdown" # Use markdown to get HTML structure
logger.debug(f"Sending request to Tavily with kwargs: {kwargs_extract}")
# Ensure client is initialized
if self.client is None:
self._initializeClient()
if self.client is None:
raise ValueError("Tavily client not initialized. Please check API key configuration.")
response = await asyncio.wait_for(
self.client.extract(**kwargs_extract),
timeout=timeout
)
logger.debug(f"Tavily response received: {list(response.keys())}")
# Debug: Log what Tavily actually returns
if "results" in response and response["results"]:
logger.debug(f"Tavily returned {len(response['results'])} results")
logger.debug(f"First result keys: {list(response['results'][0].keys())}")
logger.debug(f"First result has raw_content: {'raw_content' in response['results'][0]}")
# Log each result
for i, result in enumerate(response["results"]):
logger.debug(f"Result {i+1}: URL={result.get('url', 'N/A')}, content_length={len(result.get('raw_content', result.get('content', '')))}")
else:
logger.warning(f"Tavily returned no results in response: {response}")
results = [
WebCrawlResult(
url=result["url"],
content=result.get("raw_content", result.get("content", "")) # Try raw_content first, fallback to content
)
for result in response["results"]
]
logger.debug(f"Crawl successful: extracted {len(results)} results")
return results
except asyncio.TimeoutError:
logger.warning(f"Crawl attempt {attempt + 1} timed out after {timeout} seconds for URLs: {urls}")
if attempt < maxRetries:
logger.info(f"Retrying in {retryDelay} seconds...")
await asyncio.sleep(retryDelay)
else:
raise Exception(f"Crawl failed after {maxRetries + 1} attempts due to timeout")
except Exception as e:
logger.warning(f"Crawl attempt {attempt + 1} failed for URLs {urls}: {str(e)}")
logger.debug(f"Full error details: {type(e).__name__}: {str(e)}")
# Check if it's a validation error and log more details
if "validation" in str(e).lower():
logger.debug(f"URL validation failed. Checking URL format:")
for i, url in enumerate(urls):
logger.debug(f" URL {i+1}: '{url}' (length: {len(url)})")
# Check for common URL issues
if ' ' in url:
logger.debug(f" WARNING: URL contains spaces!")
if not url.startswith(('http://', 'https://')):
logger.debug(f" WARNING: URL doesn't start with http/https!")
if len(url) > 2000:
logger.debug(f" WARNING: URL is very long ({len(url)} chars)")
if attempt < maxRetries:
logger.info(f"Retrying in {retryDelay} seconds...")
await asyncio.sleep(retryDelay)
else:
raise Exception(f"Crawl failed after {maxRetries + 1} attempts: {str(e)}")
async def comprehensiveWebResearch(self, request: WebResearchRequest) -> WebResearchResult:
"""
Perform comprehensive web research using Tavily's search and extract capabilities.
This method orchestrates the full web research workflow.
"""
try:
logger.info(f"COMPREHENSIVE WEB RESEARCH STARTED")
logger.info(f"User Query: {request.user_prompt}")
logger.info(f"Max Results: {request.max_results}, Max Pages: {request.max_pages}")
# Global URL index to track all processed URLs across the entire research session
global_processed_urls = set()
# Step 1: Find relevant websites - either provided URLs or AI-determined main URLs
logger.info(f"=== STEP 1: INITIAL MAIN URLS LIST ===")
if request.urls:
# Use provided URLs as initial main URLs
websites = request.urls
logger.info(f"Using provided URLs ({len(websites)}):")
for i, url in enumerate(websites, 1):
logger.info(f" {i}. {url}")
else:
# Use AI to determine main URLs based on user's intention
logger.info(f"AI analyzing user intent: '{request.user_prompt}'")
# Use basic search parameters
search_query = request.user_prompt
search_depth = request.search_depth or "basic"
time_range = request.time_range
topic = request.topic
country = request.country
language = request.language
max_results = request.max_results
logger.info(f"Using search parameters: query='{search_query}', depth={search_depth}, time_range={time_range}, topic={topic}")
# Perform web search
search_results = await self._search(
query=search_query,
max_results=max_results,
search_depth=search_depth,
time_range=time_range,
topic=topic,
country=country,
language=language,
include_answer=True,
include_raw_content=True
)
# Extract URLs from search results
websites = [result.url for result in search_results]
logger.info(f"Found {len(websites)} URLs from search")
# AI-based URL selection and deduplication
if len(websites) > request.max_pages:
logger.info(f"AI selecting most relevant {request.max_pages} URLs from {len(websites)} found")
                    # Placeholder for AI selection: take the first max_pages URLs
                    selected_websites = websites[:request.max_pages]
# Remove duplicates while preserving order
seen = set()
unique_websites = []
for url in selected_websites:
if url not in seen:
seen.add(url)
unique_websites.append(url)
websites = unique_websites
logger.info(f"After AI selection deduplication: {len(websites)} unique URLs")
logger.info(f"AI selected {len(websites)} main URLs (after deduplication):")
for i, url in enumerate(websites, 1):
logger.info(f" {i}. {url}")
# Step 2: Smart website selection using AI interface
logger.info(f"=== STEP 2: FILTERED URL LIST BY USER PROMPT'S INTENTION ===")
logger.info(f"AI analyzing {len(websites)} URLs for relevance to: '{request.user_prompt}'")
# For now, just use all websites
selected_websites = websites
logger.debug(f"AI selected {len(selected_websites)} most relevant URLs:")
for i, url in enumerate(selected_websites, 1):
logger.debug(f" {i}. {url}")
# Step 3+4+5: Recursive crawling with configurable depth
# Get configuration parameters
max_depth = int(APP_CONFIG.get("Web_Research_MAX_DEPTH", "2"))
max_links_per_domain = int(APP_CONFIG.get("Web_Research_MAX_LINKS_PER_DOMAIN", "4"))
crawl_timeout_minutes = int(APP_CONFIG.get("Web_Research_CRAWL_TIMEOUT_MINUTES", "10"))
            # request.search_depth is normally a string ("basic"/"advanced"); only an
            # explicit integer is honored as a crawl depth, otherwise default to 2
            effective_depth = min(max_depth, request.search_depth if isinstance(request.search_depth, int) else 2)
logger.info(f"=== STEP 3+4+5: RECURSIVE CRAWLING ===")
logger.info(f"Starting recursive crawl with depth {effective_depth}")
logger.info(f"Max links per domain: {max_links_per_domain}")
logger.info(f"Crawl timeout: {crawl_timeout_minutes} minutes")
# Perform recursive crawling
all_content = await self._crawlRecursively(
urls=selected_websites,
max_depth=effective_depth,
extract_depth=request.extract_depth,
max_per_domain=max_links_per_domain,
global_processed_urls=global_processed_urls
)
logger.info(f"Recursive crawl completed: {len(all_content)} total pages crawled")
# Step 6: AI analysis of all collected content
logger.info(f"=== STEP 6: AI ANALYSIS ===")
logger.info(f"Analyzing {len(all_content)} websites with AI")
# Create a basic analysis result
analysis_result = f"Web research completed for: {request.user_prompt}\n\n"
analysis_result += f"Analyzed {len(all_content)} websites:\n"
for url, content in all_content.items():
analysis_result += f"- {url}: {len(content)} characters\n"
            # Create result documents
            result_documents = []
# Main research result
main_document = {
"documentName": f"web_research_{int(time.time())}.json",
"documentData": {
"user_prompt": request.user_prompt,
"websites_analyzed": len(all_content),
"additional_links_found": 0, # Would be calculated from crawl results
"analysis_result": analysis_result,
"sources": [{"title": f"Website {i+1}", "url": url} for i, url in enumerate(all_content.keys())],
"additional_links": [],
"debug_info": {
"total_urls_processed": len(global_processed_urls),
"crawl_depth": effective_depth,
"extract_depth": request.extract_depth
}
},
"mimeType": "application/json"
}
result_documents.append(main_document)
# Individual website content documents
for i, (url, content) in enumerate(all_content.items()):
content_document = {
"documentName": f"website_content_{i+1}.md",
"documentData": content,
"mimeType": "text/markdown"
}
result_documents.append(content_document)
logger.info(f"WEB RESEARCH COMPLETED SUCCESSFULLY")
logger.info(f"Generated {len(result_documents)} result documents")
return WebResearchResult(
success=True,
documents=result_documents
)
except Exception as e:
logger.error(f"Error in comprehensive web research: {str(e)}")
return WebResearchResult(
success=False,
error=str(e),
documents=[]
)
async def _crawlRecursively(self, urls: List[str], max_depth: int, extract_depth: str = "advanced", max_per_domain: int = 10, global_processed_urls: Optional[set] = None) -> Dict[str, str]:
"""
Recursively crawl URLs up to specified depth.
This is a simplified version of the recursive crawling logic.
"""
logger.info(f"Starting recursive crawl: {len(urls)} starting URLs, max_depth={max_depth}")
# URL index to track all processed URLs (local + global)
processed_urls = set()
if global_processed_urls is not None:
processed_urls = global_processed_urls
logger.info(f"Using global URL index with {len(processed_urls)} already processed URLs")
else:
logger.info("Using local URL index for this crawl session")
all_content = {}
current_level_urls = urls.copy()
try:
for depth in range(1, max_depth + 1):
logger.info(f"=== DEPTH LEVEL {depth}/{max_depth} ===")
logger.info(f"Processing {len(current_level_urls)} URLs at depth {depth}")
# URLs found at this level (for next iteration)
next_level_urls = []
for url in current_level_urls:
# Normalize URL for duplicate checking
normalized_url = self._normalizeUrl(url)
if normalized_url in processed_urls:
logger.debug(f"URL {url} (normalized: {normalized_url}) already processed, skipping")
continue
try:
logger.info(f"Processing URL at depth {depth}: {url}")
# Extract content from URL
crawl_results = await self._crawl([url], extract_depth=extract_depth, format="markdown")
if crawl_results and crawl_results[0].content:
content = crawl_results[0].content
all_content[url] = content
processed_urls.add(normalized_url)
logger.info(f"✓ Successfully processed {url}: {len(content)} chars")
# For simplicity, we'll skip finding sub-links in this implementation
# In a full implementation, you would extract links and add them to next_level_urls
else:
logger.warning(f"✗ No content extracted from {url}")
processed_urls.add(normalized_url)
except Exception as e:
logger.warning(f"✗ Failed to process URL {url} at depth {depth}: {e}")
processed_urls.add(normalized_url)
# Prepare for next iteration
current_level_urls = next_level_urls
logger.info(f"Depth {depth} completed. Found {len(next_level_urls)} URLs for next level")
# Stop if no more URLs to process
if not current_level_urls:
logger.info(f"No more URLs found at depth {depth}, stopping recursion")
break
logger.info(f"Recursive crawl completed: {len(all_content)} total pages crawled")
return all_content
except Exception as e:
logger.error(f"Crawling failed with error: {e}, returning partial results: {len(all_content)} pages crawled so far")
return all_content
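    # Sketch of the skipped sub-link step (not executed): crawled markdown content
    # still contains absolute URLs, so the existing helper could seed the next level:
    #   found = self._extractUrlsFromPrompt(content)
    #   next_level_urls.extend(u for u in found if self._normalizeUrl(u) not in processed_urls)
    # with the result capped per domain via max_per_domain.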
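

# Minimal manual smoke test (illustrative sketch; assumes a configured
# Connector_AiTavily_API_SECRET and that this module's imports resolve).
# Run the file directly to exercise the comprehensive research flow end to end.
if __name__ == "__main__":
    async def _demo():
        connector = await ConnectorWeb.create()
        request = WebResearchRequest(user_prompt="current state of WebAssembly", max_results=3, max_pages=3)
        result = await connector.comprehensiveWebResearch(request)
        print(f"success={result.success}, documents={len(result.documents)}")

    asyncio.run(_demo())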