gateway/modules/connectors/connectorAiTavily.py
2025-09-26 23:36:56 +02:00

339 lines
12 KiB
Python

"""Tavily web connector.

Wraps the async Tavily client to provide web search, crawl and scrape actions.
"""
import logging
import asyncio
from dataclasses import dataclass
from tavily import AsyncTavilyClient
from modules.shared.configuration import APP_CONFIG
from modules.shared.timezoneUtils import get_utc_timestamp
from modules.datamodels.datamodelWeb import (
WebSearchActionResult,
WebSearchActionDocument,
WebSearchDocumentData,
WebSearchResultItem,
WebCrawlActionResult,
WebCrawlActionDocument,
WebCrawlDocumentData,
WebCrawlResultItem,
WebScrapeActionResult,
WebScrapeActionDocument,
WebSearchDocumentData as WebScrapeDocumentData,
WebScrapeResultItem,
)
# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)
@dataclass
class WebSearchResult:
    """A single hit produced by the connector's raw search call."""

    # Taken from the "title" field of a search response entry.
    title: str
    # Taken from the "url" field of a search response entry.
    url: str
@dataclass
class WebCrawlResult:
    """Extracted content for one URL produced by the connector's raw crawl call."""

    # Taken from the "url" field of an extract response entry.
    url: str
    # Taken from the "raw_content" field of an extract response entry.
    content: str
@dataclass
class ConnectorWeb:
client: AsyncTavilyClient = None
# Cached settings loaded at initialization time
crawl_timeout: int = 30
crawl_max_retries: int = 3
crawl_retry_delay: int = 2
# Cached web search constraints (camelCase per project style)
webSearchMinResults: int = 1
webSearchMaxResults: int = 20
@classmethod
async def create(cls):
api_key = APP_CONFIG.get("Connector_WebTavily_API_KEY_SECRET")
if not api_key:
raise ValueError("Tavily API key not configured. Please set Connector_WebTavily_API_KEY_SECRET in config.ini")
# Load and cache web crawl related configuration
crawl_timeout = int(APP_CONFIG.get("Web_Crawl_TIMEOUT", "30"))
crawl_max_retries = int(APP_CONFIG.get("Web_Crawl_MAX_RETRIES", "3"))
crawl_retry_delay = int(APP_CONFIG.get("Web_Crawl_RETRY_DELAY", "2"))
return cls(
client=AsyncTavilyClient(api_key=api_key),
crawl_timeout=crawl_timeout,
crawl_max_retries=crawl_max_retries,
crawl_retry_delay=crawl_retry_delay,
webSearchMinResults=int(APP_CONFIG.get("Web_Search_MIN_RESULTS", "1")),
webSearchMaxResults=int(APP_CONFIG.get("Web_Search_MAX_RESULTS", "20")),
)
# Standardized methods returning ActionResults for the interface to consume
async def search(self, request) -> "WebSearchActionResult":
try:
raw_results = await self._search(
query=request.query,
max_results=request.max_results,
search_depth=request.search_depth,
time_range=request.time_range,
topic=request.topic,
include_domains=request.include_domains,
exclude_domains=request.exclude_domains,
language=request.language,
include_answer=request.include_answer,
include_raw_content=request.include_raw_content,
)
except Exception as e:
return WebSearchActionResult(success=False, error=str(e))
result_items = [
WebSearchResultItem(title=result.title, url=result.url)
for result in raw_results
]
document_data = WebSearchDocumentData(
query=request.query,
results=result_items,
total_count=len(result_items),
)
document = WebSearchActionDocument(
documentName=f"web_search_results_{get_utc_timestamp()}.json",
documentData=document_data,
mimeType="application/json",
)
return WebSearchActionResult(
success=True, documents=[document], resultLabel="web_search_results"
)
async def crawl(self, request) -> "WebCrawlActionResult":
try:
raw_results = await self._crawl(
[str(u) for u in request.urls],
extract_depth=request.extract_depth,
format=request.format,
)
except Exception as e:
return WebCrawlActionResult(success=False, error=str(e))
result_items = [
WebCrawlResultItem(url=result.url, content=result.content)
for result in raw_results
]
document_data = WebCrawlDocumentData(
urls=[str(u) for u in request.urls],
results=result_items,
total_count=len(result_items),
)
document = WebCrawlActionDocument(
documentName=f"web_crawl_results_{get_utc_timestamp()}.json",
documentData=document_data,
mimeType="application/json",
)
return WebCrawlActionResult(
success=True, documents=[document], resultLabel="web_crawl_results"
)
async def scrape(self, request) -> "WebScrapeActionResult":
try:
search_results = await self._search(
query=request.query,
max_results=request.max_results,
search_depth=request.search_depth,
time_range=request.time_range,
topic=request.topic,
include_domains=request.include_domains,
exclude_domains=request.exclude_domains,
language=request.language,
include_answer=request.include_answer,
include_raw_content=request.include_raw_content,
)
except Exception as e:
return WebScrapeActionResult(success=False, error=str(e))
try:
urls = [result.url for result in search_results]
crawl_results = await self._crawl(
urls,
extract_depth=request.extract_depth,
format=request.format,
)
except Exception as e:
return WebScrapeActionResult(success=False, error=str(e))
result_items = [
WebScrapeResultItem(url=result.url, content=result.content)
for result in crawl_results
]
document_data = WebScrapeDocumentData(
query=request.query,
results=result_items,
total_count=len(result_items),
)
document = WebScrapeActionDocument(
documentName=f"web_scrape_results_{get_utc_timestamp()}.json",
documentData=document_data,
mimeType="application/json",
)
return WebScrapeActionResult(
success=True, documents=[document], resultLabel="web_scrape_results"
)
async def _search_urls_raw(self,
*,
query: str,
max_results: int,
search_depth: str | None = None,
time_range: str | None = None,
topic: str | None = None,
include_domains: list[str] | None = None,
exclude_domains: list[str] | None = None,
language: str | None = None,
include_answer: bool | None = None,
include_raw_content: bool | None = None,
) -> list["WebSearchResult"]:
return await self._search(
query=query,
max_results=max_results,
search_depth=search_depth,
time_range=time_range,
topic=topic,
include_domains=include_domains,
exclude_domains=exclude_domains,
language=language,
include_answer=include_answer,
include_raw_content=include_raw_content,
)
async def _crawl_urls_raw(self,
*,
urls: list[str],
extract_depth: str | None = None,
format: str | None = None,
) -> list["WebCrawlResult"]:
return await self._crawl(urls, extract_depth=extract_depth, format=format)
async def _scrape_raw(self,
*,
query: str,
max_results: int,
search_depth: str | None = None,
time_range: str | None = None,
topic: str | None = None,
include_domains: list[str] | None = None,
exclude_domains: list[str] | None = None,
language: str | None = None,
include_answer: bool | None = None,
include_raw_content: bool | None = None,
extract_depth: str | None = None,
format: str | None = None,
) -> list["WebCrawlResult"]:
search_results = await self._search(
query=query,
max_results=max_results,
search_depth=search_depth,
time_range=time_range,
topic=topic,
include_domains=include_domains,
exclude_domains=exclude_domains,
language=language,
include_answer=include_answer,
include_raw_content=include_raw_content,
)
urls = [result.url for result in search_results]
return await self._crawl(urls, extract_depth=extract_depth, format=format)
async def _search(
self,
query: str,
max_results: int,
search_depth: str | None = None,
time_range: str | None = None,
topic: str | None = None,
include_domains: list[str] | None = None,
exclude_domains: list[str] | None = None,
language: str | None = None,
include_answer: bool | None = None,
include_raw_content: bool | None = None,
) -> list[WebSearchResult]:
"""Calls the Tavily API to perform a web search."""
# Make sure max_results is within the allowed range (use cached values)
min_results = self.webSearchMinResults
max_allowed_results = self.webSearchMaxResults
if max_results < min_results or max_results > max_allowed_results:
raise ValueError(f"max_results must be between {min_results} and {max_allowed_results}")
# Perform actual API call
# Build kwargs only for provided options to avoid API rejections
kwargs: dict = {"query": query, "max_results": max_results}
if search_depth is not None:
kwargs["search_depth"] = search_depth
if time_range is not None:
kwargs["time_range"] = time_range
if topic is not None:
kwargs["topic"] = topic
if include_domains is not None:
kwargs["include_domains"] = include_domains
if exclude_domains is not None:
kwargs["exclude_domains"] = exclude_domains
if language is not None:
kwargs["language"] = language
if include_answer is not None:
kwargs["include_answer"] = include_answer
if include_raw_content is not None:
kwargs["include_raw_content"] = include_raw_content
response = await self.client.search(**kwargs)
return [
WebSearchResult(title=result["title"], url=result["url"])
for result in response["results"]
]
async def _crawl(
self,
urls: list,
extract_depth: str | None = None,
format: str | None = None,
) -> list[WebCrawlResult]:
"""Calls the Tavily API to extract text content from URLs with retry logic."""
max_retries = self.crawl_max_retries
retry_delay = self.crawl_retry_delay
timeout = self.crawl_timeout
for attempt in range(max_retries + 1):
try:
# Use asyncio.wait_for for timeout
# Build kwargs for extract
kwargs_extract: dict = {"urls": urls}
kwargs_extract["extract_depth"] = extract_depth or "advanced"
kwargs_extract["format"] = format or "text"
response = await asyncio.wait_for(
self.client.extract(**kwargs_extract),
timeout=timeout
)
return [
WebCrawlResult(url=result["url"], content=result["raw_content"])
for result in response["results"]
]
except asyncio.TimeoutError:
logger.warning(f"Crawl attempt {attempt + 1} timed out after {timeout} seconds")
if attempt < max_retries:
logger.info(f"Retrying in {retry_delay} seconds...")
await asyncio.sleep(retry_delay)
else:
raise Exception(f"Crawl failed after {max_retries + 1} attempts due to timeout")
except Exception as e:
logger.warning(f"Crawl attempt {attempt + 1} failed: {str(e)}")
if attempt < max_retries:
logger.info(f"Retrying in {retry_delay} seconds...")
await asyncio.sleep(retry_delay)
else:
raise Exception(f"Crawl failed after {max_retries + 1} attempts: {str(e)}")