"""Tavily web search class."""

import asyncio
import logging
from dataclasses import dataclass

from tavily import AsyncTavilyClient

from modules.shared.configuration import APP_CONFIG
from modules.shared.timezoneUtils import get_utc_timestamp
from modules.datamodels.datamodelWeb import (
    WebSearchActionResult,
    WebSearchActionDocument,
    WebSearchDocumentData,
    WebSearchResultItem,
    WebCrawlActionResult,
    WebCrawlActionDocument,
    WebCrawlDocumentData,
    WebCrawlResultItem,
    WebScrapeActionResult,
    WebScrapeActionDocument,
    WebSearchDocumentData as WebScrapeDocumentData,
    WebScrapeResultItem,
)

logger = logging.getLogger(__name__)


@dataclass
class WebSearchResult:
    """Minimal search hit returned by the Tavily search API."""

    title: str
    url: str


@dataclass
class WebCrawlResult:
    """Extracted page content returned by the Tavily extract API."""

    url: str
    content: str


@dataclass
class ConnectorWeb:
    """Async connector wrapping the Tavily search/extract APIs.

    Build instances via :meth:`create`, which reads the API key and the
    crawl/search settings from ``APP_CONFIG`` once and caches them on the
    instance.
    """

    # Populated by create(); annotated Optional because the dataclass
    # default is None until create() injects a real client.
    client: AsyncTavilyClient | None = None
    # Cached settings loaded at initialization time
    crawl_timeout: int = 30
    crawl_max_retries: int = 3
    crawl_retry_delay: int = 2
    # Cached web search constraints (camelCase per project style)
    webSearchMinResults: int = 1
    webSearchMaxResults: int = 20

    @classmethod
    async def create(cls):
        """Build a connector from APP_CONFIG.

        Returns:
            A ConnectorWeb with a live AsyncTavilyClient and cached settings.

        Raises:
            ValueError: if the Tavily API key is not configured.
        """
        api_key = APP_CONFIG.get("Connector_WebTavily_API_KEY_SECRET")
        if not api_key:
            raise ValueError(
                "Tavily API key not configured. "
                "Please set Connector_WebTavily_API_KEY_SECRET in config.ini"
            )
        # Load and cache web crawl related configuration
        crawl_timeout = int(APP_CONFIG.get("Web_Crawl_TIMEOUT", "30"))
        crawl_max_retries = int(APP_CONFIG.get("Web_Crawl_MAX_RETRIES", "3"))
        crawl_retry_delay = int(APP_CONFIG.get("Web_Crawl_RETRY_DELAY", "2"))
        return cls(
            client=AsyncTavilyClient(api_key=api_key),
            crawl_timeout=crawl_timeout,
            crawl_max_retries=crawl_max_retries,
            crawl_retry_delay=crawl_retry_delay,
            webSearchMinResults=int(APP_CONFIG.get("Web_Search_MIN_RESULTS", "1")),
            webSearchMaxResults=int(APP_CONFIG.get("Web_Search_MAX_RESULTS", "20")),
        )

    # Standardized methods returning ActionResults for the interface to consume
    async def search(self, request) -> "WebSearchActionResult":
        """Run a web search and wrap the hits in a WebSearchActionResult.

        Failures are returned as ``success=False`` results, never raised.
        """
        try:
            raw_results = await self._search(
                query=request.query,
                max_results=request.max_results,
                search_depth=request.search_depth,
                time_range=request.time_range,
                topic=request.topic,
                include_domains=request.include_domains,
                exclude_domains=request.exclude_domains,
                language=request.language,
                include_answer=request.include_answer,
                include_raw_content=request.include_raw_content,
            )
        except Exception as e:
            # Surface the failure as a structured result for the interface.
            return WebSearchActionResult(success=False, error=str(e))
        result_items = [
            WebSearchResultItem(title=result.title, url=result.url)
            for result in raw_results
        ]
        document_data = WebSearchDocumentData(
            query=request.query,
            results=result_items,
            total_count=len(result_items),
        )
        document = WebSearchActionDocument(
            documentName=f"web_search_results_{get_utc_timestamp()}.json",
            documentData=document_data,
            mimeType="application/json",
        )
        return WebSearchActionResult(
            success=True, documents=[document], resultLabel="web_search_results"
        )

    async def crawl(self, request) -> "WebCrawlActionResult":
        """Extract content from the request's URLs into a WebCrawlActionResult.

        Failures are returned as ``success=False`` results, never raised.
        """
        try:
            raw_results = await self._crawl(
                [str(u) for u in request.urls],
                extract_depth=request.extract_depth,
                format=request.format,
            )
        except Exception as e:
            return WebCrawlActionResult(success=False, error=str(e))
        result_items = [
            WebCrawlResultItem(url=result.url, content=result.content)
            for result in raw_results
        ]
        document_data = WebCrawlDocumentData(
            urls=[str(u) for u in request.urls],
            results=result_items,
            total_count=len(result_items),
        )
        document = WebCrawlActionDocument(
            documentName=f"web_crawl_results_{get_utc_timestamp()}.json",
            documentData=document_data,
            mimeType="application/json",
        )
        return WebCrawlActionResult(
            success=True, documents=[document], resultLabel="web_crawl_results"
        )

    async def scrape(self, request) -> "WebScrapeActionResult":
        """Search for the request's query, then extract content from each hit.

        Two-phase operation: a web search followed by a crawl of every result
        URL. Failures in either phase are returned as ``success=False``
        results, never raised.
        """
        try:
            search_results = await self._search(
                query=request.query,
                max_results=request.max_results,
                search_depth=request.search_depth,
                time_range=request.time_range,
                topic=request.topic,
                include_domains=request.include_domains,
                exclude_domains=request.exclude_domains,
                language=request.language,
                include_answer=request.include_answer,
                include_raw_content=request.include_raw_content,
            )
        except Exception as e:
            return WebScrapeActionResult(success=False, error=str(e))
        try:
            urls = [result.url for result in search_results]
            crawl_results = await self._crawl(
                urls,
                extract_depth=request.extract_depth,
                format=request.format,
            )
        except Exception as e:
            return WebScrapeActionResult(success=False, error=str(e))
        result_items = [
            WebScrapeResultItem(url=result.url, content=result.content)
            for result in crawl_results
        ]
        document_data = WebScrapeDocumentData(
            query=request.query,
            results=result_items,
            total_count=len(result_items),
        )
        document = WebScrapeActionDocument(
            documentName=f"web_scrape_results_{get_utc_timestamp()}.json",
            documentData=document_data,
            mimeType="application/json",
        )
        return WebScrapeActionResult(
            success=True, documents=[document], resultLabel="web_scrape_results"
        )

    async def _search_urls_raw(
        self,
        *,
        query: str,
        max_results: int,
        search_depth: str | None = None,
        time_range: str | None = None,
        topic: str | None = None,
        include_domains: list[str] | None = None,
        exclude_domains: list[str] | None = None,
        language: str | None = None,
        include_answer: bool | None = None,
        include_raw_content: bool | None = None,
    ) -> list["WebSearchResult"]:
        """Keyword-only passthrough to :meth:`_search` returning raw results."""
        return await self._search(
            query=query,
            max_results=max_results,
            search_depth=search_depth,
            time_range=time_range,
            topic=topic,
            include_domains=include_domains,
            exclude_domains=exclude_domains,
            language=language,
            include_answer=include_answer,
            include_raw_content=include_raw_content,
        )

    async def _crawl_urls_raw(
        self,
        *,
        urls: list[str],
        extract_depth: str | None = None,
        format: str | None = None,
    ) -> list["WebCrawlResult"]:
        """Keyword-only passthrough to :meth:`_crawl` returning raw results."""
        return await self._crawl(urls, extract_depth=extract_depth, format=format)

    async def _scrape_raw(
        self,
        *,
        query: str,
        max_results: int,
        search_depth: str | None = None,
        time_range: str | None = None,
        topic: str | None = None,
        include_domains: list[str] | None = None,
        exclude_domains: list[str] | None = None,
        language: str | None = None,
        include_answer: bool | None = None,
        include_raw_content: bool | None = None,
        extract_depth: str | None = None,
        format: str | None = None,
    ) -> list["WebCrawlResult"]:
        """Search then crawl every hit, returning raw crawl results."""
        search_results = await self._search(
            query=query,
            max_results=max_results,
            search_depth=search_depth,
            time_range=time_range,
            topic=topic,
            include_domains=include_domains,
            exclude_domains=exclude_domains,
            language=language,
            include_answer=include_answer,
            include_raw_content=include_raw_content,
        )
        urls = [result.url for result in search_results]
        return await self._crawl(urls, extract_depth=extract_depth, format=format)

    async def _search(
        self,
        query: str,
        max_results: int,
        search_depth: str | None = None,
        time_range: str | None = None,
        topic: str | None = None,
        include_domains: list[str] | None = None,
        exclude_domains: list[str] | None = None,
        language: str | None = None,
        include_answer: bool | None = None,
        include_raw_content: bool | None = None,
    ) -> list[WebSearchResult]:
        """Calls the Tavily API to perform a web search.

        Raises:
            ValueError: if ``max_results`` is outside the cached min/max range.
        """
        # Make sure max_results is within the allowed range (use cached values)
        min_results = self.webSearchMinResults
        max_allowed_results = self.webSearchMaxResults
        if max_results < min_results or max_results > max_allowed_results:
            raise ValueError(
                f"max_results must be between {min_results} and {max_allowed_results}"
            )
        # Perform actual API call.
        # Build kwargs only for provided options to avoid API rejections.
        kwargs: dict = {"query": query, "max_results": max_results}
        optional_params = {
            "search_depth": search_depth,
            "time_range": time_range,
            "topic": topic,
            "include_domains": include_domains,
            "exclude_domains": exclude_domains,
            "language": language,
            "include_answer": include_answer,
            "include_raw_content": include_raw_content,
        }
        kwargs.update({k: v for k, v in optional_params.items() if v is not None})
        response = await self.client.search(**kwargs)
        return [
            WebSearchResult(title=result["title"], url=result["url"])
            for result in response["results"]
        ]

    async def _crawl(
        self,
        urls: list,
        extract_depth: str | None = None,
        format: str | None = None,
    ) -> list[WebCrawlResult]:
        """Calls the Tavily API to extract text content from URLs with retry logic.

        Each attempt is bounded by the cached timeout; timeouts and other
        failures are retried up to ``crawl_max_retries`` times before an
        Exception (chained to the last cause) is raised.
        """
        max_retries = self.crawl_max_retries
        retry_delay = self.crawl_retry_delay
        timeout = self.crawl_timeout
        for attempt in range(max_retries + 1):
            try:
                # Build kwargs for extract; apply defaults when unset.
                kwargs_extract: dict = {
                    "urls": urls,
                    "extract_depth": extract_depth or "advanced",
                    "format": format or "text",
                }
                # Use asyncio.wait_for so a hung extract call cannot stall forever.
                response = await asyncio.wait_for(
                    self.client.extract(**kwargs_extract), timeout=timeout
                )
                return [
                    WebCrawlResult(url=result["url"], content=result["raw_content"])
                    for result in response["results"]
                ]
            except asyncio.TimeoutError as e:
                logger.warning(
                    f"Crawl attempt {attempt + 1} timed out after {timeout} seconds"
                )
                if attempt < max_retries:
                    logger.info(f"Retrying in {retry_delay} seconds...")
                    await asyncio.sleep(retry_delay)
                else:
                    # Chain the cause so the original traceback survives.
                    raise Exception(
                        f"Crawl failed after {max_retries + 1} attempts due to timeout"
                    ) from e
            except Exception as e:
                logger.warning(f"Crawl attempt {attempt + 1} failed: {str(e)}")
                if attempt < max_retries:
                    logger.info(f"Retrying in {retry_delay} seconds...")
                    await asyncio.sleep(retry_delay)
                else:
                    raise Exception(
                        f"Crawl failed after {max_retries + 1} attempts: {str(e)}"
                    ) from e