# gateway/modules/connectors/connectorAiTavily.py

"""Tavily web connector: search, URL content extraction (crawl), and search-then-crawl (scrape)."""

import logging
import asyncio
from dataclasses import dataclass
from tavily import AsyncTavilyClient
from modules.shared.configuration import APP_CONFIG
from modules.shared.timezoneUtils import get_utc_timestamp
from modules.datamodels.datamodelWeb import (
    WebSearchActionResult,
    WebSearchActionDocument,
    WebSearchDocumentData,
    WebSearchResultItem,
    WebCrawlActionResult,
    WebCrawlActionDocument,
    WebCrawlDocumentData,
    WebCrawlResultItem,
    WebScrapeActionResult,
    WebScrapeActionDocument,
    WebSearchDocumentData as WebScrapeDocumentData,
    WebScrapeResultItem,
)


logger = logging.getLogger(__name__)

@dataclass
class WebSearchResult:
    """One search hit as built by ``ConnectorWeb._search``: a title and its URL."""
    title: str
    url: str

@dataclass
class WebCrawlResult:
    """Extracted content for one URL as built by ``ConnectorWeb._crawl``."""
    url: str
    # Filled from the ``raw_content`` field of each Tavily extract result.
    content: str

@dataclass
class ConnectorWeb:
    """Connector around ``AsyncTavilyClient`` offering search, crawl and scrape.

    Instances are normally built through the async ``create()`` factory, which
    reads the API key and the limits below from ``APP_CONFIG``.
    """
    # Tavily client used for every API call; ``create()`` always sets it.
    client: AsyncTavilyClient = None
    # Cached settings loaded at initialization time
    # Per-attempt timeout handed to ``asyncio.wait_for`` in ``_crawl`` (seconds).
    crawl_timeout: int = 30
    # Number of additional attempts ``_crawl`` makes after the first one fails.
    crawl_max_retries: int = 3
    # Pause handed to ``asyncio.sleep`` between crawl attempts (seconds).
    crawl_retry_delay: int = 2
    # Cached web search constraints (camelCase per project style)
    # Inclusive bounds that ``_search`` enforces on its ``max_results`` argument.
    webSearchMinResults: int = 1
    webSearchMaxResults: int = 20

    @classmethod
    async def create(cls):
        """Build a connector from the values stored in ``APP_CONFIG``.

        Reads the Tavily API key plus the crawl and search limits; every limit
        falls back to its built-in default when the lookup supplies none.

        Raises:
            ValueError: if no API key is configured. The ``int()`` conversion
                of a configured limit may also raise for non-numeric values.
        """
        secret = APP_CONFIG.get("Connector_WebTavily_API_KEY_SECRET")
        if not secret:
            raise ValueError("Tavily API key not configured. Please set Connector_WebTavily_API_KEY_SECRET in config.ini")

        def read_int(key: str, fallback: str) -> int:
            # Defaults are given as strings; whatever comes back goes through int().
            return int(APP_CONFIG.get(key, fallback))

        # Crawl-related settings are read first, then the client is created,
        # then the search limits -- the same order as the lookups always had.
        settings = {
            "crawl_timeout": read_int("Web_Crawl_TIMEOUT", "30"),
            "crawl_max_retries": read_int("Web_Crawl_MAX_RETRIES", "3"),
            "crawl_retry_delay": read_int("Web_Crawl_RETRY_DELAY", "2"),
        }
        tavily = AsyncTavilyClient(api_key=secret)
        settings["webSearchMinResults"] = read_int("Web_Search_MIN_RESULTS", "1")
        settings["webSearchMaxResults"] = read_int("Web_Search_MAX_RESULTS", "20")

        return cls(client=tavily, **settings)

    # Standardized methods returning ActionResults for the interface to consume
    async def search(self, request) -> "WebSearchActionResult":
        """Run a web search for ``request`` and package the hits as one JSON document.

        Any exception raised while reading the request options or while calling
        ``_search`` is converted into a failed result carrying ``str(exception)``.
        On success the result holds a single document whose data lists the
        title/URL of every hit together with the query and the hit count.
        """
        option_names = (
            "query",
            "max_results",
            "search_depth",
            "time_range",
            "topic",
            "include_domains",
            "exclude_domains",
            "language",
            "include_answer",
            "include_raw_content",
        )
        try:
            # Attribute access stays inside the try so a malformed request is
            # reported as a failed result rather than raised.
            hits = await self._search(
                **{name: getattr(request, name) for name in option_names}
            )
        except Exception as exc:
            return WebSearchActionResult(success=False, error=str(exc))

        items = []
        for hit in hits:
            items.append(WebSearchResultItem(title=hit.title, url=hit.url))

        payload = WebSearchDocumentData(
            query=request.query,
            results=items,
            total_count=len(items),
        )

        return WebSearchActionResult(
            success=True,
            documents=[
                WebSearchActionDocument(
                    documentName=f"web_search_results_{get_utc_timestamp()}.json",
                    documentData=payload,
                    mimeType="application/json",
                )
            ],
            resultLabel="web_search_results",
        )

    async def crawl(self, request) -> "WebCrawlActionResult":
        """Extract content for ``request.urls`` and package it as one JSON document.

        Each URL is converted with ``str()`` before it is handed to ``_crawl``.
        Any exception from reading the request or from ``_crawl`` becomes a
        failed result carrying ``str(exception)``.
        """
        try:
            pages = await self._crawl(
                list(map(str, request.urls)),
                extract_depth=request.extract_depth,
                format=request.format,
            )
        except Exception as exc:
            return WebCrawlActionResult(success=False, error=str(exc))

        items = []
        for page in pages:
            items.append(WebCrawlResultItem(url=page.url, content=page.content))

        payload = WebCrawlDocumentData(
            urls=list(map(str, request.urls)),
            results=items,
            total_count=len(items),
        )

        return WebCrawlActionResult(
            success=True,
            documents=[
                WebCrawlActionDocument(
                    documentName=f"web_crawl_results_{get_utc_timestamp()}.json",
                    documentData=payload,
                    mimeType="application/json",
                )
            ],
            resultLabel="web_crawl_results",
        )

    async def scrape(self, request) -> "WebScrapeActionResult":
        """Search for ``request.query``, then extract content from every hit's URL.

        The search step and the extraction step share one error boundary: an
        exception from either one yields a failed result carrying
        ``str(exception)``. On success the result holds a single JSON document
        with the extracted url/content pairs, the query and the item count.
        """
        option_names = (
            "query",
            "max_results",
            "search_depth",
            "time_range",
            "topic",
            "include_domains",
            "exclude_domains",
            "language",
            "include_answer",
            "include_raw_content",
        )
        try:
            hits = await self._search(
                **{name: getattr(request, name) for name in option_names}
            )
            pages = await self._crawl(
                [hit.url for hit in hits],
                extract_depth=request.extract_depth,
                format=request.format,
            )
        except Exception as exc:
            return WebScrapeActionResult(success=False, error=str(exc))

        items = []
        for page in pages:
            items.append(WebScrapeResultItem(url=page.url, content=page.content))

        payload = WebScrapeDocumentData(
            query=request.query,
            results=items,
            total_count=len(items),
        )

        return WebScrapeActionResult(
            success=True,
            documents=[
                WebScrapeActionDocument(
                    documentName=f"web_scrape_results_{get_utc_timestamp()}.json",
                    documentData=payload,
                    mimeType="application/json",
                )
            ],
            resultLabel="web_scrape_results",
        )

    async def _search_urls_raw(self,
        *,
        query: str,
        max_results: int,
        search_depth: str | None = None,
        time_range: str | None = None,
        topic: str | None = None,
        include_domains: list[str] | None = None,
        exclude_domains: list[str] | None = None,
        language: str | None = None,
        include_answer: bool | None = None,
        include_raw_content: bool | None = None,
    ) -> list["WebSearchResult"]:
        return await self._search(
            query=query,
            max_results=max_results,
            search_depth=search_depth,
            time_range=time_range,
            topic=topic,
            include_domains=include_domains,
            exclude_domains=exclude_domains,
            language=language,
            include_answer=include_answer,
            include_raw_content=include_raw_content,
        )

    async def _crawl_urls_raw(self,
        *,
        urls: list[str],
        extract_depth: str | None = None,
        format: str | None = None,
    ) -> list["WebCrawlResult"]:
        return await self._crawl(urls, extract_depth=extract_depth, format=format)

    async def _scrape_raw(self,
        *,
        query: str,
        max_results: int,
        search_depth: str | None = None,
        time_range: str | None = None,
        topic: str | None = None,
        include_domains: list[str] | None = None,
        exclude_domains: list[str] | None = None,
        language: str | None = None,
        include_answer: bool | None = None,
        include_raw_content: bool | None = None,
        extract_depth: str | None = None,
        format: str | None = None,
    ) -> list["WebCrawlResult"]:
        search_results = await self._search(
            query=query,
            max_results=max_results,
            search_depth=search_depth,
            time_range=time_range,
            topic=topic,
            include_domains=include_domains,
            exclude_domains=exclude_domains,
            language=language,
            include_answer=include_answer,
            include_raw_content=include_raw_content,
        )
        urls = [result.url for result in search_results]
        return await self._crawl(urls, extract_depth=extract_depth, format=format)

    async def _search(
        self,
        query: str,
        max_results: int,
        search_depth: str | None = None,
        time_range: str | None = None,
        topic: str | None = None,
        include_domains: list[str] | None = None,
        exclude_domains: list[str] | None = None,
        language: str | None = None,
        include_answer: bool | None = None,
        include_raw_content: bool | None = None,
    ) -> list[WebSearchResult]:
        """Calls the Tavily API to perform a web search."""
        # Make sure max_results is within the allowed range (use cached values)
        min_results = self.webSearchMinResults
        max_allowed_results = self.webSearchMaxResults
        if max_results < min_results or max_results > max_allowed_results:
            raise ValueError(f"max_results must be between {min_results} and {max_allowed_results}")

        # Perform actual API call
        # Build kwargs only for provided options to avoid API rejections
        kwargs: dict = {"query": query, "max_results": max_results}
        if search_depth is not None:
            kwargs["search_depth"] = search_depth
        if time_range is not None:
            kwargs["time_range"] = time_range
        if topic is not None:
            kwargs["topic"] = topic
        if include_domains is not None:
            kwargs["include_domains"] = include_domains
        if exclude_domains is not None:
            kwargs["exclude_domains"] = exclude_domains
        if language is not None:
            kwargs["language"] = language
        if include_answer is not None:
            kwargs["include_answer"] = include_answer
        if include_raw_content is not None:
            kwargs["include_raw_content"] = include_raw_content

        response = await self.client.search(**kwargs)

        return [
            WebSearchResult(title=result["title"], url=result["url"])
            for result in response["results"]
        ]

    async def _crawl(
        self,
        urls: list,
        extract_depth: str | None = None,
        format: str | None = None,
    ) -> list[WebCrawlResult]:
        """Calls the Tavily API to extract text content from URLs with retry logic."""
        max_retries = self.crawl_max_retries
        retry_delay = self.crawl_retry_delay
        timeout = self.crawl_timeout

        for attempt in range(max_retries + 1):
            try:
                # Use asyncio.wait_for for timeout
                # Build kwargs for extract
                kwargs_extract: dict = {"urls": urls}
                kwargs_extract["extract_depth"] = extract_depth or "advanced"
                kwargs_extract["format"] = format or "text"

                response = await asyncio.wait_for(
                    self.client.extract(**kwargs_extract),
                    timeout=timeout
                )

                return [
                    WebCrawlResult(url=result["url"], content=result["raw_content"])
                    for result in response["results"]
                ]

            except asyncio.TimeoutError:
                logger.warning(f"Crawl attempt {attempt + 1} timed out after {timeout} seconds")
                if attempt < max_retries:
                    logger.info(f"Retrying in {retry_delay} seconds...")
                    await asyncio.sleep(retry_delay)
                else:
                    raise Exception(f"Crawl failed after {max_retries + 1} attempts due to timeout")

            except Exception as e:
                logger.warning(f"Crawl attempt {attempt + 1} failed: {str(e)}")
                if attempt < max_retries:
                    logger.info(f"Retrying in {retry_delay} seconds...")
                    await asyncio.sleep(retry_delay)
                else:
                    raise Exception(f"Crawl failed after {max_retries + 1} attempts: {str(e)}")