gateway/modules/aicore/aicorePluginTavily.py

"""Tavily web search class.
"""

import logging
import asyncio
import re
from dataclasses import dataclass
from typing import Optional, List, Dict
from tavily import AsyncTavilyClient
from modules.shared.configuration import APP_CONFIG
from modules.aicore.aicoreBase import BaseConnectorAi
from modules.datamodels.datamodelAi import AiModel, PriorityEnum, ProcessingModeEnum, OperationTypeEnum, AiModelCall, AiModelResponse, createOperationTypeRatings, AiCallPromptWebSearch, AiCallPromptWebCrawl
from modules.datamodels.datamodelTools import CountryCodes


logger = logging.getLogger(__name__)

@dataclass
class WebSearchResult:
    title: str
    url: str
    rawContent: Optional[str] = None

@dataclass
class WebCrawlResult:
    url: str
    content: str
    title: Optional[str] = None


class AiTavily(BaseConnectorAi):
    """Tavily web search connector."""

    def __init__(self):
        super().__init__()
        self.client: Optional[AsyncTavilyClient] = None
        # Cached settings loaded at initialization time
        self.crawlTimeout: int = 30
        self.crawlMaxRetries: int = 3
        self.crawlRetryDelay: int = 2
        # Cached web search constraints (camelCase per project style)
        self.webSearchMinResults: int = 1
        self.webSearchMaxResults: int = 20
        # Initialize client if API key is available
        self._initializeClient()


    def getModels(self) -> List[AiModel]:
        """Get all available Tavily models."""
        return [
            AiModel(
                name="tavily-search",
                displayName="Tavily Search & Research",
                connectorType="tavily",
                apiUrl="https://api.tavily.com",
                temperature=0.0,  # Web search doesn't use temperature
                maxTokens=0,  # Web search doesn't use tokens
                contextLength=0,
                costPer1kTokensInput=0.0,
                costPer1kTokensOutput=0.0,
                speedRating=8,  # Good speed for search and extract
                qualityRating=9,  # Excellent quality for web research
                # capabilities removed (not used in business logic)
                functionCall=self._routeWebOperation,
                priority=PriorityEnum.BALANCED,
                processingMode=ProcessingModeEnum.BASIC,
                operationTypes=createOperationTypeRatings(
                    (OperationTypeEnum.WEB_SEARCH, 9),
                    (OperationTypeEnum.WEB_CRAWL, 10)
                ),
                version="tavily-search",
                calculatePriceUsd=lambda processingTime, bytesSent, bytesReceived: 0.008  # Simple flat rate
            )
        ]

    def _initializeClient(self):
        """Initialize the Tavily client if API key is available."""
        try:
            apiKey = APP_CONFIG.get("Connector_AiTavily_API_SECRET")
            if apiKey:
                self.client = AsyncTavilyClient(api_key=apiKey)
                logger.info("Tavily client initialized successfully")
            else:
                logger.warning("Tavily API key not found, client not initialized")
        except Exception as e:
            logger.error(f"Failed to initialize Tavily client: {str(e)}")

    def getConnectorType(self) -> str:
        """Get the connector type identifier."""
        return "tavily"

    def _convertIsoCodeToCountryName(self, isoCode: str) -> str:
        """
        Convert ISO-2 country code to Tavily country name.
        Uses centralized CountryCodes mapping.
        """
        return CountryCodes.getForTavily(isoCode)

    def _extractUrlsFromPrompt(self, prompt: str) -> List[str]:
        """Extract URLs from a text prompt using regex."""
        if not prompt:
            return []

        # URL regex pattern - matches http/https URLs
        urlPattern = r'https?://(?:[-\w.])+(?:[:\d]+)?(?:/(?:[\w/_.])*(?:\?(?:[\w&=%.])*)?(?:#(?:[\w.])*)?)?'
        urls = re.findall(urlPattern, prompt)

        # Remove duplicates while preserving order
        seen = set()
        uniqueUrls = []
        for url in urls:
            if url not in seen:
                seen.add(url)
                uniqueUrls.append(url)

        return uniqueUrls

    def _normalizeUrl(self, url: str) -> str:
        """
        Normalize URL for better deduplication.
        Removes common variations that represent the same content.
        """
        if not url:
            return url

        # Remove trailing slashes
        url = url.rstrip('/')

        # Remove common query parameters that don't affect content
        import urllib.parse
        parsed = urllib.parse.urlparse(url)

        # Remove common tracking parameters
        queryParams = urllib.parse.parse_qs(parsed.query)
        filteredParams = {}

        for key, values in queryParams.items():
            # Keep important parameters, remove tracking ones
            if key.lower() not in ['utm_source', 'utm_medium', 'utm_campaign', 'utm_term', 'utm_content',
                                 'fbclid', 'gclid', 'ref', 'source', 'campaign']:
                filteredParams[key] = values

        # Rebuild query string
        filteredQuery = urllib.parse.urlencode(filteredParams, doseq=True)

        # Reconstruct URL
        normalized = urllib.parse.urlunparse((
            parsed.scheme,
            parsed.netloc,
            parsed.path,
            parsed.params,
            filteredQuery,
            parsed.fragment
        ))

        return normalized

    def _calculateRelevanceScore(self, result: WebSearchResult, queryWords: set) -> float:
        """
        Calculate relevance score for a search result.
        Higher score means more relevant to the query.
        """
        score = 0.0

        # Title relevance (most important)
        titleWords = set(result.title.lower().split())
        titleMatches = len(queryWords.intersection(titleWords))
        score += titleMatches * 3.0  # Weight title matches heavily

        # URL relevance
        urlWords = set(result.url.lower().split('/'))
        urlMatches = len(queryWords.intersection(urlWords))
        score += urlMatches * 1.5

        # Content relevance (if available)
        if hasattr(result, 'rawContent') and result.rawContent:
            contentWords = set(result.rawContent.lower().split())
            contentMatches = len(queryWords.intersection(contentWords))
            score += contentMatches * 0.1  # Lower weight for content matches

        # Domain authority bonus (simple heuristic)
        domain = result.url.split('/')[2] if '/' in result.url else result.url
        if any(authDomain in domain.lower() for authDomain in
               ['wikipedia.org', 'github.com', 'stackoverflow.com', 'reddit.com', 'medium.com']):
            score += 1.0

        # Penalty for very long URLs (often less relevant)
        if len(result.url) > 100:
            score -= 0.5

        return score

    def _intelligentUrlFiltering(self, searchResults: List[WebSearchResult], query: str, maxResults: int) -> List[WebSearchResult]:
        """
        Intelligent URL filtering with de-duplication and relevance scoring.

        Args:
            searchResults: Raw search results from Tavily
            query: Original search query for relevance scoring
            maxResults: Maximum number of results to return

        Returns:
            Filtered and deduplicated list of search results
        """
        if not searchResults:
            return []

        # Step 1: Basic de-duplication by URL
        seenUrls = set()
        uniqueResults = []

        for result in searchResults:
            # Normalize URL for better deduplication
            normalizedUrl = self._normalizeUrl(result.url)
            if normalizedUrl not in seenUrls:
                seenUrls.add(normalizedUrl)
                uniqueResults.append(result)

        logger.info(f"After basic deduplication: {len(uniqueResults)} unique URLs from {len(searchResults)} original")

        # Step 2: Relevance scoring and filtering
        scoredResults = []
        queryWords = set(query.lower().split())

        for result in uniqueResults:
            score = self._calculateRelevanceScore(result, queryWords)
            scoredResults.append((score, result))

        # Step 3: Sort by relevance score (higher is better)
        scoredResults.sort(key=lambda x: x[0], reverse=True)

        # Step 4: Take top results
        filteredResults = [result for score, result in scoredResults[:maxResults]]

        logger.info(f"After intelligent filtering: {len(filteredResults)} results selected from {len(uniqueResults)} unique")

        return filteredResults

    @classmethod
    async def create(cls):
        apiKey = APP_CONFIG.get("Connector_AiTavily_API_SECRET")
        if not apiKey:
            raise ValueError("Tavily API key not configured. Please set Connector_AiTavily_API_SECRET in config.ini")
        # Load and cache web crawl related configuration
        crawlTimeout = int(APP_CONFIG.get("Web_Crawl_TIMEOUT", "30"))
        crawlMaxRetries = int(APP_CONFIG.get("Web_Crawl_MAX_RETRIES", "3"))
        crawlRetryDelay = int(APP_CONFIG.get("Web_Crawl_RETRY_DELAY", "2"))
        return cls(
            client=AsyncTavilyClient(api_key=apiKey),
            crawlTimeout=crawlTimeout,
            crawlMaxRetries=crawlMaxRetries,
            crawlRetryDelay=crawlRetryDelay,
            webSearchMinResults=int(APP_CONFIG.get("Web_Search_MIN_RESULTS", "1")),
            webSearchMaxResults=int(APP_CONFIG.get("Web_Search_MAX_RESULTS", "20")),
        )

    # Standardized method using AiModelCall/AiModelResponse pattern


    def _cleanUrl(self, url: str) -> str:
        """Clean URL by removing extra text that might be appended."""
        import re
        # Extract just the URL part, removing any extra text after it
        urlMatch = re.match(r'(https?://[^\s,]+)', url)
        if urlMatch:
            return urlMatch.group(1)
        return url

    async def _search(
        self,
        query: str,
        maxResults: int,
        searchDepth: str | None = None,
        timeRange: str | None = None,
        topic: str | None = None,
        includeDomains: list[str] | None = None,
        excludeDomains: list[str] | None = None,
        country: str | None = None,
        includeAnswer: str | None = None,
        includeRawContent: str | None = None,
    ) -> list[WebSearchResult]:
        """Calls the Tavily API to perform a web search."""
        # Make sure maxResults is within the allowed range (use cached values)
        minResults = self.webSearchMinResults
        maxAllowedResults = self.webSearchMaxResults
        if maxResults < minResults or maxResults > maxAllowedResults:
            raise ValueError(f"maxResults must be between {minResults} and {maxAllowedResults}")

        # Perform actual API call
        # Build kwargs only for provided options to avoid API rejections
        kwargs: dict = {"query": query, "max_results": maxResults}
        if searchDepth is not None:
            kwargs["search_depth"] = searchDepth
        if timeRange is not None:
            kwargs["time_range"] = timeRange
        if topic is not None:
            kwargs["topic"] = topic
        if includeDomains is not None and len(includeDomains) > 0:
            kwargs["include_domains"] = includeDomains
        if excludeDomains is not None:
            kwargs["exclude_domains"] = excludeDomains
        if country is not None:
            kwargs["country"] = country
        if includeAnswer is not None:
            kwargs["include_answer"] = includeAnswer
        if includeRawContent is not None:
            kwargs["include_raw_content"] = includeRawContent

        # Log the final API call parameters for comparison
        logger.info(f"Tavily API call parameters: {kwargs}")

        # Ensure client is initialized
        if self.client is None:
            self._initializeClient()
            if self.client is None:
                raise ValueError("Tavily client not initialized. Please check API key configuration.")

        response = await self.client.search(**kwargs)

        # Return all results without score filtering
        # Tavily's scoring is already applied by the API
        logger.info(f"Tavily returned {len(response.get('results', []))} results")

        return [
            WebSearchResult(
                title=result["title"],
                url=self._cleanUrl(result["url"]),
                rawContent=result.get("raw_content")
            )
            for result in response["results"]
        ]

    async def _crawl(
        self,
        url: str,
        instructions: str | None = None,
        limit: int = 20,
        maxDepth: int = 2,
        maxBreadth: int = 40,
    ) -> list[WebCrawlResult]:
        """Calls the Tavily API to crawl ONE URL with link following and retry logic."""
        maxRetries = self.crawlMaxRetries
        retryDelay = self.crawlRetryDelay
        timeout = self.crawlTimeout

        logger.debug(f"Starting crawl of URL: {url}")
        logger.debug(f"Crawl settings: instructions={instructions}, limit={limit}, maxDepth={maxDepth}, maxBreadth={maxBreadth}, timeout={timeout}s")

        for attempt in range(maxRetries + 1):
            try:
                logger.debug(f"Crawl attempt {attempt + 1}/{maxRetries + 1}")

                # Ensure client is initialized
                if self.client is None:
                    self._initializeClient()
                    if self.client is None:
                        raise ValueError("Tavily client not initialized. Please check API key configuration.")

                logger.debug(f"Crawling URL: {url}")

                # Build kwargs for crawl
                kwargsCrawl: dict = {"url": url}
                if instructions:
                    kwargsCrawl["instructions"] = instructions
                if limit:
                    kwargsCrawl["limit"] = limit
                if maxDepth:
                    kwargsCrawl["max_depth"] = maxDepth
                if maxBreadth:
                    kwargsCrawl["max_breadth"] = maxBreadth

                logger.debug(f"Sending request to Tavily with kwargs: {kwargsCrawl}")

                response = await asyncio.wait_for(
                    self.client.crawl(**kwargsCrawl),
                    timeout=timeout
                )

                logger.debug(f"Tavily response received: {type(response)}")

                # Parse response - could be dict with results or list
                if isinstance(response, dict) and "results" in response:
                    pageResults = response["results"]
                elif isinstance(response, list):
                    pageResults = response
                else:
                    logger.warning(f"Unexpected response format: {type(response)}")
                    pageResults = []

                logger.debug(f"Got {len(pageResults)} pages from crawl")

                # Convert to WebCrawlResult format
                results = []
                for result in pageResults:
                    results.append(WebCrawlResult(
                        url=result.get("url", url),
                        content=result.get("raw_content", result.get("content", "")),
                        title=result.get("title", "")
                    ))

                logger.debug(f"Crawl successful: extracted {len(results)} pages from URL")
                return results

            except asyncio.TimeoutError:
                logger.warning(f"Crawl attempt {attempt + 1} timed out after {timeout} seconds for URL: {url}")
                if attempt < maxRetries:
                    logger.info(f"Retrying in {retryDelay} seconds...")
                    await asyncio.sleep(retryDelay)
                else:
                    raise Exception(f"Crawl failed after {maxRetries + 1} attempts due to timeout")

            except Exception as e:
                logger.warning(f"Crawl attempt {attempt + 1} failed for URL {url}: {str(e)}")
                logger.debug(f"Full error details: {type(e).__name__}: {str(e)}")

                # Check if it's a validation error and log more details
                if "validation" in str(e).lower():
                    logger.debug(f"URL validation failed. Checking URL format:")
                    logger.debug(f"  URL: '{url}' (length: {len(url)})")
                    # Check for common URL issues
                    if ' ' in url:
                        logger.debug(f"    WARNING: URL contains spaces!")
                    if not url.startswith(('http://', 'https://')):
                        logger.debug(f"    WARNING: URL doesn't start with http/https!")
                    if len(url) > 2000:
                        logger.debug(f"    WARNING: URL is very long ({len(url)} chars)")

                if attempt < maxRetries:
                    logger.info(f"Retrying in {retryDelay} seconds...")
                    await asyncio.sleep(retryDelay)
                else:
                    raise Exception(f"Crawl failed after {maxRetries + 1} attempts: {str(e)}")

    async def _routeWebOperation(self, modelCall: AiModelCall) -> "AiModelResponse":
        """
        Route web operation based on operation type.

        Args:
            modelCall: AiModelCall with messages and options

        Returns:
            AiModelResponse based on operation type
        """
        operationType = modelCall.options.operationType

        if operationType == OperationTypeEnum.WEB_SEARCH:
            return await self.webSearch(modelCall)
        elif operationType == OperationTypeEnum.WEB_CRAWL:
            return await self.webCrawl(modelCall)
        else:
            # Unsupported operation type
            return AiModelResponse(
                content="",
                success=False,
                error=f"Unsupported operation type: {operationType}"
            )

    async def webSearch(self, modelCall: AiModelCall) -> "AiModelResponse":
        """
        WEB_SEARCH operation - returns list of URLs using Tavily search.

        Args:
            modelCall: AiModelCall with AiCallPromptWebSearch as prompt

        Returns:
            AiModelResponse with JSON list of URLs
        """
        try:
            # Extract parameters - find user message (not system message)
            promptContent = ""
            if modelCall.messages:
                for msg in modelCall.messages:
                    if msg.get("role") == "user":
                        promptContent = msg.get("content", "")
                        break
                # Fallback to first message if no user message found
                if not promptContent and len(modelCall.messages) > 0:
                    promptContent = modelCall.messages[0].get("content", "")

            if not promptContent or not promptContent.strip():
                raise ValueError("Empty prompt content received for web search")

            import json
            try:
                promptData = json.loads(promptContent)
            except json.JSONDecodeError as e:
                logger.error(f"Failed to parse prompt content as JSON: {promptContent[:200]}")
                raise ValueError(f"Invalid JSON in prompt content: {str(e)}")

            # Create Pydantic model
            webSearchPrompt = AiCallPromptWebSearch(**promptData)

            # Convert ISO country code to country name for Tavily
            countryName = webSearchPrompt.country
            if countryName:
                countryName = self._convertIsoCodeToCountryName(countryName)

            # Perform search - use exact parameters from prompt
            # NOTE: timeRange parameter causes generic results, so we don't use it
            searchResults = await self._search(
                query=webSearchPrompt.instruction,
                maxResults=webSearchPrompt.maxNumberPages,
                timeRange=None,  # Not used - causes generic results
                country=countryName,
                includeAnswer="basic",
                includeRawContent="text"
            )

            # Extract URLs from results
            urls = [result.url for result in searchResults]

            # Return as JSON array
            import json
            return AiModelResponse(
                content=json.dumps(urls, indent=2),
                success=True,
                metadata={"total_urls": len(urls), "operation": "WEB_SEARCH"}
            )

        except Exception as e:
            logger.error(f"Error in Tavily web search: {str(e)}")
            return AiModelResponse(
                content="[]",
                success=False,
                error=str(e)
            )

    async def webCrawl(self, modelCall: AiModelCall) -> "AiModelResponse":
        """
        WEB_CRAWL operation - crawls one URL using Tavily with link following.

        Args:
            modelCall: AiModelCall with AiCallPromptWebCrawl as prompt

        Returns:
            AiModelResponse with crawl results as JSON (may include multiple pages)
        """
        try:
            # Extract parameters - find user message (not system message)
            promptContent = ""
            if modelCall.messages:
                for msg in modelCall.messages:
                    if msg.get("role") == "user":
                        promptContent = msg.get("content", "")
                        break
                # Fallback to first message if no user message found
                if not promptContent and len(modelCall.messages) > 0:
                    promptContent = modelCall.messages[0].get("content", "")

            if not promptContent or not promptContent.strip():
                raise ValueError("Empty prompt content received for web crawl")

            import json
            try:
                promptData = json.loads(promptContent)
            except json.JSONDecodeError as e:
                logger.error(f"Failed to parse prompt content as JSON: {promptContent[:200]}")
                raise ValueError(f"Invalid JSON in prompt content: {str(e)}")

            # Create Pydantic model
            webCrawlPrompt = AiCallPromptWebCrawl(**promptData)

            # Perform crawl for ONE URL with link following
            # Use maxWidth as limit, maxDepth as maxDepth, and calculate maxBreadth
            crawlResults = await self._crawl(
                url=webCrawlPrompt.url,
                instructions=webCrawlPrompt.instruction,
                limit=webCrawlPrompt.maxWidth or 20,  # maxWidth controls number of pages
                maxDepth=webCrawlPrompt.maxDepth or 2,
                maxBreadth=webCrawlPrompt.maxWidth or 40  # Use same as limit for breadth
            )

            # If we got multiple pages from the crawl, we need to format them differently
            # Return the first result for backwards compatibility, but include total page count
            if crawlResults and len(crawlResults) > 0:
                # Get all pages content
                allContent = ""
                for i, result in enumerate(crawlResults, 1):
                    pageHeader = f"\n{'='*60}\nPAGE {i}: {result.url}\n{'='*60}\n"
                    if result.title:
                        allContent += f"{pageHeader}Title: {result.title}\n\n"
                    allContent += f"{result.content}\n"

                resultData = {
                    "url": webCrawlPrompt.url,
                    "title": crawlResults[0].title if crawlResults[0].title else "Content",
                    "content": allContent,
                    "pagesCrawled": len(crawlResults),
                    "pageUrls": [result.url for result in crawlResults]
                }
            else:
                resultData = {"url": webCrawlPrompt.url, "title": "", "content": "", "error": "No content extracted", "pagesCrawled": 0}

            # Return as JSON - same format as Perplexity but with multiple pages content
            import json
            return AiModelResponse(
                content=json.dumps(resultData, indent=2),
                success=True,
                metadata={"operation": "WEB_CRAWL", "url": webCrawlPrompt.url, "pagesCrawled": len(crawlResults) if crawlResults else 0}
            )

        except Exception as e:
            logger.error(f"Error in Tavily web crawl: {str(e)}")
            import json
            errorResult = {"error": str(e), "url": webCrawlPrompt.url if 'webCrawlPrompt' in locals() else ""}
            return AiModelResponse(
                content=json.dumps(errorResult, indent=2),
                success=False,
                error=str(e)
            )