# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
"""Tavily web search class."""
import logging
import asyncio
import json
import re
import urllib.parse
from dataclasses import dataclass
from typing import Optional, List, Dict

from tavily import AsyncTavilyClient

from modules.shared.configuration import APP_CONFIG
from modules.aicore.aicoreBase import BaseConnectorAi
from modules.datamodels.datamodelAi import AiModel, PriorityEnum, ProcessingModeEnum, OperationTypeEnum, AiModelCall, AiModelResponse, createOperationTypeRatings, AiCallPromptWebSearch, AiCallPromptWebCrawl
from modules.datamodels.datamodelTools import CountryCodes

logger = logging.getLogger(__name__)


@dataclass
class WebSearchResult:
    # One search hit as returned by the Tavily search API.
    title: str
    url: str
    rawContent: Optional[str] = None  # populated only when include_raw_content is requested


@dataclass
class WebCrawlResult:
    # One crawled page as returned by the Tavily crawl API.
    url: str
    content: str
    title: Optional[str] = None


class AiTavily(BaseConnectorAi):
    """Tavily web search connector."""

    def __init__(
        self,
        client: Optional[AsyncTavilyClient] = None,
        crawlTimeout: int = 30,
        crawlMaxRetries: int = 3,
        crawlRetryDelay: int = 2,
        webSearchMinResults: int = 1,
        webSearchMaxResults: int = 20,
    ):
        """
        Initialize the connector.

        All parameters are optional so that plain ``AiTavily()`` keeps its
        previous behavior (settings fall back to the same defaults and the
        client is lazily built from APP_CONFIG). ``create()`` passes explicit
        values read from configuration.

        Args:
            client: Pre-built Tavily client; when None, one is initialized
                from the configured API key.
            crawlTimeout: Per-attempt crawl timeout in seconds.
            crawlMaxRetries: Number of crawl retries after the first attempt.
            crawlRetryDelay: Delay in seconds between crawl retries.
            webSearchMinResults: Lower bound accepted for maxResults in _search.
            webSearchMaxResults: Upper bound accepted for maxResults in _search.
        """
        super().__init__()
        self.client: Optional[AsyncTavilyClient] = client
        # Cached settings loaded at initialization time
        self.crawlTimeout: int = crawlTimeout
        self.crawlMaxRetries: int = crawlMaxRetries
        self.crawlRetryDelay: int = crawlRetryDelay
        # Cached web search constraints (camelCase per project style)
        self.webSearchMinResults: int = webSearchMinResults
        self.webSearchMaxResults: int = webSearchMaxResults
        # Initialize client if API key is available (skip when one was injected)
        if self.client is None:
            self._initializeClient()

    def getModels(self) -> List[AiModel]:
        """Get all available Tavily models."""
        return [
            AiModel(
                name="tavily-search",
                displayName="Tavily Search & Research",
                connectorType="tavily",
                apiUrl="https://api.tavily.com",
                temperature=0.0,  # Web search doesn't use temperature
                maxTokens=0,  # Web search doesn't use tokens
                contextLength=0,
                costPer1kTokensInput=0.0,
                costPer1kTokensOutput=0.0,
                speedRating=8,  # Good speed for search and extract
                qualityRating=9,  # Excellent quality for web research
                # capabilities removed (not used in business logic)
                functionCall=self._routeWebOperation,
                priority=PriorityEnum.BALANCED,
                processingMode=ProcessingModeEnum.BASIC,
                operationTypes=createOperationTypeRatings(
                    (OperationTypeEnum.WEB_SEARCH, 9),
                    (OperationTypeEnum.WEB_CRAWL, 10)
                ),
                version="tavily-search",
                calculatePriceUsd=lambda processingTime, bytesSent, bytesReceived: 0.008  # Simple flat rate
            )
        ]

    def _initializeClient(self):
        """Initialize the Tavily client if API key is available."""
        try:
            apiKey = APP_CONFIG.get("Connector_AiTavily_API_SECRET")
            if apiKey:
                self.client = AsyncTavilyClient(api_key=apiKey)
                logger.info("Tavily client initialized successfully")
            else:
                logger.warning("Tavily API key not found, client not initialized")
        except Exception as e:
            logger.error(f"Failed to initialize Tavily client: {str(e)}")

    def getConnectorType(self) -> str:
        """Get the connector type identifier."""
        return "tavily"

    def _convertIsoCodeToCountryName(self, isoCode: str) -> str:
        """
        Convert ISO-2 country code to Tavily country name.
        Uses centralized CountryCodes mapping.
        """
        return CountryCodes.getForTavily(isoCode)

    def _extractUrlsFromPrompt(self, prompt: str) -> List[str]:
        """Extract URLs from a text prompt using regex."""
        if not prompt:
            return []
        # URL regex pattern - matches http/https URLs
        urlPattern = r'https?://(?:[-\w.])+(?:[:\d]+)?(?:/(?:[\w/_.])*(?:\?(?:[\w&=%.])*)?(?:#(?:[\w.])*)?)?'
        urls = re.findall(urlPattern, prompt)
        # Remove duplicates while preserving order
        seen = set()
        uniqueUrls = []
        for url in urls:
            if url not in seen:
                seen.add(url)
                uniqueUrls.append(url)
        return uniqueUrls

    def _normalizeUrl(self, url: str) -> str:
        """
        Normalize URL for better deduplication.
        Removes common variations that represent the same content.
        """
        if not url:
            return url
        # Remove trailing slashes
        url = url.rstrip('/')
        # Remove common query parameters that don't affect content
        parsed = urllib.parse.urlparse(url)
        # Remove common tracking parameters
        queryParams = urllib.parse.parse_qs(parsed.query)
        filteredParams = {}
        for key, values in queryParams.items():
            # Keep important parameters, remove tracking ones
            if key.lower() not in ['utm_source', 'utm_medium', 'utm_campaign', 'utm_term', 'utm_content', 'fbclid', 'gclid', 'ref', 'source', 'campaign']:
                filteredParams[key] = values
        # Rebuild query string
        filteredQuery = urllib.parse.urlencode(filteredParams, doseq=True)
        # Reconstruct URL
        normalized = urllib.parse.urlunparse((
            parsed.scheme,
            parsed.netloc,
            parsed.path,
            parsed.params,
            filteredQuery,
            parsed.fragment
        ))
        return normalized

    def _calculateRelevanceScore(self, result: WebSearchResult, queryWords: set) -> float:
        """
        Calculate relevance score for a search result.
        Higher score means more relevant to the query.
        """
        score = 0.0
        # Title relevance (most important)
        titleWords = set(result.title.lower().split())
        titleMatches = len(queryWords.intersection(titleWords))
        score += titleMatches * 3.0  # Weight title matches heavily
        # URL relevance
        urlWords = set(result.url.lower().split('/'))
        urlMatches = len(queryWords.intersection(urlWords))
        score += urlMatches * 1.5
        # Content relevance (if available)
        if result.rawContent:
            contentWords = set(result.rawContent.lower().split())
            contentMatches = len(queryWords.intersection(contentWords))
            score += contentMatches * 0.1  # Lower weight for content matches
        # Domain authority bonus (simple heuristic)
        # Guard against scheme-less URLs where split('/') has fewer than 3 parts
        urlParts = result.url.split('/')
        domain = urlParts[2] if len(urlParts) > 2 else result.url
        if any(authDomain in domain.lower() for authDomain in ['wikipedia.org', 'github.com', 'stackoverflow.com', 'reddit.com', 'medium.com']):
            score += 1.0
        # Penalty for very long URLs (often less relevant)
        if len(result.url) > 100:
            score -= 0.5
        return score

    def _intelligentUrlFiltering(self, searchResults: List[WebSearchResult], query: str, maxResults: int) -> List[WebSearchResult]:
        """
        Intelligent URL filtering with de-duplication and relevance scoring.

        Args:
            searchResults: Raw search results from Tavily
            query: Original search query for relevance scoring
            maxResults: Maximum number of results to return

        Returns:
            Filtered and deduplicated list of search results
        """
        if not searchResults:
            return []
        # Step 1: Basic de-duplication by URL
        seenUrls = set()
        uniqueResults = []
        for result in searchResults:
            # Normalize URL for better deduplication
            normalizedUrl = self._normalizeUrl(result.url)
            if normalizedUrl not in seenUrls:
                seenUrls.add(normalizedUrl)
                uniqueResults.append(result)
        logger.info(f"After basic deduplication: {len(uniqueResults)} unique URLs from {len(searchResults)} original")
        # Step 2: Relevance scoring and filtering
        scoredResults = []
        queryWords = set(query.lower().split())
        for result in uniqueResults:
            score = self._calculateRelevanceScore(result, queryWords)
            scoredResults.append((score, result))
        # Step 3: Sort by relevance score (higher is better)
        scoredResults.sort(key=lambda x: x[0], reverse=True)
        # Step 4: Take top results
        filteredResults = [result for score, result in scoredResults[:maxResults]]
        logger.info(f"After intelligent filtering: {len(filteredResults)} results selected from {len(uniqueResults)} unique")
        return filteredResults

    @classmethod
    async def create(cls):
        """
        Async factory: build a fully configured connector from APP_CONFIG.

        Raises:
            ValueError: If the Tavily API key is not configured.
        """
        apiKey = APP_CONFIG.get("Connector_AiTavily_API_SECRET")
        if not apiKey:
            raise ValueError("Tavily API key not configured. Please set Connector_AiTavily_API_SECRET in config.ini")
        # Load and cache web crawl related configuration
        crawlTimeout = int(APP_CONFIG.get("Web_Crawl_TIMEOUT", "30"))
        crawlMaxRetries = int(APP_CONFIG.get("Web_Crawl_MAX_RETRIES", "3"))
        crawlRetryDelay = int(APP_CONFIG.get("Web_Crawl_RETRY_DELAY", "2"))
        # NOTE: __init__ accepts these keyword arguments (previously it took
        # none, which made this call raise TypeError).
        return cls(
            client=AsyncTavilyClient(api_key=apiKey),
            crawlTimeout=crawlTimeout,
            crawlMaxRetries=crawlMaxRetries,
            crawlRetryDelay=crawlRetryDelay,
            webSearchMinResults=int(APP_CONFIG.get("Web_Search_MIN_RESULTS", "1")),
            webSearchMaxResults=int(APP_CONFIG.get("Web_Search_MAX_RESULTS", "20")),
        )

    # Standardized method using AiModelCall/AiModelResponse pattern
    def _cleanUrl(self, url: str) -> str:
        """Clean URL by removing extra text that might be appended."""
        # Extract just the URL part, removing any extra text after it
        urlMatch = re.match(r'(https?://[^\s,]+)', url)
        if urlMatch:
            return urlMatch.group(1)
        return url

    async def _search(
        self,
        query: str,
        maxResults: int,
        searchDepth: str | None = None,
        timeRange: str | None = None,
        topic: str | None = None,
        includeDomains: list[str] | None = None,
        excludeDomains: list[str] | None = None,
        country: str | None = None,
        includeAnswer: str | None = None,
        includeRawContent: str | None = None,
    ) -> list[WebSearchResult]:
        """Calls the Tavily API to perform a web search."""
        # Make sure maxResults is within the allowed range (use cached values)
        minResults = self.webSearchMinResults
        maxAllowedResults = self.webSearchMaxResults
        if maxResults < minResults or maxResults > maxAllowedResults:
            raise ValueError(f"maxResults must be between {minResults} and {maxAllowedResults}")
        # Perform actual API call
        # Build kwargs only for provided options to avoid API rejections
        kwargs: dict = {"query": query, "max_results": maxResults}
        if searchDepth is not None:
            kwargs["search_depth"] = searchDepth
        if timeRange is not None:
            kwargs["time_range"] = timeRange
        if topic is not None:
            kwargs["topic"] = topic
        if includeDomains is not None and len(includeDomains) > 0:
            kwargs["include_domains"] = includeDomains
        if excludeDomains is not None:
            kwargs["exclude_domains"] = excludeDomains
        if country is not None:
            kwargs["country"] = country
        if includeAnswer is not None:
            kwargs["include_answer"] = includeAnswer
        if includeRawContent is not None:
            kwargs["include_raw_content"] = includeRawContent
        # Log the final API call parameters for comparison
        logger.info(f"Tavily API call parameters: {kwargs}")
        # Ensure client is initialized
        if self.client is None:
            self._initializeClient()
            if self.client is None:
                raise ValueError("Tavily client not initialized. Please check API key configuration.")
        response = await self.client.search(**kwargs)
        # Return all results without score filtering
        # Tavily's scoring is already applied by the API
        logger.info(f"Tavily returned {len(response.get('results', []))} results")
        return [
            WebSearchResult(
                title=result["title"],
                url=self._cleanUrl(result["url"]),
                rawContent=result.get("raw_content")
            )
            for result in response["results"]
        ]

    async def _crawl(
        self,
        url: str,
        instructions: str | None = None,
        limit: int = 20,
        maxDepth: int = 2,
        maxBreadth: int = 40,
    ) -> list[WebCrawlResult]:
        """Calls the Tavily API to crawl ONE URL with link following and retry logic."""
        maxRetries = self.crawlMaxRetries
        retryDelay = self.crawlRetryDelay
        timeout = self.crawlTimeout
        logger.debug(f"Starting crawl of URL: {url}")
        logger.debug(f"Crawl settings: instructions={instructions}, limit={limit}, maxDepth={maxDepth}, maxBreadth={maxBreadth}, timeout={timeout}s")
        for attempt in range(maxRetries + 1):
            try:
                logger.debug(f"Crawl attempt {attempt + 1}/{maxRetries + 1}")
                # Ensure client is initialized
                if self.client is None:
                    self._initializeClient()
                    if self.client is None:
                        raise ValueError("Tavily client not initialized. Please check API key configuration.")
                logger.debug(f"Crawling URL: {url}")
                # Build kwargs for crawl
                kwargsCrawl: dict = {"url": url}
                if instructions:
                    kwargsCrawl["instructions"] = instructions
                if limit:
                    kwargsCrawl["limit"] = limit
                if maxDepth:
                    kwargsCrawl["max_depth"] = maxDepth
                if maxBreadth:
                    kwargsCrawl["max_breadth"] = maxBreadth
                logger.debug(f"Sending request to Tavily with kwargs: {kwargsCrawl}")
                response = await asyncio.wait_for(
                    self.client.crawl(**kwargsCrawl),
                    timeout=timeout
                )
                logger.debug(f"Tavily response received: {type(response)}")
                # Parse response - could be dict with results or list
                if isinstance(response, dict) and "results" in response:
                    pageResults = response["results"]
                elif isinstance(response, list):
                    pageResults = response
                else:
                    logger.warning(f"Unexpected response format: {type(response)}")
                    pageResults = []
                logger.debug(f"Got {len(pageResults)} pages from crawl")
                # Convert to WebCrawlResult format
                results = []
                for result in pageResults:
                    results.append(WebCrawlResult(
                        url=result.get("url", url),
                        content=result.get("raw_content", result.get("content", "")),
                        title=result.get("title", "")
                    ))
                logger.debug(f"Crawl successful: extracted {len(results)} pages from URL")
                return results
            except asyncio.TimeoutError:
                logger.warning(f"Crawl attempt {attempt + 1} timed out after {timeout} seconds for URL: {url}")
                if attempt < maxRetries:
                    logger.info(f"Retrying in {retryDelay} seconds...")
                    await asyncio.sleep(retryDelay)
                else:
                    raise Exception(f"Crawl failed after {maxRetries + 1} attempts due to timeout")
            except Exception as e:
                logger.warning(f"Crawl attempt {attempt + 1} failed for URL {url}: {str(e)}")
                logger.debug(f"Full error details: {type(e).__name__}: {str(e)}")
                # Check if it's a validation error and log more details
                if "validation" in str(e).lower():
                    logger.debug(f"URL validation failed. Checking URL format:")
                    logger.debug(f"  URL: '{url}' (length: {len(url)})")
                    # Check for common URL issues
                    if ' ' in url:
                        logger.debug(f"  WARNING: URL contains spaces!")
                    if not url.startswith(('http://', 'https://')):
                        logger.debug(f"  WARNING: URL doesn't start with http/https!")
                    if len(url) > 2000:
                        logger.debug(f"  WARNING: URL is very long ({len(url)} chars)")
                if attempt < maxRetries:
                    logger.info(f"Retrying in {retryDelay} seconds...")
                    await asyncio.sleep(retryDelay)
                else:
                    raise Exception(f"Crawl failed after {maxRetries + 1} attempts: {str(e)}")

    async def _routeWebOperation(self, modelCall: AiModelCall) -> "AiModelResponse":
        """
        Route web operation based on operation type.

        Args:
            modelCall: AiModelCall with messages and options

        Returns:
            AiModelResponse based on operation type
        """
        operationType = modelCall.options.operationType
        if operationType == OperationTypeEnum.WEB_SEARCH:
            return await self.webSearch(modelCall)
        elif operationType == OperationTypeEnum.WEB_CRAWL:
            return await self.webCrawl(modelCall)
        else:
            # Unsupported operation type
            return AiModelResponse(
                content="",
                success=False,
                error=f"Unsupported operation type: {operationType}"
            )

    async def webSearch(self, modelCall: AiModelCall) -> "AiModelResponse":
        """
        WEB_SEARCH operation - returns list of URLs using Tavily search.

        Args:
            modelCall: AiModelCall with AiCallPromptWebSearch as prompt

        Returns:
            AiModelResponse with JSON list of URLs
        """
        try:
            # Extract parameters - find user message (not system message)
            promptContent = ""
            if modelCall.messages:
                for msg in modelCall.messages:
                    if msg.get("role") == "user":
                        promptContent = msg.get("content", "")
                        break
                # Fallback to first message if no user message found
                if not promptContent and len(modelCall.messages) > 0:
                    promptContent = modelCall.messages[0].get("content", "")
            if not promptContent or not promptContent.strip():
                raise ValueError("Empty prompt content received for web search")
            try:
                promptData = json.loads(promptContent)
            except json.JSONDecodeError as e:
                logger.error(f"Failed to parse prompt content as JSON: {promptContent[:200]}")
                raise ValueError(f"Invalid JSON in prompt content: {str(e)}")
            # Create Pydantic model
            webSearchPrompt = AiCallPromptWebSearch(**promptData)
            # Convert ISO country code to country name for Tavily
            countryName = webSearchPrompt.country
            if countryName:
                countryName = self._convertIsoCodeToCountryName(countryName)
            # Perform search - use exact parameters from prompt
            # NOTE: timeRange parameter causes generic results, so we don't use it
            searchResults = await self._search(
                query=webSearchPrompt.instruction,
                maxResults=webSearchPrompt.maxNumberPages,
                timeRange=None,  # Not used - causes generic results
                country=countryName,
                includeAnswer="basic",
                includeRawContent="text"
            )
            # Extract URLs from results
            urls = [result.url for result in searchResults]
            # Return as JSON array
            return AiModelResponse(
                content=json.dumps(urls, indent=2),
                success=True,
                metadata={"total_urls": len(urls), "operation": "WEB_SEARCH"}
            )
        except Exception as e:
            logger.error(f"Error in Tavily web search: {str(e)}")
            return AiModelResponse(
                content="[]",
                success=False,
                error=str(e)
            )

    async def webCrawl(self, modelCall: AiModelCall) -> "AiModelResponse":
        """
        WEB_CRAWL operation - crawls one URL using Tavily with link following.

        Args:
            modelCall: AiModelCall with AiCallPromptWebCrawl as prompt

        Returns:
            AiModelResponse with crawl results as JSON (may include multiple pages)
        """
        try:
            # Extract parameters - find user message (not system message)
            promptContent = ""
            if modelCall.messages:
                for msg in modelCall.messages:
                    if msg.get("role") == "user":
                        promptContent = msg.get("content", "")
                        break
                # Fallback to first message if no user message found
                if not promptContent and len(modelCall.messages) > 0:
                    promptContent = modelCall.messages[0].get("content", "")
            if not promptContent or not promptContent.strip():
                raise ValueError("Empty prompt content received for web crawl")
            try:
                promptData = json.loads(promptContent)
            except json.JSONDecodeError as e:
                logger.error(f"Failed to parse prompt content as JSON: {promptContent[:200]}")
                raise ValueError(f"Invalid JSON in prompt content: {str(e)}")
            # Create Pydantic model
            webCrawlPrompt = AiCallPromptWebCrawl(**promptData)
            # Perform crawl for ONE URL with link following
            # Use maxWidth as limit, maxDepth as maxDepth, and calculate maxBreadth
            crawlResults = await self._crawl(
                url=webCrawlPrompt.url,
                instructions=webCrawlPrompt.instruction,
                limit=webCrawlPrompt.maxWidth or 20,  # maxWidth controls number of pages
                maxDepth=webCrawlPrompt.maxDepth or 2,
                maxBreadth=webCrawlPrompt.maxWidth or 40  # Use same as limit for breadth
            )
            # If we got multiple pages from the crawl, we need to format them differently
            # Return the first result for backwards compatibility, but include total page count
            if crawlResults and len(crawlResults) > 0:
                # Get all pages content
                allContent = ""
                for i, result in enumerate(crawlResults, 1):
                    pageHeader = f"\n{'='*60}\nPAGE {i}: {result.url}\n{'='*60}\n"
                    if result.title:
                        allContent += f"{pageHeader}Title: {result.title}\n\n"
                    allContent += f"{result.content}\n"
                resultData = {
                    "url": webCrawlPrompt.url,
                    "title": crawlResults[0].title if crawlResults[0].title else "Content",
                    "content": allContent,
                    "pagesCrawled": len(crawlResults),
                    "pageUrls": [result.url for result in crawlResults]
                }
            else:
                resultData = {"url": webCrawlPrompt.url, "title": "", "content": "", "error": "No content extracted", "pagesCrawled": 0}
            # Return as JSON - same format as Perplexity but with multiple pages content
            return AiModelResponse(
                content=json.dumps(resultData, indent=2),
                success=True,
                metadata={"operation": "WEB_CRAWL", "url": webCrawlPrompt.url, "pagesCrawled": len(crawlResults) if crawlResults else 0}
            )
        except Exception as e:
            logger.error(f"Error in Tavily web crawl: {str(e)}")
            errorResult = {"error": str(e), "url": webCrawlPrompt.url if 'webCrawlPrompt' in locals() else ""}
            return AiModelResponse(
                content=json.dumps(errorResult, indent=2),
                success=False,
                error=str(e)
            )