gateway/modules/interfaces/interfaceAiObjects.py

import logging
import asyncio
from typing import Dict, Any, List, Union, Tuple, Optional
from dataclasses import dataclass
import time

logger = logging.getLogger(__name__)

from modules.aicore.aicoreModelRegistry import modelRegistry
from modules.aicore.aicoreModelSelector import modelSelector
from modules.datamodels.datamodelAi import (
    AiModel,
    AiCallOptions,
    AiCallRequest,
    AiCallResponse,
    OperationTypeEnum,
    AiModelCall,
    AiModelResponse,
)


# Dynamic model registry - models are now loaded from connectors via aicore system


@dataclass(slots=True)
class AiObjects:
    """Centralized AI interface: dynamically discovers and uses AI models. Includes web functionality."""

    def __post_init__(self) -> None:
        # Auto-discover and register all available connectors
        self._discoverAndRegisterConnectors()

    def _discoverAndRegisterConnectors(self):
        """Auto-discover and register all available AI connectors."""
        logger.info("Auto-discovering AI connectors...")

        # Use the model registry's built-in discovery mechanism
        discoveredConnectors = modelRegistry.discoverConnectors()

        # Register each discovered connector
        for connector in discoveredConnectors:
            modelRegistry.registerConnector(connector)
            logger.info(f"Registered connector: {connector.getConnectorType()}")

        logger.info(f"Total connectors registered: {len(discoveredConnectors)}")
        logger.info("All AI connectors registered with dynamic model registry")

    @classmethod
    async def create(cls) -> "AiObjects":
        """Create AiObjects instance with auto-discovered connectors."""
        # No need to manually create connectors - they're auto-discovered
        return cls()


    def _selectModel(self, prompt: str, context: str, options: AiCallOptions) -> str:
        """Select the best model using dynamic model selection system."""
        # Get available models from the dynamic registry
        availableModels = modelRegistry.getAvailableModels()

        if not availableModels:
            logger.error("No models available in the registry")
            raise ValueError("No AI models available")

        # Use the dynamic model selector
        selectedModel = modelSelector.selectModel(prompt, context, options, availableModels)

        if not selectedModel:
            logger.error("No suitable model found for the given criteria")
            raise ValueError("No suitable AI model found")

        logger.info(f"Selected model: {selectedModel.name} ({selectedModel.displayName})")
        return selectedModel.name


    async def call(self, request: AiCallRequest) -> AiCallResponse:
        """Call AI model for text generation with model-aware chunking."""
        # Handle content parts (unified path)
        if hasattr(request, 'contentParts') and request.contentParts:
            return await self._callWithContentParts(request)
        # Handle traditional text/context calls
        return await self._callWithTextContext(request)

    async def _callWithTextContext(self, request: AiCallRequest) -> AiCallResponse:
        """Call AI model for traditional text/context calls with fallback mechanism."""
        prompt = request.prompt
        context = request.context or ""
        options = request.options

        # Input bytes will be calculated inside _callWithModel

        # Compress optionally (prompt/context) - simple truncation fallback kept here
        def _maybeTruncate(text: str, limit: int) -> str:
            data = text.encode("utf-8")
            if len(data) <= limit:
                return text
            return data[:limit].decode("utf-8", errors="ignore") + "... [truncated]"

        if options.compressPrompt and len(prompt.encode("utf-8")) > 2000:
            prompt = _maybeTruncate(prompt, 2000)
        if options.compressContext and len(context.encode("utf-8")) > 70000:
            context = _maybeTruncate(context, 70000)

        # Generation parameters are handled inside _callWithModel

        # Get failover models for this operation type
        availableModels = modelRegistry.getAvailableModels()
        failoverModelList = modelSelector.getFailoverModelList(prompt, context, options, availableModels)

        if not failoverModelList:
            errorMsg = f"No suitable models found for operation {options.operationType}"
            logger.error(errorMsg)
            return AiCallResponse(
                content=errorMsg,
                modelName="error",
                priceUsd=0.0,
                processingTime=0.0,
                bytesSent=0,
                bytesReceived=0,
                errorCount=1
            )

        # Try each model in failover sequence
        lastError = None
        for attempt, model in enumerate(failoverModelList):
            try:
                logger.info(f"Attempting AI call with model: {model.name} (attempt {attempt + 1}/{len(failoverModelList)})")

                # Call the model
                response = await self._callWithModel(model, prompt, context, options)

                logger.info(f"✅ AI call successful with model: {model.name}")
                return response

            except Exception as e:
                lastError = e
                logger.warning(f"❌ AI call failed with model {model.name}: {str(e)}")

                # If this is not the last model, try the next one
                if attempt < len(failoverModelList) - 1:
                    logger.info(f"🔄 Trying next failover model...")
                    continue
                else:
                    # All models failed
                    logger.error(f"💥 All {len(failoverModelList)} models failed for operation {options.operationType}")
                    break

        # All failover attempts failed - return error response
        errorMsg = f"All AI models failed for operation {options.operationType}. Last error: {str(lastError)}"
        logger.error(errorMsg)
        return AiCallResponse(
            content=errorMsg,
            modelName="error",
            priceUsd=0.0,
            processingTime=0.0,
            bytesSent=0,
            bytesReceived=0,
            errorCount=1
        )

    async def _callWithContentParts(self, request: AiCallRequest) -> AiCallResponse:
        """Process content parts with model-aware chunking (unified for single and multiple parts)."""
        prompt = request.prompt
        options = request.options
        contentParts = request.contentParts

        # Get failover models
        availableModels = modelRegistry.getAvailableModels()
        failoverModelList = modelSelector.getFailoverModelList(prompt, "", options, availableModels)

        if not failoverModelList:
            return self._createErrorResponse("No suitable models found", 0, 0)

        # Process each content part
        allResults = []
        for contentPart in contentParts:
            partResult = await self._processContentPartWithFallback(contentPart, prompt, options, failoverModelList)
            allResults.append(partResult)

        # Merge all results
        mergedContent = self._mergePartResults(allResults)

        return AiCallResponse(
            content=mergedContent,
            modelName="multiple",
            priceUsd=sum(r.priceUsd for r in allResults),
            processingTime=sum(r.processingTime for r in allResults),
            bytesSent=sum(r.bytesSent for r in allResults),
            bytesReceived=sum(r.bytesReceived for r in allResults),
            errorCount=sum(r.errorCount for r in allResults)
        )

    async def _processContentPartWithFallback(self, contentPart, prompt: str, options, failoverModelList) -> AiCallResponse:
        """Process a single content part with model-aware chunking and fallback."""
        lastError = None

        for attempt, model in enumerate(failoverModelList):
            try:
                logger.info(f"Processing content part with model: {model.name} (attempt {attempt + 1}/{len(failoverModelList)})")

                # Check if part fits in model context
                partSize = len(contentPart.data.encode('utf-8')) if contentPart.data else 0
                modelContextBytes = model.contextLength * 4  # Convert tokens to bytes

                if partSize <= modelContextBytes:
                    # Part fits - call AI directly
                    response = await self._callWithModel(model, prompt, contentPart.data, options)
                    logger.info(f"✅ Content part processed successfully with model: {model.name}")
                    return response
                else:
                    # Part too large - chunk it
                    chunks = await self._chunkContentPart(contentPart, model, options)
                    if not chunks:
                        raise ValueError(f"Failed to chunk content part for model {model.name}")

                    # Process each chunk
                    chunkResults = []
                    for chunk in chunks:
                        chunkResponse = await self._callWithModel(model, prompt, chunk['data'], options)
                        chunkResults.append(chunkResponse)

                    # Merge chunk results
                    mergedContent = self._mergeChunkResults(chunkResults)
                    totalPrice = sum(r.priceUsd for r in chunkResults)
                    totalTime = sum(r.processingTime for r in chunkResults)
                    totalBytesSent = sum(r.bytesSent for r in chunkResults)
                    totalBytesReceived = sum(r.bytesReceived for r in chunkResults)
                    totalErrors = sum(r.errorCount for r in chunkResults)

                    logger.info(f"✅ Content part chunked and processed with model: {model.name} ({len(chunks)} chunks)")
                    return AiCallResponse(
                        content=mergedContent,
                        modelName=model.name,
                        priceUsd=totalPrice,
                        processingTime=totalTime,
                        bytesSent=totalBytesSent,
                        bytesReceived=totalBytesReceived,
                        errorCount=totalErrors
                    )

            except Exception as e:
                lastError = e
                logger.warning(f"❌ Model {model.name} failed for content part: {str(e)}")

                if attempt < len(failoverModelList) - 1:
                    logger.info(f"🔄 Trying next failover model...")
                    continue
                else:
                    logger.error(f"💥 All {len(failoverModelList)} models failed for content part")
                    break

        # All models failed
        return self._createErrorResponse(f"All models failed: {str(lastError)}", 0, 0)

    async def _chunkContentPart(self, contentPart, model, options) -> List[Dict[str, Any]]:
        """Chunk a content part based on model capabilities."""
        # Calculate model-specific chunk sizes
        modelContextBytes = model.contextLength * 4  # Convert tokens to bytes
        maxContextBytes = int(modelContextBytes * 0.9)  # 90% of context length
        textChunkSize = int(maxContextBytes * 0.7)  # 70% of max context for text chunks
        imageChunkSize = int(maxContextBytes * 0.8)  # 80% of max context for image chunks

        # Build chunking options
        chunkingOptions = {
            "textChunkSize": textChunkSize,
            "imageChunkSize": imageChunkSize,
            "maxSize": maxContextBytes,
            "chunkAllowed": True
        }

        # Get appropriate chunker
        from modules.services.serviceExtraction.subRegistry import ChunkerRegistry
        chunkerRegistry = ChunkerRegistry()
        chunker = chunkerRegistry.resolve(contentPart.typeGroup)

        if not chunker:
            logger.warning(f"No chunker found for typeGroup: {contentPart.typeGroup}")
            return []

        # Chunk the content part
        try:
            chunks = chunker.chunk(contentPart, chunkingOptions)
            logger.debug(f"Created {len(chunks)} chunks for {contentPart.typeGroup} part")
            return chunks
        except Exception as e:
            logger.error(f"Chunking failed for {contentPart.typeGroup}: {str(e)}")
            return []

    def _mergePartResults(self, partResults: List[AiCallResponse]) -> str:
        """Merge part results using the existing sophisticated merging system."""
        if not partResults:
            return ""

        # Convert AiCallResponse results to ContentParts for merging
        from modules.datamodels.datamodelExtraction import ContentPart
        from modules.services.serviceExtraction.subUtils import makeId

        content_parts = []
        for i, result in enumerate(partResults):
            if result.content:
                content_part = ContentPart(
                    id=makeId(),
                    parentId=None,
                    label=f"ai_result_{i}",
                    typeGroup="text",  # Default to text for AI results
                    mimeType="text/plain",
                    data=result.content,
                    metadata={
                        "aiResult": True,
                        "modelName": result.modelName,
                        "priceUsd": result.priceUsd,
                        "processingTime": result.processingTime,
                        "bytesSent": result.bytesSent,
                        "bytesReceived": result.bytesReceived
                    }
                )
                content_parts.append(content_part)

        # Use existing merging system
        merge_strategy = {
            "useIntelligentMerging": True,
            "groupBy": "typeGroup",
            "orderBy": "id",
            "mergeType": "concatenate"
        }

        from modules.services.serviceExtraction.subPipeline import _applyMerging
        merged_parts = _applyMerging(content_parts, merge_strategy)

        # Convert merged parts back to final string
        final_content = "\n\n".join([part.data for part in merged_parts])

        logger.info(f"Merged {len(partResults)} AI results using existing merging system")
        return final_content.strip()

    def _mergeChunkResults(self, chunkResults: List[AiCallResponse]) -> str:
        """Merge chunk results using the existing sophisticated merging system."""
        if not chunkResults:
            return ""

        # Convert AiCallResponse results to ContentParts for merging
        from modules.datamodels.datamodelExtraction import ContentPart
        from modules.services.serviceExtraction.subUtils import makeId

        content_parts = []
        for i, result in enumerate(chunkResults):
            if result.content:
                content_part = ContentPart(
                    id=makeId(),
                    parentId=None,
                    label=f"chunk_result_{i}",
                    typeGroup="text",  # Default to text for AI results
                    mimeType="text/plain",
                    data=result.content,
                    metadata={
                        "aiResult": True,
                        "chunk": True,
                        "modelName": result.modelName,
                        "priceUsd": result.priceUsd,
                        "processingTime": result.processingTime,
                        "bytesSent": result.bytesSent,
                        "bytesReceived": result.bytesReceived
                    }
                )
                content_parts.append(content_part)

        # Use existing merging system
        merge_strategy = {
            "useIntelligentMerging": True,
            "groupBy": "typeGroup",
            "orderBy": "id",
            "mergeType": "concatenate"
        }

        from modules.services.serviceExtraction.subPipeline import _applyMerging
        merged_parts = _applyMerging(content_parts, merge_strategy)

        # Convert merged parts back to final string
        final_content = "\n\n".join([part.data for part in merged_parts])

        logger.info(f"Merged {len(chunkResults)} chunk results using existing merging system")
        return final_content.strip()

    def _createErrorResponse(self, errorMsg: str, inputBytes: int, outputBytes: int) -> AiCallResponse:
        """Create an error response."""
        return AiCallResponse(
            content=errorMsg,
            modelName="error",
            priceUsd=0.0,
            processingTime=0.0,
            bytesSent=inputBytes,
            bytesReceived=outputBytes,
            errorCount=1
        )

    async def _callWithModel(self, model: AiModel, prompt: str, context: str, options: AiCallOptions = None) -> AiCallResponse:
        """Call a specific model and return the response."""
        # Calculate input bytes from prompt and context
        inputBytes = len((prompt + context).encode('utf-8'))

        # Replace <TOKEN_LIMIT> placeholder in prompt for this specific model
        contextLength = model.contextLength
        if contextLength > 0:
            tokenLimit = str(contextLength)
        else:
            tokenLimit = "16000"  # Default for text generation

        # Create a copy of the prompt for this model call
        modelPrompt = prompt
        if "<TOKEN_LIMIT>" in modelPrompt:
            modelPrompt = modelPrompt.replace("<TOKEN_LIMIT>", tokenLimit)
            logger.debug(f"Replaced <TOKEN_LIMIT> with {tokenLimit} for model {model.name}")

        # Update messages array with replaced content
        messages = []
        if context:
            messages.append({"role": "system", "content": f"Context from documents:\n{context}"})
        messages.append({"role": "user", "content": modelPrompt})

        # Start timing
        startTime = time.time()

        # Get the connector for this model
        connector = modelRegistry.getConnectorForModel(model.name)
        if not connector:
            raise ValueError(f"No connector found for model {model.name}")

        # Call the model's function directly - completely generic
        if model.functionCall:
            # Create standardized call object
            modelCall = AiModelCall(
                messages=messages,
                model=model,
                options=options or {}
            )

            # Call the model with standardized interface
            modelResponse = await model.functionCall(modelCall)

            # Extract content from standardized response
            if not modelResponse.success:
                raise ValueError(f"Model call failed: {modelResponse.error}")
            content = modelResponse.content
        else:
            raise ValueError(f"Model {model.name} has no function call defined")

        # Calculate timing and output bytes
        endTime = time.time()
        processingTime = endTime - startTime
        outputBytes = len(content.encode("utf-8"))

        # Calculate price using model's own price calculation method
        priceUsd = model.calculatePriceUsd(inputBytes, outputBytes)

        return AiCallResponse(
            content=content,
            modelName=model.name,
            priceUsd=priceUsd,
            processingTime=processingTime,
            bytesSent=inputBytes,
            bytesReceived=outputBytes,
            errorCount=0
        )

    async def callImage(self, prompt: str, imageData: Union[str, bytes], mimeType: str = None, options: AiCallOptions = None) -> AiCallResponse:
        """Call AI model for image analysis with fallback mechanism."""

        if options is None:
            options = AiCallOptions(operationType=OperationTypeEnum.IMAGE_ANALYSE)

        # Get fallback models for image analysis
        availableModels = modelRegistry.getAvailableModels()
        failoverModelList = modelSelector.getFailoverModelList(prompt, "", options, availableModels)

        if not failoverModelList:
            errorMsg = f"No suitable models found for image analysis"
            logger.error(errorMsg)
            return AiCallResponse(
                content=errorMsg,
                modelName="error",
                priceUsd=0.0,
                processingTime=0.0,
                bytesSent=0,
                bytesReceived=0,
                errorCount=1
            )

        # Try each model in fallback sequence
        lastError = None
        for attempt, model in enumerate(failoverModelList):
            try:
                logger.info(f"Attempting image analysis with model: {model.name} (attempt {attempt + 1}/{len(failoverModelList)})")

                # Call the model
                response = await self._callImageWithModel(model, prompt, imageData, mimeType)

                logger.info(f"✅ Image analysis successful with model: {model.name}")
                return response

            except Exception as e:
                lastError = e
                logger.warning(f"❌ Image analysis failed with model {model.name}: {str(e)}")

                # If this is not the last model, try the next one
                if attempt < len(failoverModelList) - 1:
                    logger.info(f"🔄 Trying next fallback model for image analysis...")
                    continue
                else:
                    # All models failed
                    logger.error(f"💥 All {len(failoverModelList)} models failed for image analysis")
                    break

        # All fallback attempts failed - return error response
        errorMsg = f"All AI models failed for image analysis. Last error: {str(lastError)}"
        logger.error(errorMsg)
        return AiCallResponse(
            content=errorMsg,
            modelName="error",
            priceUsd=0.0,
            processingTime=0.0,
            bytesSent=0,
            bytesReceived=0,
            errorCount=1
        )

    async def _callImageWithModel(self, model: AiModel, prompt: str, imageData: Union[str, bytes], mimeType: str) -> AiCallResponse:
        """Call a specific model for image analysis and return the response."""
        # Calculate input bytes from prompt and image data
        promptBytes = len(prompt.encode('utf-8'))
        if isinstance(imageData, str):
            # Base64 encoded string
            imageBytes = len(imageData.encode('utf-8'))
        else:
            # Raw bytes
            imageBytes = len(imageData)
        inputBytes = promptBytes + imageBytes

        # Start timing
        startTime = time.time()

        # Create standardized call object for image analysis
        modelCall = AiModelCall(
            messages=[{"role": "user", "content": prompt}],
            model=model,
            options={"imageData": imageData, "mimeType": mimeType}
        )

        # Call the model with standardized interface
        if model.functionCall:
            modelResponse = await model.functionCall(modelCall)

            # Extract content from standardized response
            if not modelResponse.success:
                raise ValueError(f"Model call failed: {modelResponse.error}")
            content = modelResponse.content
        else:
            raise ValueError(f"Model {model.name} has no function call defined")

        # Calculate timing and output bytes
        endTime = time.time()
        processingTime = endTime - startTime
        outputBytes = len(content.encode("utf-8"))

        # Calculate price using model's own price calculation method
        priceUsd = model.calculatePriceUsd(inputBytes, outputBytes)

        return AiCallResponse(
            content=content,
            modelName=model.name,
            priceUsd=priceUsd,
            processingTime=processingTime,
            bytesSent=inputBytes,
            bytesReceived=outputBytes,
            errorCount=0
        )

    async def generateImage(self, prompt: str, size: str = "1024x1024", quality: str = "standard", style: str = "vivid", options: AiCallOptions = None) -> AiCallResponse:
        """Generate an image using AI."""

        if options is None:
            options = AiCallOptions(operationType=OperationTypeEnum.IMAGE_GENERATE)

        # Calculate input bytes
        inputBytes = len(prompt.encode("utf-8"))

        try:
            # Select the best model for image generation
            modelName = self._selectModel(prompt, "", options)
            selectedModel = modelRegistry.getModel(modelName)

            if not selectedModel:
                raise ValueError(f"Selected model {modelName} not found in registry")

            # Get the connector for this model
            connector = modelRegistry.getConnectorForModel(modelName)
            if not connector:
                raise ValueError(f"No connector found for model {modelName}")

            # Start timing
            startTime = time.time()

            # Create standardized call object for image generation
            modelCall = AiModelCall(
                messages=[{"role": "user", "content": prompt}],
                model=selectedModel,
                options={"size": size, "quality": quality, "style": style}
            )

            # Call the model with standardized interface
            if selectedModel.functionCall:
                modelResponse = await selectedModel.functionCall(modelCall)

                # Extract content from standardized response
                if not modelResponse.success:
                    raise ValueError(f"Model call failed: {modelResponse.error}")
                content = modelResponse.content
            else:
                raise ValueError(f"Model {modelName} has no function call defined")

            # Calculate timing and output bytes
            endTime = time.time()
            processingTime = endTime - startTime
            outputBytes = len(content.encode("utf-8"))

            # Calculate price using model's own price calculation method
            priceUsd = selectedModel.calculatePriceUsd(inputBytes, outputBytes)

            logger.info(f"✅ Image generation successful with model: {modelName}")
            return AiCallResponse(
                success=True,
                content=content,
                model=modelName,
                processingTime=processingTime,
                priceUsd=priceUsd,
                bytesSent=inputBytes,
                bytesReceived=outputBytes,
                errorCount=0
            )

        except Exception as e:
            logger.error(f"❌ Image generation failed with model {modelName}: {str(e)}")
            return AiCallResponse(
                content=f"Image generation failed: {str(e)}",
                modelName=modelName,
                priceUsd=0.0,
                processingTime=0.0,
                bytesSent=inputBytes,
                bytesReceived=0,
                errorCount=1
            )

    # Web functionality methods - Now use standardized AiModelCall/AiModelResponse pattern
    async def searchWebsites(self, query: str, maxResults: int = 5, **kwargs) -> str:
        """Search for websites using Tavily with standardized pattern."""
        from modules.datamodels.datamodelAi import AiModelCall

        modelCall = AiModelCall(
            messages=[{"role": "user", "content": query}],
            options={"max_results": maxResults, **kwargs}
        )

        # Get Tavily connector from registry
        tavilyConnector = modelRegistry.getConnectorForModel("tavily_search")
        if not tavilyConnector:
            raise ValueError("Tavily connector not available")

        result = await tavilyConnector.search(modelCall)
        return result.content if result.success else ""

    async def crawlWebsites(self, urls: List[str], extractDepth: str = "advanced", format: str = "markdown") -> str:
        """Crawl websites using Tavily with standardized pattern."""
        from modules.datamodels.datamodelAi import AiModelCall

        modelCall = AiModelCall(
            messages=[{"role": "user", "content": "crawl websites"}],
            options={"urls": urls, "extract_depth": extractDepth, "format": format}
        )

        # Get Tavily connector from registry
        tavilyConnector = modelRegistry.getConnectorForModel("tavily_crawl")
        if not tavilyConnector:
            raise ValueError("Tavily connector not available")

        result = await tavilyConnector.crawl(modelCall)
        return result.content if result.success else ""

    async def extractContent(self, urls: List[str], extractDepth: str = "advanced", format: str = "markdown") -> Dict[str, str]:
        """Extract content from URLs and return as dictionary."""
        import json
        crawlResults = await self.crawlWebsites(urls, extractDepth, format)

        # Parse JSON response and extract content
        try:
            data = json.loads(crawlResults)
            return {result["url"]: result["content"] for result in data.get("results", [])}
        except (json.JSONDecodeError, KeyError):
            return {}

    # Core Web Tools - Clean interface for web operations
    async def readPage(self, url: str, extractDepth: str = "advanced") -> Optional[str]:
        """Read a single web page and return its content (HTML/Markdown)."""
        logger.debug(f"Reading page: {url}")
        try:
            # URL encode the URL to handle spaces and special characters
            from urllib.parse import quote, urlparse, urlunparse
            parsed = urlparse(url)
            encodedUrl = urlunparse((
                parsed.scheme,
                parsed.netloc,
                parsed.path,
                parsed.params,
                parsed.query,
                parsed.fragment
            ))

            # Manually encode query parameters to handle spaces
            if parsed.query:
                encodedQuery = quote(parsed.query, safe='=&')
                encodedUrl = urlunparse((
                    parsed.scheme,
                    parsed.netloc,
                    parsed.path,
                    parsed.params,
                    encodedQuery,
                    parsed.fragment
                ))

            logger.debug(f"URL encoded: {url} -> {encodedUrl}")

            content = await self.extractContent([encodedUrl], extractDepth, "markdown")
            result = content.get(encodedUrl)
            if result:
                logger.debug(f"Successfully read page {encodedUrl}: {len(result)} chars")
            else:
                logger.warning(f"No content returned for page {encodedUrl}")
            return result
        except Exception as e:
            logger.warning(f"Failed to read page {url}: {e}")
            return None

    async def getUrlsFromPage(self, url: str, extractDepth: str = "advanced") -> List[str]:
        """Get all URLs from a web page, with redundancies removed."""
        try:
            content = await self.readPage(url, extractDepth)
            if not content:
                return []

            links = self._extractLinksFromContent(content, url)
            # Remove duplicates while preserving order
            seen = set()
            uniqueLinks = []
            for link in links:
                if link not in seen:
                    seen.add(link)
                    uniqueLinks.append(link)

            logger.debug(f"Extracted {len(uniqueLinks)} unique URLs from {url}")
            return uniqueLinks

        except Exception as e:
            logger.warning(f"Failed to get URLs from page {url}: {e}")
            return []

    def filterUrlsOnlyPages(self, urls: List[str], maxPerDomain: int = 10) -> List[str]:
        """Filter URLs to get only links for pages to follow (no images, etc.)."""
        from urllib.parse import urlparse

        def _isHtmlCandidate(url: str) -> bool:
            lower = url.lower()
            blocked = ('.jpg', '.jpeg', '.png', '.gif', '.svg', '.webp', '.ico', '.bmp',
                       '.mp4', '.mp3', '.avi', '.mov', '.mkv',
                       '.pdf', '.zip', '.rar', '.7z', '.tar', '.gz',
                       '.css', '.js', '.woff', '.woff2', '.ttf', '.eot')
            return not lower.endswith(blocked)

        # Group by domain
        domainLinks = {}
        for link in urls:
            domain = urlparse(link).netloc
            if domain not in domainLinks:
                domainLinks[domain] = []
            domainLinks[domain].append(link)

        # Filter and cap per domain
        filteredLinks = []
        for domain, domainLinkList in domainLinks.items():
            seen = set()
            domainFiltered = []

            for link in domainLinkList:
                if link in seen:
                    continue
                if not _isHtmlCandidate(link):
                    continue
                seen.add(link)
                domainFiltered.append(link)
                if len(domainFiltered) >= maxPerDomain:
                    break

            filteredLinks.extend(domainFiltered)
            logger.debug(f"Domain {domain}: {len(domainLinkList)} -> {len(domainFiltered)} links")

        return filteredLinks

    def _extractLinksFromContent(self, content: str, baseUrl: str) -> List[str]:
        """Extract links from HTML/Markdown content."""
        try:
            import re
            from urllib.parse import urljoin, urlparse, quote, urlunparse

            def _cleanUrl(url: str) -> str:
                """Clean and encode URL to remove spaces and invalid characters."""
                # Remove quotes and extra spaces
                url = url.strip().strip('"\'')

                # If it's a relative URL, make it absolute first
                if not url.startswith(('http://', 'https://')):
                    url = urljoin(baseUrl, url)

                # Parse and re-encode the URL properly
                parsed = urlparse(url)
                if parsed.query:
                    # Encode query parameters properly
                    encodedQuery = quote(parsed.query, safe='=&')
                    url = urlunparse((
                        parsed.scheme,
                        parsed.netloc,
                        parsed.path,
                        parsed.params,
                        encodedQuery,
                        parsed.fragment
                    ))

                return url

            links = []

            # Extract HTML links: <a href="url"> format
            htmlLinkPattern = r'<a[^>]+href=["\']([^"\']+)["\'][^>]*>'
            htmlLinks = re.findall(htmlLinkPattern, content, re.IGNORECASE)

            for url in htmlLinks:
                if url and not url.startswith('#') and not url.startswith('javascript:'):
                    try:
                        cleanedUrl = _cleanUrl(url)
                        links.append(cleanedUrl)
                        logger.debug(f"Extracted HTML link: {url} -> {cleanedUrl}")
                    except Exception as e:
                        logger.debug(f"Failed to clean HTML link {url}: {e}")

            # Extract markdown links: [text](url) format
            markdownLinkPattern = r'\[([^\]]+)\]\(([^)]+)\)'
            markdownLinks = re.findall(markdownLinkPattern, content)

            for text, url in markdownLinks:
                if url and not url.startswith('#'):
                    try:
                        cleanedUrl = _cleanUrl(url)
                        # Only keep URLs from the same domain
                        if urlparse(cleanedUrl).netloc == urlparse(baseUrl).netloc:
                            links.append(cleanedUrl)
                            logger.debug(f"Extracted markdown link: {url} -> {cleanedUrl}")
                    except Exception as e:
                        logger.debug(f"Failed to clean markdown link {url}: {e}")

            # Extract plain URLs in the text
            urlPattern = r'https?://[^\s\)]+'
            plainUrls = re.findall(urlPattern, content)

            for url in plainUrls:
                try:
                    cleanUrl = url.rstrip('.,;!?')
                    cleanedUrl = _cleanUrl(cleanUrl)
                    if urlparse(cleanedUrl).netloc == urlparse(baseUrl).netloc:
                        if cleanedUrl not in links:  # Avoid duplicates
                            links.append(cleanedUrl)
                            logger.debug(f"Extracted plain URL: {url} -> {cleanedUrl}")
                except Exception as e:
                    logger.debug(f"Failed to clean plain URL {url}: {e}")

            logger.debug(f"Total links extracted and cleaned: {len(links)}")
            return links

        except Exception as e:
            logger.warning(f"Failed to extract links from content: {e}")
            return []

    def _normalizeUrl(self, url: str) -> str:
        """Normalize URL to handle variations that should be considered duplicates."""
        if not url:
            return url

        # Remove trailing slashes and fragments
        url = url.rstrip('/')
        if '#' in url:
            url = url.split('#')[0]

        # Handle common URL variations
        url = url.replace('http://', 'https://')  # Normalize protocol

        return url

    async def crawlRecursively(self, urls: List[str], max_depth: int, extract_depth: str = "advanced", max_per_domain: int = 10, global_processed_urls: Optional[set] = None) -> Dict[str, str]:
        """
        Recursively crawl URLs up to specified depth.

        Args:
            urls: List of starting URLs to crawl
            max_depth: Maximum depth to crawl (1=main pages only, 2=main+sub-pages, etc.)
            extract_depth: Tavily extract depth setting
            max_per_domain: Maximum URLs per domain per level
            global_processed_urls: Optional global set to track processed URLs across sessions

        Returns:
            Dictionary mapping URL -> content for all crawled pages
        """
        logger.info(f"Starting recursive crawl: {len(urls)} starting URLs, max_depth={max_depth}")

        # URL index to track all processed URLs (local + global)
        processed_urls = set()
        if global_processed_urls is not None:
            # Use global index if provided, otherwise create local one
            processed_urls = global_processed_urls
            logger.info(f"Using global URL index with {len(processed_urls)} already processed URLs")
        else:
            logger.info("Using local URL index for this crawl session")

        all_content = {}

        # Current level URLs to process
        current_level_urls = urls.copy()

        try:
            for depth in range(1, max_depth + 1):
                logger.info(f"=== DEPTH LEVEL {depth}/{max_depth} ===")
                logger.info(f"Processing {len(current_level_urls)} URLs at depth {depth}")

                # URLs found at this level (for next iteration)
                next_level_urls = []

                for url in current_level_urls:
                    # Normalize URL for duplicate checking
                    normalized_url = self._normalizeUrl(url)
                    if normalized_url in processed_urls:
                        logger.debug(f"URL {url} (normalized: {normalized_url}) already processed, skipping")
                        continue

                    try:
                        logger.info(f"Processing URL at depth {depth}: {url}")
                        logger.debug(f"Total processed URLs so far: {len(processed_urls)}")

                        # Read page content
                        content = await self.readPage(url, extract_depth)
                        if content:
                            all_content[url] = content
                            processed_urls.add(normalized_url)
                            logger.info(f"✓ Successfully processed {url}: {len(content)} chars")

                            # Get URLs from this page for next level
                            page_urls = await self.getUrlsFromPage(url, extract_depth)
                            logger.info(f"Found {len(page_urls)} URLs on {url}")

                            # Filter URLs and add to next level
                            filtered_urls = self.filterUrlsOnlyPages(page_urls, max_per_domain)
                            logger.info(f"Filtered to {len(filtered_urls)} valid URLs")

                            # Add new URLs to next level (avoiding already processed ones)
                            new_urls_count = 0
                            for new_url in filtered_urls:
                                normalized_new_url = self._normalizeUrl(new_url)
                                if normalized_new_url not in processed_urls:
                                    next_level_urls.append(new_url)
                                    new_urls_count += 1
                                else:
                                    logger.debug(f"URL {new_url} (normalized: {normalized_new_url}) already processed, skipping")

                            logger.info(f"Added {new_urls_count} new URLs to next level from {url}")
                        else:
                            logger.warning(f"✗ No content extracted from {url}")
                            processed_urls.add(normalized_url)  # Mark as processed to avoid retry

                    except Exception as e:
                        logger.warning(f"✗ Failed to process URL {url} at depth {depth}: {e}")
                        processed_urls.add(normalized_url)  # Mark as processed to avoid retry

                # Prepare for next iteration
                current_level_urls = next_level_urls
                logger.info(f"Depth {depth} completed. Found {len(next_level_urls)} URLs for next level")

                # Stop if no more URLs to process
                if not current_level_urls:
                    logger.info(f"No more URLs found at depth {depth}, stopping recursion")
                    break

            logger.info(f"Recursive crawl completed: {len(all_content)} total pages crawled")
            logger.info(f"Total URLs processed (including skipped): {len(processed_urls)}")
            logger.info(f"Unique URLs found: {len(all_content)}")
            return all_content

        except asyncio.TimeoutError:
            logger.warning(f"Crawling timed out, returning partial results: {len(all_content)} pages crawled so far")
            return all_content
        except Exception as e:
            logger.error(f"Crawling failed with error: {e}, returning partial results: {len(all_content)} pages crawled so far")
            return all_content

    async def webQuery(self, query: str, context: str = "", options: AiCallOptions = None) -> AiCallResponse:
        """Use Perplexity AI to provide the best answers for web-related queries."""

        if options is None:
            options = AiCallOptions(operationType=OperationTypeEnum.WEB_RESEARCH)

        # Calculate input bytes
        inputBytes = len((query + context).encode("utf-8"))

        # Create a comprehensive prompt for web queries
        webPrompt = f"""You are an expert web researcher and information analyst. Please provide a comprehensive and accurate answer to the following web-related query.

Query: {query}

{f"Additional Context: {context}" if context else ""}

Please provide:
1. A clear, well-structured answer to the query
2. Key points and important details
3. Relevant insights and analysis
4. Any important considerations or caveats
5. Suggestions for further research if applicable

Format your response in a clear, professional manner that would be helpful for someone researching this topic."""

        try:
            # Start timing
            startTime = time.time()

            # Use Perplexity for web research with search capabilities
            perplexity_connector = modelRegistry.getConnectorForModel("perplexity_callAiWithWebSearch")
            if not perplexity_connector:
                raise ValueError("Perplexity connector not available")
            response = await perplexity_connector.callAiWithWebSearch(webPrompt)

            # Calculate timing and output bytes
            endTime = time.time()
            processingTime = endTime - startTime
            outputBytes = len(response.encode("utf-8"))

            # Calculate price using Perplexity model pricing
            perplexity_model = modelRegistry.getModel("perplexity_callAiWithWebSearch")
            priceUsd = perplexity_model.calculatePriceUsd(inputBytes, outputBytes)

            logger.info(f"✅ Web query successful with Perplexity")
            return AiCallResponse(
                content=response,
                modelName="perplexity_callAiWithWebSearch",
                priceUsd=priceUsd,
                processingTime=processingTime,
                bytesSent=inputBytes,
                bytesReceived=outputBytes,
                errorCount=0
            )
        except Exception as e:
            logger.error(f"Perplexity web query failed: {str(e)}")
            return AiCallResponse(
                content=f"Web query failed: {str(e)}",
                modelName="perplexity_callAiWithWebSearch",
                priceUsd=0.0,
                processingTime=0.0,
                bytesSent=inputBytes,
                bytesReceived=0,
                errorCount=1
            )

    # Utility methods
    async def listAvailableModels(self, connectorType: str = None) -> List[Dict[str, Any]]:
        """List available models, optionally filtered by connector type."""
        models = modelRegistry.getAvailableModels()
        if connectorType:
            return [model.dict() for model in models if model.connectorType == connectorType]
        return [model.dict() for model in models]

    async def getModelInfo(self, modelName: str) -> Dict[str, Any]:
        """Get information about a specific model."""
        model = modelRegistry.getModel(modelName)
        if not model:
            raise ValueError(f"Model {modelName} not found")
        return model.dict()


    async def getModelsByTag(self, tag: str) -> List[str]:
        """Get model names that have a specific tag."""
        models = modelRegistry.getModelsByTag(tag)
        return [model.name for model in models]

    async def selectRelevantWebsites(self, websites: List[str], userQuestion: str) -> Tuple[List[str], str]:
        """Select most relevant websites using AI analysis. Returns (selected_websites, ai_response)."""
        if len(websites) <= 1:
            return websites, "Only one website available, no selection needed"

        try:
            # Create website summaries for AI analysis
            websiteSummaries = []
            for i, url in enumerate(websites, 1):
                from urllib.parse import urlparse
                domain = urlparse(url).netloc
                summary = f"{i}. {url} (Domain: {domain})"
                websiteSummaries.append(summary)

            selectionPrompt = f"""
            Based on this user request: "{userQuestion}"

            I have {len(websites)} websites found. Please select the most relevant website(s) for this request.

            Available websites:
            {chr(10).join(websiteSummaries)}

            Please respond with the website number(s) (1, 2, 3, etc.) that are most relevant.
            Format: 1,3,5 (or just 1 for single selection)
            """

            # Use Perplexity to select the best websites
            response = await self.webQuery(selectionPrompt)

            # Parse the selection
            import re
            numbers = re.findall(r'\d+', response)
            if numbers:
                selectedWebsites = []
                for num in numbers:
                    index = int(num) - 1
                    if 0 <= index < len(websites):
                        selectedWebsites.append(websites[index])

                if selectedWebsites:
                    logger.info(f"AI selected {len(selectedWebsites)} websites")
                    return selectedWebsites, response

            # Fallback to first website
            logger.warning("AI selection failed, using first website")
            return websites[:1], f"AI selection failed, fallback to first website. AI response: {response}"

        except Exception as e:
            logger.error(f"Error in website selection: {str(e)}")
            return websites[:1], f"Error in website selection: {str(e)}"

    async def analyzeContentWithChunking(self, allContent: Dict[str, str], userQuestion: str) -> str:
        """Analyze content using AI with chunking for large content."""
        logger.info(f"Analyzing {len(allContent)} websites with AI")

        # Process content in chunks to avoid token limits
        chunkSize = 50000  # 50k chars per chunk
        allChunks = []

        for url, content in allContent.items():
            filteredContent = self._filterContent(content)
            if len(filteredContent) <= chunkSize:
                allChunks.append((url, filteredContent))
                logger.info(f"Content from {url}: {len(filteredContent)} chars (single chunk)")
            else:
                # Split large content into chunks
                chunkCount = (len(filteredContent) + chunkSize - 1) // chunkSize
                logger.info(f"Content from {url}: {len(filteredContent)} chars (split into {chunkCount} chunks)")
                for i in range(0, len(filteredContent), chunkSize):
                    chunk = filteredContent[i:i+chunkSize]
                    chunkNum = i//chunkSize + 1
                    allChunks.append((f"{url} (part {chunkNum})", chunk))

        logger.info(f"Processing {len(allChunks)} content chunks")

        # Analyze each chunk
        chunkAnalyses = []
        for i, (url, chunk) in enumerate(allChunks, 1):
            logger.info(f"Analyzing chunk {i}/{len(allChunks)}: {url}")

            try:
                analysisPrompt = f"""
                Analyze this web content and extract relevant information for: {userQuestion}

                Source: {url}
                Content: {chunk}

                Please extract key information relevant to the query.
                """

                analysis = await self.webQuery(analysisPrompt)
                chunkAnalyses.append(analysis)
                logger.info(f"Chunk {i}/{len(allChunks)} analyzed successfully")

            except Exception as e:
                logger.error(f"Chunk {i}/{len(allChunks)} error: {e}")

        # Combine all chunk analyses
        if chunkAnalyses:
            logger.info(f"Combining {len(chunkAnalyses)} chunk analyses")
            combinedAnalysis = "\n\n".join(chunkAnalyses)

            # Final synthesis
            try:
                logger.info("Performing final synthesis of all analyses")
                synthesisPrompt = f"""
                Based on these partial analyses, provide a comprehensive answer to: {userQuestion}

                Partial analyses:
                {combinedAnalysis}

                Please provide a clear, well-structured answer to the query.
                """

                finalAnalysis = await self.webQuery(synthesisPrompt)
                logger.info("Final synthesis completed successfully")
                return finalAnalysis

            except Exception as e:
                logger.error(f"Synthesis error: {e}")
                return combinedAnalysis
        else:
            logger.error("No content could be analyzed")
            return "No content could be analyzed"

    def _filterContent(self, content: str) -> str:
        """Filter out navigation, ads, and other nonsense content."""
        lines = content.split('\n')
        filteredLines = []

        for line in lines:
            line = line.strip()
            # Skip empty lines
            if not line:
                continue
            # Skip navigation elements
            if any(skip in line.lower() for skip in [
                'toggle navigation', 'log in', 'sign up', 'cookies', 'privacy policy',
                'terms of service', 'subscribe', 'newsletter', 'follow us', 'share this',
                'advertisement', 'sponsored', 'banner', 'popup', 'modal'
            ]):
                continue
            # Skip image references without context
            if line.startswith('![Image') and '](' in line:
                continue
            # Skip pure links without context
            if line.startswith('[') and line.endswith(')') and '---' in line:
                continue
            # Keep meaningful content
            if len(line) > 10:  # Skip very short lines
                filteredLines.append(line)

        return '\n'.join(filteredLines)