import asyncio
import logging
import re
import time
from dataclasses import dataclass
from typing import Dict, Any, List, Union, Tuple, Optional
from urllib.parse import quote, urljoin, urlparse, urlunparse

from modules.aicore.aicoreModelRegistry import modelRegistry
from modules.aicore.aicoreModelSelector import model_selector
from modules.datamodels.datamodelAi import (
    AiModel,
    AiCallOptions,
    AiCallRequest,
    AiCallResponse,
    OperationTypeEnum,
)
from modules.datamodels.datamodelWeb import (
    WebResearchRequest,
    WebResearchActionResult,
    WebSearchResultItem,
    WebCrawlResultItem,
    WebSearchRequest,
    WebCrawlRequest,
)
from modules.datamodels.datamodelChat import ActionDocument

logger = logging.getLogger(__name__)

# Dynamic model registry - models are now loaded from connectors via aicore system

# Marker inside a prompt that _callWithModel swaps for the model's token limit.
# NOTE(review): the marker literal is empty in this file (it looks like it was
# lost). Replacing an empty string inserts the limit between every character of
# the prompt, so substitution stays disabled until the real marker is restored.
_TOKEN_LIMIT_PLACEHOLDER = ""
_DEFAULT_TOKEN_LIMIT = "16000"  # Default for text generation

# Byte budgets applied when the caller asks for prompt/context compression.
_PROMPT_BYTE_LIMIT = 2000
_CONTEXT_BYTE_LIMIT = 70000

# Registry name used by webQuery for both the connector and the price lookup.
_PERPLEXITY_WEB_MODEL = "perplexity_callAiWithWebSearch"

# '%' is kept safe so that an already percent-encoded query is not encoded twice.
_QUERY_SAFE_CHARS = "=&%"

# Link targets that can never be crawled as pages.
_NON_NAVIGABLE_PREFIXES = ("#", "javascript:", "mailto:", "tel:")

# Path suffixes of resources that are not HTML pages.
_NON_PAGE_EXTENSIONS = (
    '.jpg', '.jpeg', '.png', '.gif', '.svg', '.webp', '.ico', '.bmp',
    '.mp4', '.mp3', '.avi', '.mov', '.mkv',
    '.pdf', '.zip', '.rar', '.7z', '.tar', '.gz',
    '.css', '.js', '.woff', '.woff2', '.ttf', '.eot',
)

# Lower-case fragments; a line containing any of them is dropped by _filterContent.
_NOISE_MARKERS = (
    'toggle navigation', 'log in', 'sign up', 'cookies', 'privacy policy',
    'terms of service', 'subscribe', 'newsletter', 'follow us', 'share this',
    'advertisement', 'sponsored', 'banner', 'popup', 'modal',
)

_HTML_LINK_RE = re.compile(r'<a[^>]+href=["\']([^"\']+)["\'][^>]*>', re.IGNORECASE)
_MARKDOWN_LINK_RE = re.compile(r'\[([^\]]+)\]\(([^)]+)\)')
_PLAIN_URL_RE = re.compile(r'https?://[^\s\)]+')


def _truncateUtf8(text: str, limit: int) -> str:
    """Return *text* cut to at most *limit* UTF-8 bytes, marked as truncated.

    Text that already fits is returned unchanged. A multi-byte character split
    by the cut is dropped rather than raising.
    """
    data = text.encode("utf-8")
    if len(data) <= limit:
        return text
    return data[:limit].decode("utf-8", errors="ignore") + "... [truncated]"


def _encodeUrlQuery(url: str) -> str:
    """Return *url* with its query string percent-encoded.

    '=', '&' and '%' are left untouched, so calling this on an already encoded
    URL does not change it.
    """
    parsed = urlparse(url)
    encodedQuery = quote(parsed.query, safe=_QUERY_SAFE_CHARS)
    return urlunparse((
        parsed.scheme,
        parsed.netloc,
        parsed.path,
        parsed.params,
        encodedQuery,
        parsed.fragment,
    ))


def _isPageUrl(url: str) -> bool:
    """Return True unless the URL path ends in a known non-page extension."""
    return not urlparse(url).path.lower().endswith(_NON_PAGE_EXTENSIONS)


@dataclass(slots=True)
class AiObjects:
    """Centralized AI interface: dynamically discovers and uses AI models. Includes web functionality."""

    def __post_init__(self) -> None:
        # Auto-discover and register all available connectors
        self._discoverAndRegisterConnectors()

    def _discoverAndRegisterConnectors(self) -> None:
        """Auto-discover and register all available AI connectors."""
        logger.info("Auto-discovering AI connectors...")

        # Use the model registry's built-in discovery mechanism
        discoveredConnectors = modelRegistry.discoverConnectors()

        for connector in discoveredConnectors:
            modelRegistry.registerConnector(connector)
            logger.info(f"Registered connector: {connector.getConnectorType()}")

        logger.info(f"Total connectors registered: {len(discoveredConnectors)}")
        logger.info("All AI connectors registered with dynamic model registry")

    @classmethod
    async def create(cls) -> "AiObjects":
        """Create AiObjects instance with auto-discovered connectors."""
        # No need to manually create connectors - they're auto-discovered
        return cls()

    @staticmethod
    def _errorResponse(message: str, inputBytes: int, modelName: str = "error") -> AiCallResponse:
        """Build the zero-cost response used to report a failed call."""
        return AiCallResponse(
            content=message,
            modelName=modelName,
            priceUsd=0.0,
            processingTime=0.0,
            bytesSent=inputBytes,
            bytesReceived=0,
            errorCount=1,
        )

    def _selectModel(self, prompt: str, context: str, options: AiCallOptions) -> str:
        """Select the best model using dynamic model selection system.

        Returns:
            The name of the selected model.

        Raises:
            ValueError: If the registry has no models or the selector picks none.
        """
        availableModels = modelRegistry.getAvailableModels()
        if not availableModels:
            logger.error("No models available in the registry")
            raise ValueError("No AI models available")

        selectedModel = model_selector.selectModel(prompt, context, options, availableModels)
        if not selectedModel:
            logger.error("No suitable model found for the given criteria")
            raise ValueError("No suitable AI model found")

        logger.info(f"Selected model: {selectedModel.name} ({selectedModel.displayName})")
        return selectedModel.name

    async def call(self, request: AiCallRequest) -> AiCallResponse:
        """Call AI model for text generation with fallback mechanism.

        Each model returned by the selector is tried in order until one
        succeeds. Failures are never raised: if no model is suitable, or all of
        them fail, a response with ``modelName="error"`` and ``errorCount=1``
        is returned.
        """
        prompt = request.prompt
        context = request.context or ""
        options = request.options

        # Measured before any truncation, so it reflects what the caller supplied
        inputBytes = len((prompt + context).encode("utf-8"))

        # Compress optionally (prompt/context) - simple truncation fallback kept here
        if options.compressPrompt:
            prompt = _truncateUtf8(prompt, _PROMPT_BYTE_LIMIT)
        if options.compressContext:
            context = _truncateUtf8(context, _CONTEXT_BYTE_LIMIT)

        # Derive generation parameters
        temperature = getattr(options, "temperature", None)
        if temperature is None:
            temperature = 0.2
        maxTokens = getattr(options, "maxTokens", None)

        # Get fallback models for this operation type
        availableModels = modelRegistry.getAvailableModels()
        fallbackModels = model_selector.getFallbackModels(prompt, context, options, availableModels)
        if not fallbackModels:
            errorMsg = f"No suitable models found for operation {options.operationType}"
            logger.error(errorMsg)
            return self._errorResponse(errorMsg, inputBytes)

        lastError: Optional[Exception] = None
        total = len(fallbackModels)
        for attempt, model in enumerate(fallbackModels, start=1):
            logger.info(f"Attempting AI call with model: {model.name} (attempt {attempt}/{total})")
            try:
                response = await self._callWithModel(model, prompt, context, temperature, maxTokens, inputBytes)
            except Exception as e:  # any model failure moves on to the next fallback
                lastError = e
                logger.warning(f"❌ AI call failed with model {model.name}: {str(e)}")
                if attempt < total:
                    logger.info("🔄 Trying next fallback model...")
                else:
                    logger.error(f"💥 All {total} models failed for operation {options.operationType}")
            else:
                logger.info(f"✅ AI call successful with model: {model.name}")
                return response

        # All fallback attempts failed - return error response
        errorMsg = f"All AI models failed for operation {options.operationType}. Last error: {str(lastError)}"
        logger.error(errorMsg)
        return self._errorResponse(errorMsg, inputBytes)

    async def _callWithModel(self, model: AiModel, prompt: str, context: str, temperature: float,
                             maxTokens: Optional[int], inputBytes: int) -> AiCallResponse:
        """Call a specific model and return the response.

        Raises:
            ValueError: If the model has no connector or no function call.
            Exception: Whatever the model's function call raises; ``call``
                treats any exception as a reason to try the next model.
        """
        contextLength = model.contextLength
        tokenLimit = str(contextLength) if contextLength > 0 else _DEFAULT_TOKEN_LIMIT

        # Work on a per-model copy of the prompt. The substitution is skipped
        # for an empty marker because str.replace("", x) would insert x between
        # every character of the prompt.
        modelPrompt = prompt
        if _TOKEN_LIMIT_PLACEHOLDER and _TOKEN_LIMIT_PLACEHOLDER in modelPrompt:
            modelPrompt = modelPrompt.replace(_TOKEN_LIMIT_PLACEHOLDER, tokenLimit)
            logger.debug(f"Replaced {_TOKEN_LIMIT_PLACEHOLDER} with {tokenLimit} for model {model.name}")

        messages = []
        if context:
            messages.append({"role": "system", "content": f"Context from documents:\n{context}"})
        messages.append({"role": "user", "content": modelPrompt})

        startTime = time.time()

        # The connector itself is not used here; the lookup only verifies that
        # the model is backed by a registered connector.
        if not modelRegistry.getConnectorForModel(model.name):
            raise ValueError(f"No connector found for model {model.name}")

        if not model.functionCall:
            raise ValueError(f"Model {model.name} has no function call defined")

        # The Perplexity entry points take different arguments, so dispatch on
        # the model name prefix.
        if model.name.startswith("perplexity_callAiWithWebSearch"):
            query = f"Context: {context}\n\nQuery: {modelPrompt}" if context else modelPrompt
            content = await model.functionCall(query, temperature=temperature, maxTokens=maxTokens)
        elif model.name.startswith("perplexity_researchTopic"):
            content = await model.functionCall(modelPrompt)
        elif model.name.startswith("perplexity_answerQuestion"):
            content = await model.functionCall(modelPrompt, context)
        elif model.name.startswith("perplexity_getCurrentNews"):
            content = await model.functionCall(modelPrompt)
        elif model.connectorType == "anthropic":
            # This connector's result is indexed as a chat-completion style mapping
            response = await model.functionCall(messages, temperature=temperature, maxTokens=maxTokens)
            content = response["choices"][0]["message"]["content"]
        else:
            # Standard callAiBasic
            content = await model.functionCall(messages, temperature=temperature, maxTokens=maxTokens)

        processingTime = time.time() - startTime
        outputBytes = len(content.encode("utf-8"))

        # Calculate price using model's own price calculation method
        priceUsd = model.calculatePriceUsd(inputBytes, outputBytes)

        return AiCallResponse(
            content=content,
            modelName=model.name,
            priceUsd=priceUsd,
            processingTime=processingTime,
            bytesSent=inputBytes,
            bytesReceived=outputBytes,
            errorCount=0,
        )

    async def callImage(self, prompt: str, imageData: Union[str, bytes], mimeType: Optional[str] = None,
                        options: Optional[AiCallOptions] = None) -> AiCallResponse:
        """Call AI model for image analysis with fallback mechanism.

        Failures are reported through an error response (``modelName="error"``,
        ``errorCount=1``) instead of being raised.
        """
        if options is None:
            options = AiCallOptions(operationType=OperationTypeEnum.IMAGE_ANALYSE)

        # Input size is the prompt plus the image (raw length for bytes,
        # UTF-8 length of the string form otherwise)
        promptBytes = len(prompt.encode("utf-8"))
        if isinstance(imageData, bytes):
            inputBytes = promptBytes + len(imageData)
        else:
            inputBytes = promptBytes + len(str(imageData).encode("utf-8"))

        # Get fallback models for image analysis
        availableModels = modelRegistry.getAvailableModels()
        fallbackModels = model_selector.getFallbackModels(prompt, "", options, availableModels)
        if not fallbackModels:
            errorMsg = "No suitable models found for image analysis"
            logger.error(errorMsg)
            return self._errorResponse(errorMsg, inputBytes)

        lastError: Optional[Exception] = None
        total = len(fallbackModels)
        for attempt, model in enumerate(fallbackModels, start=1):
            logger.info(f"Attempting image analysis with model: {model.name} (attempt {attempt}/{total})")
            try:
                response = await self._callImageWithModel(model, prompt, imageData, mimeType, inputBytes)
            except Exception as e:  # any model failure moves on to the next fallback
                lastError = e
                logger.warning(f"❌ Image analysis failed with model {model.name}: {str(e)}")
                if attempt < total:
                    logger.info("🔄 Trying next fallback model for image analysis...")
                else:
                    logger.error(f"💥 All {total} models failed for image analysis")
            else:
                logger.info(f"✅ Image analysis successful with model: {model.name}")
                return response

        # All fallback attempts failed - return error response
        errorMsg = f"All AI models failed for image analysis. Last error: {str(lastError)}"
        logger.error(errorMsg)
        return self._errorResponse(errorMsg, inputBytes)

    async def _callImageWithModel(self, model: AiModel, prompt: str, imageData: Union[str, bytes],
                                  mimeType: Optional[str], inputBytes: int) -> AiCallResponse:
        """Call a specific model for image analysis and return the response.

        Raises:
            ValueError: If the model has no function call defined.
        """
        startTime = time.time()

        if not model.functionCall:
            raise ValueError(f"Model {model.name} has no function call defined")
        content = await model.functionCall(prompt, imageData, mimeType)

        processingTime = time.time() - startTime
        outputBytes = len(content.encode("utf-8"))

        # Calculate price using model's own price calculation method
        priceUsd = model.calculatePriceUsd(inputBytes, outputBytes)

        return AiCallResponse(
            content=content,
            modelName=model.name,
            priceUsd=priceUsd,
            processingTime=processingTime,
            bytesSent=inputBytes,
            bytesReceived=outputBytes,
            errorCount=0,
        )

    async def generateImage(self, prompt: str, size: str = "1024x1024", quality: str = "standard",
                            style: str = "vivid", options: Optional[AiCallOptions] = None) -> AiCallResponse:
        """Generate an image using AI.

        A single model is selected (no fallback chain). The model's result is
        converted with ``str()`` and returned as the response content. Failures
        are reported through an error response instead of being raised.
        """
        if options is None:
            options = AiCallOptions(operationType=OperationTypeEnum.IMAGE_GENERATE)

        inputBytes = len(prompt.encode("utf-8"))

        # Bound before the try block so the error path can always name a model,
        # even when model selection itself is what failed.
        modelName = "error"
        try:
            modelName = self._selectModel(prompt, "", options)
            selectedModel = modelRegistry.getModel(modelName)
            if not selectedModel:
                raise ValueError(f"Selected model {modelName} not found in registry")

            # The lookup only verifies that a connector backs the model
            if not modelRegistry.getConnectorForModel(modelName):
                raise ValueError(f"No connector found for model {modelName}")

            startTime = time.time()

            if not selectedModel.functionCall:
                raise ValueError(f"Model {modelName} has no function call defined")
            result = await selectedModel.functionCall(prompt, size, quality, style)
            content = str(result)

            processingTime = time.time() - startTime
            outputBytes = len(content.encode("utf-8"))

            # Calculate price using model's own price calculation method
            priceUsd = selectedModel.calculatePriceUsd(inputBytes, outputBytes)

            logger.info(f"✅ Image generation successful with model: {modelName}")
            # Same keyword set as every other AiCallResponse built in this module
            return AiCallResponse(
                content=content,
                modelName=modelName,
                priceUsd=priceUsd,
                processingTime=processingTime,
                bytesSent=inputBytes,
                bytesReceived=outputBytes,
                errorCount=0,
            )
        except Exception as e:
            logger.error(f"❌ Image generation failed with model {modelName}: {str(e)}")
            return self._errorResponse(f"Image generation failed: {str(e)}", inputBytes, modelName)

    # Web functionality methods - Simple interface to Tavily connector

    async def searchWebsites(self, query: str, maxResults: int = 5, **kwargs) -> List[WebSearchResultItem]:
        """Search for websites using Tavily.

        Extra keyword arguments are forwarded to ``WebSearchRequest``.

        Returns:
            The results of the first returned document, or an empty list when
            the search was unsuccessful or returned no documents.

        Raises:
            ValueError: If no connector is registered for ``tavily_search``.
        """
        request = WebSearchRequest(query=query, max_results=maxResults, **kwargs)

        tavilyConnector = modelRegistry.getConnectorForModel("tavily_search")
        if not tavilyConnector:
            raise ValueError("Tavily connector not available")

        result = await tavilyConnector.search(request)
        if result.success and result.documents:
            return result.documents[0].documentData.results
        return []

    async def crawlWebsites(self, urls: List[str], extractDepth: str = "advanced",
                            format: str = "markdown") -> List[WebCrawlResultItem]:
        """Crawl websites using Tavily.

        URLs without a scheme get ``https://`` prepended; URLs that cannot be
        turned into an ``HttpUrl`` are logged and skipped.

        Returns:
            The results of the first returned document, or an empty list when
            no URL was usable or the crawl returned nothing.

        Raises:
            ValueError: If no connector is registered for ``tavily_crawl``.
        """
        from pydantic import HttpUrl

        httpUrls = []
        for url in urls:
            try:
                if not urlparse(url).scheme:
                    url = f"https://{url}"
                # Use HttpUrl with scheme parameter (this works for all URLs)
                httpUrls.append(HttpUrl(url, scheme="https"))
            except Exception as e:
                logger.warning(f"Skipping invalid URL {url}: {e}")

        if not httpUrls:
            return []

        request = WebCrawlRequest(urls=httpUrls, extract_depth=extractDepth, format=format)

        tavilyConnector = modelRegistry.getConnectorForModel("tavily_crawl")
        if not tavilyConnector:
            raise ValueError("Tavily connector not available")

        result = await tavilyConnector.crawl(request)
        if result.success and result.documents:
            return result.documents[0].documentData.results
        return []

    async def extractContent(self, urls: List[str], extractDepth: str = "advanced",
                             format: str = "markdown") -> Dict[str, str]:
        """Extract content from URLs and return as dictionary.

        Keys are ``str(result.url)`` of each crawl result, values its content.
        """
        crawlResults = await self.crawlWebsites(urls, extractDepth, format)
        return {str(result.url): result.content for result in crawlResults}

    # Core Web Tools - Clean interface for web operations

    async def readPage(self, url: str, extractDepth: str = "advanced") -> Optional[str]:
        """Read a single web page and return its content (HTML/Markdown).

        Returns:
            The page content, or None when nothing was returned or any error
            occurred (errors are logged, never raised).
        """
        logger.debug(f"Reading page: {url}")
        try:
            # Encode the query string to handle spaces and special characters
            encodedUrl = _encodeUrlQuery(url)
            logger.debug(f"URL encoded: {url} -> {encodedUrl}")

            content = await self.extractContent([encodedUrl], extractDepth, "markdown")
            result = content.get(encodedUrl)
            if result is None and len(content) == 1:
                # Only one URL was requested, so a single result is this page
                # even if it is reported under a differently written URL.
                result = next(iter(content.values()))

            if result:
                logger.debug(f"Successfully read page {encodedUrl}: {len(result)} chars")
            else:
                logger.warning(f"No content returned for page {encodedUrl}")
            return result
        except Exception as e:
            logger.warning(f"Failed to read page {url}: {e}")
            return None

    async def getUrlsFromPage(self, url: str, extractDepth: str = "advanced") -> List[str]:
        """Get all URLs from a web page, with redundancies removed.

        Returns:
            Links in first-seen order; an empty list when the page has no
            content or any error occurred.
        """
        try:
            content = await self.readPage(url, extractDepth)
            if not content:
                return []

            # dict.fromkeys removes duplicates while preserving order
            uniqueLinks = list(dict.fromkeys(self._extractLinksFromContent(content, url)))
            logger.debug(f"Extracted {len(uniqueLinks)} unique URLs from {url}")
            return uniqueLinks
        except Exception as e:
            logger.warning(f"Failed to get URLs from page {url}: {e}")
            return []

    def filterUrlsOnlyPages(self, urls: List[str], maxPerDomain: int = 10) -> List[str]:
        """Filter URLs to get only links for pages to follow (no images, etc.).

        Links are grouped by domain in first-seen order; within each domain
        duplicates and non-page resources are dropped and collection stops once
        ``maxPerDomain`` links have been kept. The extension check looks at the
        URL path, so a query string or fragment does not hide an image or
        archive link.
        """
        domainLinks: Dict[str, List[str]] = {}
        for link in urls:
            domainLinks.setdefault(urlparse(link).netloc, []).append(link)

        filteredLinks: List[str] = []
        for domain, candidates in domainLinks.items():
            kept: List[str] = []
            seen = set()
            for link in candidates:
                if link in seen or not _isPageUrl(link):
                    continue
                seen.add(link)
                kept.append(link)
                if len(kept) >= maxPerDomain:
                    break
            filteredLinks.extend(kept)
            logger.debug(f"Domain {domain}: {len(candidates)} -> {len(kept)} links")

        return filteredLinks

    def _extractLinksFromContent(self, content: str, baseUrl: str) -> List[str]:
        """Extract links from HTML/Markdown content.

        Three sources are scanned in order: HTML anchor tags, markdown
        ``[text](url)`` links and bare ``http(s)://`` URLs. Markdown and bare
        links are kept only when they are on the same domain as ``baseUrl``;
        anchor-tag links are kept regardless of domain. Only bare URLs are
        de-duplicated against what was already collected.

        Returns:
            Absolute, query-encoded URLs; an empty list on any error.
        """
        try:
            baseDomain = urlparse(baseUrl).netloc

            def _cleanUrl(url: str) -> str:
                """Clean and encode URL to remove spaces and invalid characters."""
                url = url.strip().strip('"\'')
                # Relative URLs are resolved against the page they came from
                if not url.startswith(('http://', 'https://')):
                    url = urljoin(baseUrl, url)
                return _encodeUrlQuery(url)

            links: List[str] = []

            # HTML links: <a ... href="..."> tags
            for url in _HTML_LINK_RE.findall(content):
                if not url or url.startswith(_NON_NAVIGABLE_PREFIXES):
                    continue
                try:
                    cleanedUrl = _cleanUrl(url)
                    links.append(cleanedUrl)
                    logger.debug(f"Extracted HTML link: {url} -> {cleanedUrl}")
                except Exception as e:
                    logger.debug(f"Failed to clean HTML link {url}: {e}")

            # Markdown links: [text](url) format
            for _text, url in _MARKDOWN_LINK_RE.findall(content):
                if not url or url.startswith(_NON_NAVIGABLE_PREFIXES):
                    continue
                try:
                    cleanedUrl = _cleanUrl(url)
                    # Only keep URLs from the same domain
                    if urlparse(cleanedUrl).netloc == baseDomain:
                        links.append(cleanedUrl)
                        logger.debug(f"Extracted markdown link: {url} -> {cleanedUrl}")
                except Exception as e:
                    logger.debug(f"Failed to clean markdown link {url}: {e}")

            # Plain URLs in the text
            seen = set(links)
            for url in _PLAIN_URL_RE.findall(content):
                try:
                    # Sentence punctuation right after a URL is not part of it
                    cleanedUrl = _cleanUrl(url.rstrip('.,;!?'))
                    if urlparse(cleanedUrl).netloc == baseDomain and cleanedUrl not in seen:
                        seen.add(cleanedUrl)
                        links.append(cleanedUrl)
                        logger.debug(f"Extracted plain URL: {url} -> {cleanedUrl}")
                except Exception as e:
                    logger.debug(f"Failed to clean plain URL {url}: {e}")

            logger.debug(f"Total links extracted and cleaned: {len(links)}")
            return links
        except Exception as e:
            logger.warning(f"Failed to extract links from content: {e}")
            return []

    def _normalizeUrl(self, url: str) -> str:
        """Normalize URL to handle variations that should be considered duplicates.

        The fragment is removed, then trailing slashes, and a leading
        ``http://`` scheme is rewritten to ``https://``. Empty input is
        returned unchanged.
        """
        if not url:
            return url

        # Drop the fragment first so that "page/#top" and "page" compare equal
        url = url.split('#', 1)[0].rstrip('/')

        # Only the scheme is normalized; an "http://" inside the path or query
        # belongs to the URL's data and must stay as it is.
        if url.startswith('http://'):
            url = 'https://' + url[len('http://'):]

        return url

    async def crawlRecursively(self, urls: List[str], max_depth: int, extract_depth: str = "advanced",
                               max_per_domain: int = 10,
                               global_processed_urls: Optional[set] = None) -> Dict[str, str]:
        """
        Recursively crawl URLs up to specified depth.

        Each page is fetched once; its links are extracted from the fetched
        content and, after filtering, become the next level's URLs.

        Args:
            urls: List of starting URLs to crawl
            max_depth: Maximum depth to crawl (1=main pages only, 2=main+sub-pages, etc.)
            extract_depth: Tavily extract depth setting
            max_per_domain: Maximum URLs per domain per level
            global_processed_urls: Optional global set to track processed URLs across sessions;
                it is updated in place with the normalized form of every URL attempted

        Returns:
            Dictionary mapping URL -> content for all crawled pages. On a timeout or any
            other error the pages gathered so far are returned.
        """
        logger.info(f"Starting recursive crawl: {len(urls)} starting URLs, max_depth={max_depth}")

        # URL index to track all processed URLs (caller-supplied or local)
        if global_processed_urls is not None:
            processed_urls = global_processed_urls
            logger.info(f"Using global URL index with {len(processed_urls)} already processed URLs")
        else:
            processed_urls = set()
            logger.info("Using local URL index for this crawl session")

        all_content: Dict[str, str] = {}
        current_level_urls = list(urls)

        try:
            for depth in range(1, max_depth + 1):
                logger.info(f"=== DEPTH LEVEL {depth}/{max_depth} ===")
                logger.info(f"Processing {len(current_level_urls)} URLs at depth {depth}")

                # URLs found at this level (for next iteration)
                next_level_urls: List[str] = []

                for url in current_level_urls:
                    normalized_url = self._normalizeUrl(url)
                    if normalized_url in processed_urls:
                        logger.debug(f"URL {url} (normalized: {normalized_url}) already processed, skipping")
                        continue

                    try:
                        logger.info(f"Processing URL at depth {depth}: {url}")
                        logger.debug(f"Total processed URLs so far: {len(processed_urls)}")

                        content = await self.readPage(url, extract_depth)
                        # Marked as processed whether or not content came back, to avoid retries
                        processed_urls.add(normalized_url)

                        if not content:
                            logger.warning(f"✗ No content extracted from {url}")
                            continue

                        all_content[url] = content
                        logger.info(f"✓ Successfully processed {url}: {len(content)} chars")

                        # Links come from the content already fetched, so the
                        # page is not requested a second time.
                        page_urls = list(dict.fromkeys(self._extractLinksFromContent(content, url)))
                        logger.info(f"Found {len(page_urls)} URLs on {url}")

                        filtered_urls = self.filterUrlsOnlyPages(page_urls, max_per_domain)
                        logger.info(f"Filtered to {len(filtered_urls)} valid URLs")

                        # Add new URLs to next level (avoiding already processed ones)
                        new_urls_count = 0
                        for new_url in filtered_urls:
                            normalized_new_url = self._normalizeUrl(new_url)
                            if normalized_new_url in processed_urls:
                                logger.debug(
                                    f"URL {new_url} (normalized: {normalized_new_url}) already processed, skipping"
                                )
                                continue
                            next_level_urls.append(new_url)
                            new_urls_count += 1

                        logger.info(f"Added {new_urls_count} new URLs to next level from {url}")
                    except Exception as e:
                        logger.warning(f"✗ Failed to process URL {url} at depth {depth}: {e}")
                        processed_urls.add(normalized_url)  # Mark as processed to avoid retry

                # Prepare for next iteration
                current_level_urls = next_level_urls
                logger.info(f"Depth {depth} completed. Found {len(next_level_urls)} URLs for next level")

                if not current_level_urls:
                    logger.info(f"No more URLs found at depth {depth}, stopping recursion")
                    break

            logger.info(f"Recursive crawl completed: {len(all_content)} total pages crawled")
            logger.info(f"Total URLs processed (including skipped): {len(processed_urls)}")
            logger.info(f"Unique URLs found: {len(all_content)}")
            return all_content

        except asyncio.TimeoutError:
            logger.warning(f"Crawling timed out, returning partial results: {len(all_content)} pages crawled so far")
            return all_content
        except Exception as e:
            logger.error(
                f"Crawling failed with error: {e}, returning partial results: {len(all_content)} pages crawled so far"
            )
            return all_content

    async def webQuery(self, query: str, context: str = "", options: Optional[AiCallOptions] = None) -> AiCallResponse:
        """Use Perplexity AI to provide the best answers for web-related queries.

        Failures are never raised: they are returned as a response with
        ``errorCount=1`` whose content starts with "Web query failed:".
        ``bytesSent`` counts the query and context, not the surrounding
        instructions added here.
        """
        if options is None:
            options = AiCallOptions(operationType=OperationTypeEnum.WEB_RESEARCH)

        inputBytes = len((query + context).encode("utf-8"))

        # Create a comprehensive prompt for web queries
        contextLine = f"Additional Context: {context}" if context else ""
        webPrompt = "\n".join([
            "You are an expert web researcher and information analyst. "
            "Please provide a comprehensive and accurate answer to the following web-related query.",
            "",
            f"Query: {query}",
            "",
            contextLine,
            "",
            "Please provide:",
            "1. A clear, well-structured answer to the query",
            "2. Key points and important details",
            "3. Relevant insights and analysis",
            "4. Any important considerations or caveats",
            "5. Suggestions for further research if applicable",
            "",
            "Format your response in a clear, professional manner that would be helpful "
            "for someone researching this topic.",
        ])

        try:
            startTime = time.time()

            # Use Perplexity for web research with search capabilities
            perplexityConnector = modelRegistry.getConnectorForModel(_PERPLEXITY_WEB_MODEL)
            if not perplexityConnector:
                raise ValueError("Perplexity connector not available")

            response = await perplexityConnector.callAiWithWebSearch(webPrompt)

            processingTime = time.time() - startTime
            outputBytes = len(response.encode("utf-8"))

            # A missing model entry must not turn a successful answer into a
            # failure; the call is then reported without a price.
            perplexityModel = modelRegistry.getModel(_PERPLEXITY_WEB_MODEL)
            if perplexityModel:
                priceUsd = perplexityModel.calculatePriceUsd(inputBytes, outputBytes)
            else:
                logger.warning(f"Model {_PERPLEXITY_WEB_MODEL} not found in registry, price set to 0")
                priceUsd = 0.0

            logger.info("✅ Web query successful with Perplexity")
            return AiCallResponse(
                content=response,
                modelName=_PERPLEXITY_WEB_MODEL,
                priceUsd=priceUsd,
                processingTime=processingTime,
                bytesSent=inputBytes,
                bytesReceived=outputBytes,
                errorCount=0,
            )
        except Exception as e:
            logger.error(f"Perplexity web query failed: {str(e)}")
            return self._errorResponse(f"Web query failed: {str(e)}", inputBytes, _PERPLEXITY_WEB_MODEL)

    # Utility methods

    async def listAvailableModels(self, connectorType: Optional[str] = None) -> List[Dict[str, Any]]:
        """List available models, optionally filtered by connector type."""
        models = modelRegistry.getAvailableModels()
        if connectorType:
            models = [model for model in models if model.connectorType == connectorType]
        return [model.dict() for model in models]

    async def getModelInfo(self, modelName: str) -> Dict[str, Any]:
        """Get information about a specific model.

        Raises:
            ValueError: If the model is not in the registry.
        """
        model = modelRegistry.getModel(modelName)
        if not model:
            raise ValueError(f"Model {modelName} not found")
        return model.dict()

    async def getModelsByCapability(self, capability: str) -> List[str]:
        """Get model names that support a specific capability."""
        return [model.name for model in modelRegistry.getModelsByCapability(capability)]

    async def getModelsByTag(self, tag: str) -> List[str]:
        """Get model names that have a specific tag."""
        return [model.name for model in modelRegistry.getModelsByTag(tag)]

    async def selectRelevantWebsites(self, websites: List[str], userQuestion: str) -> Tuple[List[str], str]:
        """Select most relevant websites using AI analysis. Returns (selected_websites, ai_response).

        Every number in the AI answer that is a valid 1-based index into
        ``websites`` selects that website. If the query fails, no valid number
        is found, or an error occurs, the first website is returned together
        with an explanatory message.
        """
        if len(websites) <= 1:
            return websites, "Only one website available, no selection needed"

        try:
            # Create website summaries for AI analysis
            websiteSummaries = [
                f"{i}. {url} (Domain: {urlparse(url).netloc})"
                for i, url in enumerate(websites, 1)
            ]

            selectionPrompt = "\n".join([
                f'Based on this user request: "{userQuestion}"',
                "",
                f"I have {len(websites)} websites found. "
                "Please select the most relevant website(s) for this request.",
                "",
                "Available websites:",
                *websiteSummaries,
                "",
                "Please respond with the website number(s) (1, 2, 3, etc.) that are most relevant.",
                "Format: 1,3,5 (or just 1 for single selection)",
            ])

            # webQuery returns an AiCallResponse; the answer text is its content
            response = await self.webQuery(selectionPrompt)
            responseText = response.content

            # A failed query carries an error message, not a selection
            if not response.errorCount:
                selectedWebsites: List[str] = []
                for num in re.findall(r'\d+', responseText):
                    index = int(num) - 1
                    if 0 <= index < len(websites) and websites[index] not in selectedWebsites:
                        selectedWebsites.append(websites[index])

                if selectedWebsites:
                    logger.info(f"AI selected {len(selectedWebsites)} websites")
                    return selectedWebsites, responseText

            # Fallback to first website
            logger.warning("AI selection failed, using first website")
            return websites[:1], f"AI selection failed, fallback to first website. AI response: {responseText}"

        except Exception as e:
            logger.error(f"Error in website selection: {str(e)}")
            return websites[:1], f"Error in website selection: {str(e)}"

    async def analyzeContentWithChunking(self, allContent: Dict[str, str], userQuestion: str) -> str:
        """Analyze content using AI with chunking for large content.

        Each page is filtered, split into chunks of at most 50,000 characters
        and analysed separately; the partial analyses are then synthesised
        into one answer.

        Returns:
            The synthesised answer; the joined partial analyses if synthesis
            fails; or "No content could be analyzed" when no chunk could be
            analysed.
        """
        logger.info(f"Analyzing {len(allContent)} websites with AI")

        # Process content in chunks to avoid token limits
        chunkSize = 50000  # 50k chars per chunk
        allChunks: List[Tuple[str, str]] = []

        for url, content in allContent.items():
            filteredContent = self._filterContent(content)
            if len(filteredContent) <= chunkSize:
                allChunks.append((url, filteredContent))
                logger.info(f"Content from {url}: {len(filteredContent)} chars (single chunk)")
                continue

            chunkCount = (len(filteredContent) + chunkSize - 1) // chunkSize
            logger.info(f"Content from {url}: {len(filteredContent)} chars (split into {chunkCount} chunks)")
            for chunkNum, start in enumerate(range(0, len(filteredContent), chunkSize), 1):
                allChunks.append((f"{url} (part {chunkNum})", filteredContent[start:start + chunkSize]))

        logger.info(f"Processing {len(allChunks)} content chunks")

        # Analyze each chunk
        chunkAnalyses: List[str] = []
        for i, (url, chunk) in enumerate(allChunks, 1):
            logger.info(f"Analyzing chunk {i}/{len(allChunks)}: {url}")
            try:
                analysisPrompt = "\n".join([
                    f"Analyze this web content and extract relevant information for: {userQuestion}",
                    "",
                    f"Source: {url}",
                    f"Content: {chunk}",
                    "",
                    "Please extract key information relevant to the query.",
                ])
                response = await self.webQuery(analysisPrompt)
                # webQuery reports failures through errorCount instead of raising
                if response.errorCount:
                    logger.error(f"Chunk {i}/{len(allChunks)} error: {response.content}")
                    continue
                chunkAnalyses.append(response.content)
                logger.info(f"Chunk {i}/{len(allChunks)} analyzed successfully")
            except Exception as e:
                logger.error(f"Chunk {i}/{len(allChunks)} error: {e}")

        if not chunkAnalyses:
            logger.error("No content could be analyzed")
            return "No content could be analyzed"

        # Combine all chunk analyses
        logger.info(f"Combining {len(chunkAnalyses)} chunk analyses")
        combinedAnalysis = "\n\n".join(chunkAnalyses)

        # Final synthesis
        try:
            logger.info("Performing final synthesis of all analyses")
            synthesisPrompt = "\n".join([
                f"Based on these partial analyses, provide a comprehensive answer to: {userQuestion}",
                "",
                "Partial analyses:",
                combinedAnalysis,
                "",
                "Please provide a clear, well-structured answer to the query.",
            ])
            response = await self.webQuery(synthesisPrompt)
            if response.errorCount:
                logger.error(f"Synthesis error: {response.content}")
                return combinedAnalysis
            logger.info("Final synthesis completed successfully")
            return response.content
        except Exception as e:
            logger.error(f"Synthesis error: {e}")
            return combinedAnalysis

    def _filterContent(self, content: str) -> str:
        """Filter out navigation, ads, and other nonsense content.

        Lines are stripped; a line is dropped when it is empty, contains one of
        the noise markers (case-insensitive), starts with ``![Image`` and
        contains ``](``, is a bracketed link line containing ``---``, or is 10
        characters or shorter.
        """
        filteredLines = []
        for rawLine in content.split('\n'):
            line = rawLine.strip()
            if not line:
                continue

            # Skip navigation elements
            lowered = line.lower()
            if any(marker in lowered for marker in _NOISE_MARKERS):
                continue

            # Skip image references without context
            if line.startswith('![Image') and '](' in line:
                continue

            # Skip pure links without context
            if line.startswith('[') and line.endswith(')') and '---' in line:
                continue

            # Keep meaningful content, skip very short lines
            if len(line) > 10:
                filteredLines.append(line)

        return '\n'.join(filteredLines)