import logging
from typing import Dict, Any, List, Union, Tuple, Optional
from dataclasses import dataclass
logger = logging.getLogger(__name__)
from modules.connectors.connectorAiOpenai import AiOpenai
from modules.connectors.connectorAiAnthropic import AiAnthropic
from modules.connectors.connectorAiPerplexity import AiPerplexity
from modules.connectors.connectorAiTavily import ConnectorWeb
from modules.datamodels.datamodelAi import (
AiCallOptions,
AiCallRequest,
AiCallResponse,
OperationType,
ProcessingMode,
Priority,
ModelTags,
OPERATION_TAG_MAPPING,
PROCESSING_MODE_PRIORITY_MAPPING
)
from modules.datamodels.datamodelWeb import (
WebResearchRequest,
WebResearchActionResult,
WebSearchResultItem,
WebCrawlResultItem,
WebSearchRequest,
WebCrawlRequest,
)
from modules.datamodels.datamodelWorkflow import ActionDocument
logger = logging.getLogger(__name__)
# Comprehensive model registry with capability tags and function mapping.
#
# Keys are registry names: the value returned by AiObjects._selectModel and
# reported back in AiCallResponse.modelName. Every entry carries the same fields:
#   connector             - selects the service instance in AiObjects._connectorFor
#   function              - connector method name the AiObjects call methods dispatch on
#   llmName               - presumably the provider-side model id; not read by the code in this file
#   contextLength         - size limit checked in _selectModel; 0 disables that check
#   costPer1kTokens       - input price factor for _estimateCost, also used by the COST/BALANCED ranking
#   costPer1kTokensOutput - output price factor for _estimateCost
#   speedRating           - score compared for Priority.SPEED and the balanced score
#   qualityRating         - score compared for Priority.QUALITY and the balanced score
#   capabilities          - searched by getModelsByCapability
#   tags                  - matched against the required tags in _selectModel and by getModelsByTag
aiModels: Dict[str, Dict[str, Any]] = {
# OpenAI Models
"openai_callAiBasic": {
"connector": "openai",
"function": "callAiBasic",
"llmName": "gpt-4o",
"contextLength": 128000,
"costPer1kTokens": 0.03,
"costPer1kTokensOutput": 0.06,
"speedRating": 8,
"qualityRating": 9,
"capabilities": ["text_generation", "chat", "reasoning"],
"tags": ["text", "chat", "reasoning", "general"]
},
# Only entry tagged "fast"; _selectModel skips such models for ProcessingMode.DETAILED
"openai_callAiBasic_gpt35": {
"connector": "openai",
"function": "callAiBasic",
"llmName": "gpt-3.5-turbo",
"contextLength": 16000,
"costPer1kTokens": 0.0015,
"costPer1kTokensOutput": 0.002,
"speedRating": 9,
"qualityRating": 7,
"capabilities": ["text_generation", "chat", "reasoning"],
"tags": ["text", "chat", "reasoning", "general", "fast"]
},
"openai_callAiImage": {
"connector": "openai",
"function": "callAiImage",
"llmName": "gpt-4o",
"contextLength": 128000,
"costPer1kTokens": 0.03,
"costPer1kTokensOutput": 0.06,
"speedRating": 7,
"qualityRating": 9,
"capabilities": ["image_analysis", "vision", "multimodal"],
"tags": ["image", "vision", "multimodal"]
},
"openai_generateImage": {
"connector": "openai",
"function": "generateImage",
"llmName": "dall-e-3",
"contextLength": 0,
"costPer1kTokens": 0.04,
"costPer1kTokensOutput": 0.0,
"speedRating": 6,
"qualityRating": 9,
"capabilities": ["image_generation", "art", "visual_creation"],
"tags": ["image_generation", "art", "visual"]
},
# Anthropic Models
"anthropic_callAiBasic": {
"connector": "anthropic",
"function": "callAiBasic",
"llmName": "claude-3-5-sonnet-20241022",
"contextLength": 200000,
"costPer1kTokens": 0.015,
"costPer1kTokensOutput": 0.075,
"speedRating": 7,
"qualityRating": 10,
"capabilities": ["text_generation", "chat", "reasoning", "analysis"],
"tags": ["text", "chat", "reasoning", "analysis", "high_quality"]
},
"anthropic_callAiImage": {
"connector": "anthropic",
"function": "callAiImage",
"llmName": "claude-3-5-sonnet-20241022",
"contextLength": 200000,
"costPer1kTokens": 0.015,
"costPer1kTokensOutput": 0.075,
"speedRating": 7,
"qualityRating": 10,
"capabilities": ["image_analysis", "vision", "multimodal"],
"tags": ["image", "vision", "multimodal", "high_quality"]
},
# Perplexity Models
"perplexity_callAiBasic": {
"connector": "perplexity",
"function": "callAiBasic",
"llmName": "llama-3.1-sonar-large-128k-online",
"contextLength": 128000,
"costPer1kTokens": 0.005,
"costPer1kTokensOutput": 0.005,
"speedRating": 8,
"qualityRating": 8,
"capabilities": ["text_generation", "chat", "reasoning", "web_search"],
"tags": ["text", "chat", "reasoning", "web_search", "cost_effective"]
},
# Fallback entry returned by _selectModel for WEB_RESEARCH when no candidate qualifies
"perplexity_callAiWithWebSearch": {
"connector": "perplexity",
"function": "callAiWithWebSearch",
"llmName": "sonar-pro",
"contextLength": 128000,
"costPer1kTokens": 0.01,
"costPer1kTokensOutput": 0.01,
"speedRating": 7,
"qualityRating": 9,
"capabilities": ["text_generation", "web_search", "research"],
"tags": ["text", "web_search", "research", "high_quality"]
},
"perplexity_researchTopic": {
"connector": "perplexity",
"function": "researchTopic",
"llmName": "mistral-7b-instruct",
"contextLength": 32000,
"costPer1kTokens": 0.002,
"costPer1kTokensOutput": 0.002,
"speedRating": 8,
"qualityRating": 8,
"capabilities": ["web_search", "research", "information_gathering"],
"tags": ["web_search", "research", "information", "cost_effective"]
},
"perplexity_answerQuestion": {
"connector": "perplexity",
"function": "answerQuestion",
"llmName": "mistral-7b-instruct",
"contextLength": 32000,
"costPer1kTokens": 0.002,
"costPer1kTokensOutput": 0.002,
"speedRating": 8,
"qualityRating": 8,
"capabilities": ["web_search", "question_answering", "research"],
"tags": ["web_search", "qa", "research", "cost_effective"]
},
"perplexity_getCurrentNews": {
"connector": "perplexity",
"function": "getCurrentNews",
"llmName": "mistral-7b-instruct",
"contextLength": 32000,
"costPer1kTokens": 0.002,
"costPer1kTokensOutput": 0.002,
"speedRating": 8,
"qualityRating": 8,
"capabilities": ["web_search", "news", "current_events"],
"tags": ["web_search", "news", "current_events", "cost_effective"]
},
# Tavily Web Models
# NOTE(review): AiObjects.call() raises ValueError for the "search"/"crawl"/"scrape"
# functions, so these entries only work through the dedicated web methods.
"tavily_search": {
"connector": "tavily",
"function": "search",
"llmName": "tavily-search",
"contextLength": 0,
"costPer1kTokens": 0.0,
"costPer1kTokensOutput": 0.0,
"speedRating": 8,
"qualityRating": 8,
"capabilities": ["web_search", "information_retrieval", "url_discovery"],
"tags": ["web", "search", "urls", "information"]
},
"tavily_crawl": {
"connector": "tavily",
"function": "crawl",
"llmName": "tavily-extract",
"contextLength": 0,
"costPer1kTokens": 0.0,
"costPer1kTokensOutput": 0.0,
"speedRating": 6,
"qualityRating": 8,
"capabilities": ["web_crawling", "content_extraction", "text_extraction"],
"tags": ["web", "crawl", "extract", "content"]
},
"tavily_scrape": {
"connector": "tavily",
"function": "scrape",
"llmName": "tavily-search-extract",
"contextLength": 0,
"costPer1kTokens": 0.0,
"costPer1kTokensOutput": 0.0,
"speedRating": 6,
"qualityRating": 8,
"capabilities": ["web_search", "web_crawling", "content_extraction", "information_retrieval"],
"tags": ["web", "search", "crawl", "extract", "content", "information"]
}
}
@dataclass(slots=True)
class AiObjects:
"""Centralized AI interface: selects model and calls connector. Includes web functionality."""
openaiService: AiOpenai
anthropicService: AiAnthropic
perplexityService: AiPerplexity
tavilyService: ConnectorWeb
def __post_init__(self) -> None:
if self.openaiService is None:
raise TypeError("openaiService must be provided")
if self.anthropicService is None:
raise TypeError("anthropicService must be provided")
if self.perplexityService is None:
raise TypeError("perplexityService must be provided")
if self.tavilyService is None:
raise TypeError("tavilyService must be provided")
@classmethod
async def create(cls) -> "AiObjects":
    """Build an AiObjects instance with a freshly constructed connector of each kind.

    The three AI connectors are instantiated directly; the Tavily web
    connector is obtained through its own awaitable ``create()`` factory.
    """
    # Keyword arguments are evaluated left to right, so the connectors are
    # still created in the order OpenAI, Anthropic, Perplexity, Tavily.
    return cls(
        openaiService=AiOpenai(),
        anthropicService=AiAnthropic(),
        perplexityService=AiPerplexity(),
        tavilyService=await ConnectorWeb.create(),
    )
def _estimateCost(self, modelInfo: Dict[str, Any], contentSize: int) -> float:
estimatedTokens = contentSize / 4
inputCost = (estimatedTokens / 1000) * modelInfo["costPer1kTokens"]
outputCost = (estimatedTokens / 1000) * modelInfo["costPer1kTokensOutput"] * 0.1
return inputCost + outputCost
def _selectModel(self, prompt: str, context: str, options: AiCallOptions) -> str:
    """Return the registry name of the model that best fits the request.

    Models are first filtered by size, cost, tags and processing mode, then
    ranked by the effective priority. When nothing survives the filters, a
    fixed fallback per operation type is returned without further checks.
    """
    payloadBytes = len(prompt.encode("utf-8")) + len(context.encode("utf-8"))

    # Explicit tags win; otherwise they are derived from the operation type.
    wantedTags = options.requiredTags or OPERATION_TAG_MAPPING.get(
        options.operationType, [ModelTags.TEXT, ModelTags.CHAT]
    )

    # BALANCED is treated as "not explicitly set" and may be replaced by the
    # priority mapped to the processing mode.
    if options.priority == Priority.BALANCED:
        chosenPriority = PROCESSING_MODE_PRIORITY_MAPPING.get(options.processingMode, Priority.BALANCED)
    else:
        chosenPriority = options.priority

    logger.info(f"Model selection - Operation: {options.operationType}, Required tags: {wantedTags}, Priority: {chosenPriority}")

    def qualifies(info: Dict[str, Any]) -> bool:
        # Size: the UTF-8 byte count must stay within 80% of contextLength (0 = unlimited).
        limit = info["contextLength"]
        if limit > 0 and payloadBytes > limit * 0.8:
            return False
        # Cost: only enforced when the caller set a ceiling.
        if options.maxCost is not None and self._estimateCost(info, payloadBytes) > options.maxCost:
            return False
        # Tags: a single matching tag is enough.
        tagsOfModel = info.get("tags", [])
        if wantedTags and not any(tag in tagsOfModel for tag in wantedTags):
            return False
        # Detailed processing never uses models tagged as fast.
        return not (options.processingMode == ProcessingMode.DETAILED and ModelTags.FAST in tagsOfModel)

    candidates: Dict[str, Dict[str, Any]] = {
        name: info for name, info in aiModels.items() if qualifies(info)
    }

    if not candidates:
        # Fallback based on operation type
        fallbacks = (
            (OperationType.IMAGE_ANALYSIS, "openai_callAiImage"),
            (OperationType.IMAGE_GENERATION, "openai_generateImage"),
            (OperationType.WEB_RESEARCH, "perplexity_callAiWithWebSearch"),
        )
        for operation, fallbackName in fallbacks:
            if options.operationType == operation:
                return fallbackName
        return "openai_callAiBasic_gpt35"

    # Ties resolve to the earliest registry entry, because max()/min() keep the first hit.
    if chosenPriority == Priority.SPEED:
        return max(candidates, key=lambda modelName: candidates[modelName]["speedRating"])
    if chosenPriority == Priority.QUALITY:
        return max(candidates, key=lambda modelName: candidates[modelName]["qualityRating"])
    if chosenPriority == Priority.COST:
        return min(candidates, key=lambda modelName: candidates[modelName]["costPer1kTokens"])

    # BALANCED (and any unrecognised priority): weighted mix of quality, speed and cost.
    def weightedScore(modelName: str) -> float:
        entry = candidates[modelName]
        return entry["qualityRating"] * 0.4 + entry["speedRating"] * 0.3 + (10 - entry["costPer1kTokens"] * 1000) * 0.3

    return max(candidates, key=weightedScore)
def _connectorFor(self, modelName: str):
    """Return the service instance registered for the given model.

    Raises:
        KeyError: If modelName is not in the registry.
        ValueError: If the registry entry names an unknown connector type.
    """
    connectorKind = aiModels[modelName]["connector"]
    serviceAttribute = {
        "openai": "openaiService",
        "anthropic": "anthropicService",
        "perplexity": "perplexityService",
        "tavily": "tavilyService",
    }.get(connectorKind)
    if serviceAttribute is None:
        raise ValueError(f"Unknown connector type: {connectorKind}")
    return getattr(self, serviceAttribute)
async def call(self, request: AiCallRequest) -> AiCallResponse:
    """Run a text request against an automatically selected model.

    Optionally truncates prompt and context, selects a model, dispatches to
    the connector function registered for that model and returns the content
    together with a size-based token and cost estimate.

    Raises:
        ValueError: If the selected model's function is not a text function.
    """
    prompt = request.prompt
    context = request.context or ""
    options = request.options

    def clipUtf8(text: str, byteLimit: int) -> str:
        # Cut on the UTF-8 byte length; a character split by the cut is dropped.
        raw = text.encode("utf-8")
        if len(raw) > byteLimit:
            return raw[:byteLimit].decode("utf-8", errors="ignore") + "... [truncated]"
        return text

    # "Compression" is plain truncation here; clipUtf8 returns short text unchanged.
    if options.compressPrompt:
        prompt = clipUtf8(prompt, 2000)
    if options.compressContext:
        context = clipUtf8(context, 70000)

    chosenModel = self._selectModel(prompt, context, options)
    modelInfo = aiModels[chosenModel]

    # The context, when present, is sent as a system message ahead of the prompt.
    chat: List[Dict[str, Any]] = [{"role": "user", "content": prompt}]
    if context:
        chat.insert(0, {"role": "system", "content": f"Context from documents:\n{context}"})

    service = self._connectorFor(chosenModel)
    handlerName = modelInfo["function"]

    if handlerName == "callAiBasic":
        reply = await service.callAiBasic(chat)
        if modelInfo["connector"] in ("openai", "perplexity"):
            # These connectors hand back the content directly.
            content = reply
        else:
            # Any other connector's reply is indexed like a chat-completion mapping.
            content = reply["choices"][0]["message"]["content"]
    elif handlerName == "callAiWithWebSearch":
        searchQuery = f"Context: {context}\n\nQuery: {prompt}" if context else prompt
        content = await service.callAiWithWebSearch(searchQuery)
    elif handlerName == "researchTopic":
        content = await service.researchTopic(prompt)
    elif handlerName == "answerQuestion":
        content = await service.answerQuestion(prompt, context)
    elif handlerName == "getCurrentNews":
        content = await service.getCurrentNews(prompt)
    else:
        raise ValueError(f"Function {handlerName} not supported for text generation")

    # Token and cost figures are estimates based on the (possibly truncated) input size only.
    payloadBytes = len((prompt + context).encode("utf-8"))
    return AiCallResponse(
        content=content,
        modelName=chosenModel,
        usedTokens=int(payloadBytes / 4),
        costEstimate=self._estimateCost(modelInfo, payloadBytes),
    )
async def callImage(self, prompt: str, imageData: Union[str, bytes], mimeType: str = None, options: AiCallOptions = None) -> str:
    """Analyze an image with an automatically selected vision model.

    When no options are given, options for the IMAGE_ANALYSIS operation are
    created.

    Raises:
        ValueError: If the selected model is not served by ``callAiImage``.
    """
    if options is None:
        options = AiCallOptions(operationType=OperationType.IMAGE_ANALYSIS)

    chosenModel = self._selectModel(prompt, "", options)
    service = self._connectorFor(chosenModel)
    handlerName = aiModels[chosenModel]["function"]

    if handlerName != "callAiImage":
        raise ValueError(f"Function {handlerName} not supported for image analysis")
    return await service.callAiImage(prompt, imageData, mimeType)
async def generateImage(self, prompt: str, size: str = "1024x1024", quality: str = "standard", style: str = "vivid", options: AiCallOptions = None) -> Dict[str, Any]:
    """Generate an image with an automatically selected model.

    When no options are given, options for the IMAGE_GENERATION operation are
    created.

    Raises:
        ValueError: If the selected model's function cannot generate images.
    """
    if options is None:
        options = AiCallOptions(operationType=OperationType.IMAGE_GENERATION)

    chosenModel = self._selectModel(prompt, "", options)
    service = self._connectorFor(chosenModel)
    handlerName = aiModels[chosenModel]["function"]

    if handlerName == "generateImage":
        return await service.generateImage(prompt, size, quality, style)

    if handlerName == "generateImageWithVariations":
        # A single variation is requested; an empty result becomes an empty dict.
        variations = await service.generateImageWithVariations(prompt, 1, size, quality, style)
        if variations:
            return variations[0]
        return {}

    if handlerName == "generateImageWithChat":
        chatContent = await service.generateImageWithChat(prompt, size, quality, style)
        return {"content": chatContent, "success": True}

    raise ValueError(f"Function {handlerName} not supported for image generation")
# Web functionality methods - Simple interface to Tavily connector
async def search_websites(self, query: str, max_results: int = 5, **kwargs) -> List[WebSearchResultItem]:
    """Search the web through the Tavily connector.

    Extra keyword arguments are passed on to WebSearchRequest. Returns the
    results of the first returned document, or an empty list when the search
    failed or produced no documents.
    """
    searchRequest = WebSearchRequest(query=query, max_results=max_results, **kwargs)
    outcome = await self.tavilyService.search(searchRequest)
    if not (outcome.success and outcome.documents):
        return []
    return outcome.documents[0].documentData.results
async def crawl_websites(self, urls: List[str], extract_depth: str = "advanced", format: str = "markdown") -> List[WebCrawlResultItem]:
    """Crawl the given URLs through the Tavily connector.

    URLs without a scheme get ``https://`` prepended. URLs that cannot be
    turned into an HttpUrl are logged and skipped. Returns the results of the
    first returned document, or an empty list when no URL was usable, the
    crawl failed, or it produced no documents.
    """
    from pydantic import HttpUrl
    from urllib.parse import urlparse

    validatedUrls = []
    for url in urls:
        try:
            # Default to https when the caller passed a bare host or path.
            if not urlparse(url).scheme:
                url = f"https://{url}"
            # NOTE(review): the HttpUrl(url, scheme=...) call form depends on the
            # installed pydantic version - confirm before upgrading pydantic.
            validatedUrls.append(HttpUrl(url, scheme="https"))
        except Exception as e:
            logger.warning(f"Skipping invalid URL {url}: {e}")

    if not validatedUrls:
        return []

    outcome = await self.tavilyService.crawl(
        WebCrawlRequest(urls=validatedUrls, extract_depth=extract_depth, format=format)
    )
    if outcome.success and outcome.documents:
        return outcome.documents[0].documentData.results
    return []
async def extract_content(self, urls: List[str], extract_depth: str = "advanced", format: str = "markdown") -> Dict[str, str]:
    """Crawl the URLs and return a mapping of result URL (as a string) to content.

    If two crawl results report the same URL, the later one wins.
    """
    contentByUrl: Dict[str, str] = {}
    for page in await self.crawl_websites(urls, extract_depth, format):
        contentByUrl[str(page.url)] = page.content
    return contentByUrl
# Core Web Tools - Clean interface for web operations
async def readPage(self, url: str, extract_depth: str = "advanced") -> Optional[str]:
    """Read a single web page and return its content (HTML/Markdown).

    The query string is percent-encoded before the request so that spaces and
    other unsafe characters survive. Existing ``%XX`` escapes and ``+`` are
    left alone, so an already-encoded URL is not encoded a second time.

    Args:
        url: Page address to fetch.
        extract_depth: Extract depth setting forwarded to the crawl.

    Returns:
        The page content, or None when nothing was returned or any error
        occurred (errors are logged, never raised).
    """
    logger.debug(f"Reading page: {url}")
    try:
        from urllib.parse import quote, urlparse, urlunparse

        parsed = urlparse(url)
        if parsed.query:
            # '%' and '+' must stay in the safe set: quoting '%' would turn an
            # existing escape such as %20 into %2520, and quoting '+' would
            # turn a form-encoded space into a literal plus sign.
            encoded_query = quote(parsed.query, safe="=&%+")
            encoded_url = urlunparse(parsed._replace(query=encoded_query))
            logger.debug(f"URL encoded: {url} -> {encoded_url}")
        else:
            encoded_url = urlunparse(parsed)

        content = await self.extract_content([encoded_url], extract_depth, "markdown")
        result = content.get(encoded_url)
        if not result and len(content) == 1:
            # The mapping is keyed by the URL the crawl result reports, which
            # need not be textually identical to the requested one (for example
            # an added trailing slash). Only one page was requested, so a lone
            # entry is that page.
            result = next(iter(content.values()))

        if result:
            logger.debug(f"Successfully read page {encoded_url}: {len(result)} chars")
        else:
            logger.warning(f"No content returned for page {encoded_url}")
        return result
    except Exception as e:
        # Deliberate best-effort: callers treat None as "page not readable".
        logger.warning(f"Failed to read page {url}: {e}")
        return None
async def getUrlsFromPage(self, url: str, extract_depth: str = "advanced") -> List[str]:
    """Fetch a page and return the distinct links found in it, in first-seen order.

    Returns an empty list when the page has no content or any error occurs
    (errors are logged, never raised).
    """
    try:
        pageText = await self.readPage(url, extract_depth)
        if not pageText:
            return []

        # dict.fromkeys drops repeats while keeping the first occurrence of each link.
        distinctLinks = list(dict.fromkeys(self._extractLinksFromContent(pageText, url)))
        logger.debug(f"Extracted {len(distinctLinks)} unique URLs from {url}")
        return distinctLinks
    except Exception as e:
        logger.warning(f"Failed to get URLs from page {url}: {e}")
        return []
def filterUrlsOnlyPages(self, urls: List[str], max_per_domain: int = 10) -> List[str]:
    """Filter URLs down to links worth following as pages (no images, media, archives, assets).

    Links are grouped by domain (first-seen order), de-duplicated within each
    domain, and capped at max_per_domain per domain.

    Args:
        urls: Candidate URLs.
        max_per_domain: Maximum number of links kept per domain; 0 or less keeps none.

    Returns:
        The kept URLs, all links of one domain together, domains in first-seen order.
    """
    from urllib.parse import urlparse

    blocked = ('.jpg', '.jpeg', '.png', '.gif', '.svg', '.webp', '.ico', '.bmp',
               '.mp4', '.mp3', '.avi', '.mov', '.mkv',
               '.pdf', '.zip', '.rar', '.7z', '.tar', '.gz',
               '.css', '.js', '.woff', '.woff2', '.ttf', '.eot')

    def _isHtmlCandidate(url: str) -> bool:
        # Test the path only: a query string or fragment after the file name
        # ("a.png?size=2", "doc.pdf#p3") must not hide the extension, and an
        # extension inside the query ("view?file=a.jpg") must not block a page.
        return not urlparse(url).path.lower().endswith(blocked)

    # Group by domain
    domain_links = {}
    for link in urls:
        domain_links.setdefault(urlparse(link).netloc, []).append(link)

    # Filter and cap per domain
    filtered_links = []
    for domain, domain_link_list in domain_links.items():
        domain_filtered = []
        for link in dict.fromkeys(domain_link_list):  # distinct links, order kept
            # Checked before appending so that a cap of 0 really keeps nothing.
            if len(domain_filtered) >= max_per_domain:
                break
            if _isHtmlCandidate(link):
                domain_filtered.append(link)
        filtered_links.extend(domain_filtered)
        logger.debug(f"Domain {domain}: {len(domain_link_list)} -> {len(domain_filtered)} links")
    return filtered_links
def _extractLinksFromContent(self, content: str, base_url: str) -> List[str]:
"""Extract links from HTML/Markdown content."""
try:
import re
from urllib.parse import urljoin, urlparse
links = []
# Extract HTML links: format
html_link_pattern = r']+href=["\']([^"\']+)["\'][^>]*>'
html_links = re.findall(html_link_pattern, content, re.IGNORECASE)
for url in html_links:
if url and (url.startswith('http://') or url.startswith('https://')):
links.append(url)
elif url and not url.startswith('#') and not url.startswith('javascript:'):
# Convert relative URLs to absolute URLs
absolute_url = urljoin(base_url, url)
links.append(absolute_url)
# Extract markdown links: [text](url) format
markdown_link_pattern = r'\[([^\]]+)\]\(([^)]+)\)'
markdown_links = re.findall(markdown_link_pattern, content)
for text, url in markdown_links:
if url and (url.startswith('http://') or url.startswith('https://')):
absolute_url = urljoin(base_url, url)
# Only keep URLs from the same domain
if urlparse(absolute_url).netloc == urlparse(base_url).netloc:
links.append(absolute_url)
elif url and not url.startswith('#'):
absolute_url = urljoin(base_url, url)
if urlparse(absolute_url).netloc == urlparse(base_url).netloc:
links.append(absolute_url)
# Extract plain URLs in the text
url_pattern = r'https?://[^\s\)]+'
plain_urls = re.findall(url_pattern, content)
for url in plain_urls:
clean_url = url.rstrip('.,;!?')
absolute_url = urljoin(base_url, clean_url)
if urlparse(absolute_url).netloc == urlparse(base_url).netloc:
if absolute_url not in links: # Avoid duplicates
links.append(absolute_url)
return links
except Exception as e:
logger.warning(f"Failed to extract links from content: {e}")
return []
async def crawlRecursively(self, urls: List[str], max_depth: int, extract_depth: str = "advanced", max_per_domain: int = 10) -> Dict[str, str]:
    """
    Crawl URLs level by level up to the specified depth.

    Each page is fetched exactly once: the links for the next level are
    extracted from the content that was just read instead of requesting the
    page a second time.

    Args:
        urls: List of starting URLs to crawl
        max_depth: Maximum depth to crawl (1=main pages only, 2=main+sub-pages, etc.)
        extract_depth: Tavily extract depth setting
        max_per_domain: Maximum URLs per domain kept from each crawled page

    Returns:
        Dictionary mapping URL -> content for all crawled pages
    """
    logger.info(f"Starting recursive crawl: {len(urls)} starting URLs, max_depth={max_depth}")

    # Every URL that was attempted, whether it succeeded, came back empty, or failed
    processed_urls = set()
    all_content = {}
    current_level_urls = list(urls)

    for depth in range(1, max_depth + 1):
        logger.info(f"=== DEPTH LEVEL {depth}/{max_depth} ===")
        logger.info(f"Processing {len(current_level_urls)} URLs at depth {depth}")

        # URLs found at this level (for next iteration)
        next_level_urls = []

        for url in current_level_urls:
            if url in processed_urls:
                logger.debug(f"URL {url} already processed, skipping")
                continue

            # Marked up front: no outcome below leads to a retry.
            processed_urls.add(url)
            try:
                logger.info(f"Processing URL at depth {depth}: {url}")
                content = await self.readPage(url, extract_depth)
                if not content:
                    logger.warning(f"✗ No content extracted from {url}")
                    continue

                all_content[url] = content
                logger.info(f"✓ Successfully processed {url}: {len(content)} chars")

                # Reuse the content already in hand; fetching the page again
                # just to list its links would double the number of requests.
                page_urls = list(dict.fromkeys(self._extractLinksFromContent(content, url)))
                logger.info(f"Found {len(page_urls)} URLs on {url}")

                filtered_urls = self.filterUrlsOnlyPages(page_urls, max_per_domain)
                logger.info(f"Filtered to {len(filtered_urls)} valid URLs")

                # Add new URLs to next level (avoiding already processed ones)
                new_urls = [new_url for new_url in filtered_urls if new_url not in processed_urls]
                next_level_urls.extend(new_urls)
                logger.info(f"Added {len(new_urls)} new URLs to next level from {url}")
            except Exception as e:
                logger.warning(f"✗ Failed to process URL {url} at depth {depth}: {e}")

        # Prepare for next iteration
        current_level_urls = next_level_urls
        logger.info(f"Depth {depth} completed. Found {len(next_level_urls)} URLs for next level")

        # Stop if no more URLs to process
        if not current_level_urls:
            logger.info(f"No more URLs found at depth {depth}, stopping recursion")
            break

    logger.info(f"Recursive crawl completed: {len(all_content)} total pages crawled")
    return all_content
async def webQuery(self, query: str, context: str = "", options: AiCallOptions = None) -> str:
    """Answer a web-related query through the Perplexity connector's web-search call.

    The query (and the optional context) is wrapped in a fixed research
    prompt before it is sent.

    Raises:
        Exception: If the Perplexity call fails; the message carries the
            original error text.
    """
    # The options object is created for interface parity; nothing below reads it.
    if options is None:
        options = AiCallOptions(operationType=OperationType.WEB_RESEARCH)

    # Create a comprehensive prompt for web queries
    webPrompt = f"""You are an expert web researcher and information analyst. Please provide a comprehensive and accurate answer to the following web-related query.
Query: {query}
{f"Additional Context: {context}" if context else ""}
Please provide:
1. A clear, well-structured answer to the query
2. Key points and important details
3. Relevant insights and analysis
4. Any important considerations or caveats
5. Suggestions for further research if applicable
Format your response in a clear, professional manner that would be helpful for someone researching this topic."""

    try:
        return await self.perplexityService.callAiWithWebSearch(webPrompt)
    except Exception as e:
        failure = str(e)
        logger.error(f"Perplexity web query failed: {failure}")
        raise Exception(f"Failed to process web query: {failure}")
# Utility methods
async def listAvailableModels(self, connectorType: str = None) -> List[Dict[str, Any]]:
    """Return the registry entries, restricted to one connector type when one is given."""
    everyModel = list(aiModels.values())
    if not connectorType:
        return everyModel
    return [entry for entry in everyModel if entry["connector"] == connectorType]
async def getModelInfo(self, modelName: str) -> Dict[str, Any]:
    """Return the registry entry for modelName.

    Raises:
        ValueError: If the name is not in the registry.
    """
    if modelName in aiModels:
        return aiModels[modelName]
    raise ValueError(f"Model {modelName} not found")
async def getModelsByCapability(self, capability: str) -> List[str]:
    """Return the registry names of all models that list the given capability."""
    matchingNames: List[str] = []
    for registryName, entry in aiModels.items():
        if capability in entry.get("capabilities", []):
            matchingNames.append(registryName)
    return matchingNames
async def getModelsByTag(self, tag: str) -> List[str]:
    """Return the registry names of all models that carry the given tag."""
    matchingNames: List[str] = []
    for registryName, entry in aiModels.items():
        if tag in entry.get("tags", []):
            matchingNames.append(registryName)
    return matchingNames
async def selectRelevantWebsites(self, websites: List[str], userQuestion: str) -> Tuple[List[str], str]:
    """Select most relevant websites using AI analysis. Returns (selected_websites, ai_response).

    The AI is asked for website numbers, and every in-range number found in
    its reply is mapped back to a website. Each website is returned at most
    once, in the order it is first mentioned, even if the reply repeats a
    number. On any failure, or when no usable number is found, the first
    website is returned.
    """
    if len(websites) <= 1:
        return websites, "Only one website available, no selection needed"

    try:
        import re
        from urllib.parse import urlparse

        # Create website summaries for AI analysis
        websiteSummaries = [
            f"{i}. {url} (Domain: {urlparse(url).netloc})"
            for i, url in enumerate(websites, 1)
        ]

        selectionPrompt = f"""
Based on this user request: "{userQuestion}"
I have {len(websites)} websites found. Please select the most relevant website(s) for this request.
Available websites:
{chr(10).join(websiteSummaries)}
Please respond with the website number(s) (1, 2, 3, etc.) that are most relevant.
Format: 1,3,5 (or just 1 for single selection)
"""
        # Use Perplexity to select the best websites
        response = await self.webQuery(selectionPrompt)

        # Parse the selection. The reply is free text and may mention the same
        # number more than once, so the positions already taken are tracked.
        selectedWebsites = []
        chosenIndexes = set()
        for num in re.findall(r'\d+', response):
            index = int(num) - 1
            if 0 <= index < len(websites) and index not in chosenIndexes:
                chosenIndexes.add(index)
                selectedWebsites.append(websites[index])

        if selectedWebsites:
            logger.info(f"AI selected {len(selectedWebsites)} websites")
            return selectedWebsites, response

        # Fallback to first website
        logger.warning("AI selection failed, using first website")
        return websites[:1], f"AI selection failed, fallback to first website. AI response: {response}"
    except Exception as e:
        logger.error(f"Error in website selection: {str(e)}")
        return websites[:1], f"Error in website selection: {str(e)}"
async def analyzeContentWithChunking(self, allContent: Dict[str, str], userQuestion: str) -> str:
    """Analyze page contents with AI, splitting large pages into chunks.

    Each page is filtered, cut into pieces of at most 50,000 characters and
    analyzed piece by piece. The partial analyses are then merged in one
    final synthesis call. If that call fails, the joined partial analyses are
    returned; if no piece could be analyzed, a fixed message is returned.
    """
    logger.info(f"Analyzing {len(allContent)} websites with AI")

    chunkSize = 50000  # 50k chars per chunk
    allChunks = []
    for url, content in allContent.items():
        cleaned = self._filterContent(content)
        totalChars = len(cleaned)
        if totalChars > chunkSize:
            chunkCount = -(-totalChars // chunkSize)  # ceiling division
            logger.info(f"Content from {url}: {totalChars} chars (split into {chunkCount} chunks)")
            for partNumber, start in enumerate(range(0, totalChars, chunkSize), 1):
                allChunks.append((f"{url} (part {partNumber})", cleaned[start:start + chunkSize]))
        else:
            allChunks.append((url, cleaned))
            logger.info(f"Content from {url}: {totalChars} chars (single chunk)")

    chunkTotal = len(allChunks)
    logger.info(f"Processing {chunkTotal} content chunks")

    # Analyze each chunk; a failing chunk is logged and left out of the synthesis
    chunkAnalyses = []
    for position, (url, chunk) in enumerate(allChunks, 1):
        logger.info(f"Analyzing chunk {position}/{chunkTotal}: {url}")
        try:
            analysisPrompt = f"""
Analyze this web content and extract relevant information for: {userQuestion}
Source: {url}
Content: {chunk}
Please extract key information relevant to the query.
"""
            chunkAnalyses.append(await self.webQuery(analysisPrompt))
            logger.info(f"Chunk {position}/{chunkTotal} analyzed successfully")
        except Exception as e:
            logger.error(f"Chunk {position}/{chunkTotal} error: {e}")

    if not chunkAnalyses:
        logger.error("No content could be analyzed")
        return "No content could be analyzed"

    logger.info(f"Combining {len(chunkAnalyses)} chunk analyses")
    combinedAnalysis = "\n\n".join(chunkAnalyses)

    # Final synthesis; the raw combination is the fallback when it fails
    try:
        logger.info("Performing final synthesis of all analyses")
        synthesisPrompt = f"""
Based on these partial analyses, provide a comprehensive answer to: {userQuestion}
Partial analyses:
{combinedAnalysis}
Please provide a clear, well-structured answer to the query.
"""
        finalAnalysis = await self.webQuery(synthesisPrompt)
        logger.info("Final synthesis completed successfully")
        return finalAnalysis
    except Exception as e:
        logger.error(f"Synthesis error: {e}")
        return combinedAnalysis
def _filterContent(self, content: str) -> str:
"""Filter out navigation, ads, and other nonsense content."""
lines = content.split('\n')
filteredLines = []
for line in lines:
line = line.strip()
# Skip empty lines
if not line:
continue
# Skip navigation elements
if any(skip in line.lower() for skip in [
'toggle navigation', 'log in', 'sign up', 'cookies', 'privacy policy',
'terms of service', 'subscribe', 'newsletter', 'follow us', 'share this',
'advertisement', 'sponsored', 'banner', 'popup', 'modal'
]):
continue
# Skip image references without context
if line.startswith(' and line.endswith(')') and '---' in line:
continue
# Keep meaningful content
if len(line) > 10: # Skip very short lines
filteredLines.append(line)
return '\n'.join(filteredLines)