import logging
from dataclasses import dataclass
from typing import Any, Dict, List, Optional, Tuple, Union

from modules.connectors.connectorAiOpenai import AiOpenai
from modules.connectors.connectorAiAnthropic import AiAnthropic
from modules.connectors.connectorAiPerplexity import AiPerplexity
from modules.connectors.connectorAiTavily import ConnectorWeb
from modules.datamodels.datamodelAi import (
    AiCallOptions,
    AiCallRequest,
    AiCallResponse,
    OperationType,
    ProcessingMode,
    Priority,
    ModelTags,
    OPERATION_TAG_MAPPING,
    PROCESSING_MODE_PRIORITY_MAPPING,
)
from modules.datamodels.datamodelWeb import (
    WebResearchRequest,
    WebResearchActionResult,
    WebSearchResultItem,
    WebCrawlResultItem,
    WebSearchRequest,
    WebCrawlRequest,
)
from modules.datamodels.datamodelWorkflow import ActionDocument

logger = logging.getLogger(__name__)


# Comprehensive model registry with capability tags and function mapping
aiModels: Dict[str, Dict[str, Any]] = {
    # OpenAI Models
    "openai_callAiBasic": {
        "connector": "openai",
        "function": "callAiBasic",
        "llmName": "gpt-4o",
        "contextLength": 128000,
        "costPer1kTokens": 0.03,
        "costPer1kTokensOutput": 0.06,
        "speedRating": 8,
        "qualityRating": 9,
        "capabilities": ["text_generation", "chat", "reasoning"],
        "tags": ["text", "chat", "reasoning", "general"]
    },
    "openai_callAiBasic_gpt35": {
        "connector": "openai",
        "function": "callAiBasic",
        "llmName": "gpt-3.5-turbo",
        "contextLength": 16000,
        "costPer1kTokens": 0.0015,
        "costPer1kTokensOutput": 0.002,
        "speedRating": 9,
        "qualityRating": 7,
        "capabilities": ["text_generation", "chat", "reasoning"],
        "tags": ["text", "chat", "reasoning", "general", "fast"]
    },
    "openai_callAiImage": {
        "connector": "openai",
        "function": "callAiImage",
        "llmName": "gpt-4o",
        "contextLength": 128000,
        "costPer1kTokens": 0.03,
        "costPer1kTokensOutput": 0.06,
        "speedRating": 7,
        "qualityRating": 9,
        "capabilities": ["image_analysis", "vision", "multimodal"],
        "tags": ["image", "vision", "multimodal"]
    },
    "openai_generateImage": {
        "connector": "openai",
        "function": "generateImage",
        "llmName": "dall-e-3",
        "contextLength": 0,
        "costPer1kTokens": 0.04,
        "costPer1kTokensOutput": 0.0,
        "speedRating": 6,
        "qualityRating": 9,
        "capabilities": ["image_generation", "art", "visual_creation"],
        "tags": ["image_generation", "art", "visual"]
    },

    # Anthropic Models
    "anthropic_callAiBasic": {
        "connector": "anthropic",
        "function": "callAiBasic",
        "llmName": "claude-3-5-sonnet-20241022",
        "contextLength": 200000,
        "costPer1kTokens": 0.015,
        "costPer1kTokensOutput": 0.075,
        "speedRating": 7,
        "qualityRating": 10,
        "capabilities": ["text_generation", "chat", "reasoning", "analysis"],
        "tags": ["text", "chat", "reasoning", "analysis", "high_quality"]
    },
    "anthropic_callAiImage": {
        "connector": "anthropic",
        "function": "callAiImage",
        "llmName": "claude-3-5-sonnet-20241022",
        "contextLength": 200000,
        "costPer1kTokens": 0.015,
        "costPer1kTokensOutput": 0.075,
        "speedRating": 7,
        "qualityRating": 10,
        "capabilities": ["image_analysis", "vision", "multimodal"],
        "tags": ["image", "vision", "multimodal", "high_quality"]
    },

    # Perplexity Models
    "perplexity_callAiBasic": {
        "connector": "perplexity",
        "function": "callAiBasic",
        "llmName": "llama-3.1-sonar-large-128k-online",
        "contextLength": 128000,
        "costPer1kTokens": 0.005,
        "costPer1kTokensOutput": 0.005,
        "speedRating": 8,
        "qualityRating": 8,
        "capabilities": ["text_generation", "chat", "reasoning", "web_search"],
        "tags": ["text", "chat", "reasoning", "web_search", "cost_effective"]
    },
    "perplexity_callAiWithWebSearch": {
        "connector": "perplexity",
        "function": "callAiWithWebSearch",
        "llmName": "sonar-pro",
        "contextLength": 128000,
        "costPer1kTokens": 0.01,
        "costPer1kTokensOutput": 0.01,
        "speedRating": 7,
        "qualityRating": 9,
        "capabilities": ["text_generation", "web_search", "research"],
        "tags": ["text", "web_search", "research", "high_quality"]
    },
    "perplexity_researchTopic": {
        "connector": "perplexity",
        "function": "researchTopic",
        "llmName": "mistral-7b-instruct",
        "contextLength": 32000,
        "costPer1kTokens": 0.002,
        "costPer1kTokensOutput": 0.002,
        "speedRating": 8,
        "qualityRating": 8,
        "capabilities": ["web_search", "research", "information_gathering"],
        "tags": ["web_search", "research", "information", "cost_effective"]
    },
    "perplexity_answerQuestion": {
        "connector": "perplexity",
        "function": "answerQuestion",
        "llmName": "mistral-7b-instruct",
        "contextLength": 32000,
        "costPer1kTokens": 0.002,
        "costPer1kTokensOutput": 0.002,
        "speedRating": 8,
        "qualityRating": 8,
        "capabilities": ["web_search", "question_answering", "research"],
        "tags": ["web_search", "qa", "research", "cost_effective"]
    },
    "perplexity_getCurrentNews": {
        "connector": "perplexity",
        "function": "getCurrentNews",
        "llmName": "mistral-7b-instruct",
        "contextLength": 32000,
        "costPer1kTokens": 0.002,
        "costPer1kTokensOutput": 0.002,
        "speedRating": 8,
        "qualityRating": 8,
        "capabilities": ["web_search", "news", "current_events"],
        "tags": ["web_search", "news", "current_events", "cost_effective"]
    },

    # Tavily Web Models
    "tavily_search": {
        "connector": "tavily",
        "function": "search",
        "llmName": "tavily-search",
        "contextLength": 0,
        "costPer1kTokens": 0.0,
        "costPer1kTokensOutput": 0.0,
        "speedRating": 8,
        "qualityRating": 8,
        "capabilities": ["web_search", "information_retrieval", "url_discovery"],
        "tags": ["web", "search", "urls", "information"]
    },
    "tavily_crawl": {
        "connector": "tavily",
        "function": "crawl",
        "llmName": "tavily-extract",
        "contextLength": 0,
        "costPer1kTokens": 0.0,
        "costPer1kTokensOutput": 0.0,
        "speedRating": 6,
        "qualityRating": 8,
        "capabilities": ["web_crawling", "content_extraction", "text_extraction"],
        "tags": ["web", "crawl", "extract", "content"]
    },
    "tavily_scrape": {
        "connector": "tavily",
        "function": "scrape",
        "llmName": "tavily-search-extract",
        "contextLength": 0,
        "costPer1kTokens": 0.0,
        "costPer1kTokensOutput": 0.0,
        "speedRating": 6,
        "qualityRating": 8,
        "capabilities": ["web_search", "web_crawling", "content_extraction", "information_retrieval"],
        "tags": ["web", "search", "crawl", "extract", "content", "information"]
    }
}


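# Illustrative sketch of how a registry entry is consumed. AiObjects (defined below)
# dispatches on the "function" field with explicit if/elif branches; conceptually the
# lookup is equivalent to the following (`ai` is an assumed, already-created instance):
#
#     info = aiModels["perplexity_callAiWithWebSearch"]
#     connector = ai._connectorFor("perplexity_callAiWithWebSearch")  # -> ai.perplexityService
#     handler = getattr(connector, info["function"])                  # -> callAiWithWebSearch
#     answer = await handler("What changed in the EU AI Act this month?")

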
@dataclass(slots=True)
class AiObjects:
    """Centralized AI interface: selects model and calls connector. Includes web functionality."""

    openaiService: AiOpenai
    anthropicService: AiAnthropic
    perplexityService: AiPerplexity
    tavilyService: ConnectorWeb

    def __post_init__(self) -> None:
        if self.openaiService is None:
            raise TypeError("openaiService must be provided")
        if self.anthropicService is None:
            raise TypeError("anthropicService must be provided")
        if self.perplexityService is None:
            raise TypeError("perplexityService must be provided")
        if self.tavilyService is None:
            raise TypeError("tavilyService must be provided")

    @classmethod
    async def create(cls) -> "AiObjects":
        """Create an AiObjects instance with all connectors initialized."""
        openaiService = AiOpenai()
        anthropicService = AiAnthropic()
        perplexityService = AiPerplexity()
        tavilyService = await ConnectorWeb.create()

        return cls(
            openaiService=openaiService,
            anthropicService=anthropicService,
            perplexityService=perplexityService,
            tavilyService=tavilyService
        )

    def _estimateCost(self, modelInfo: Dict[str, Any], contentSize: int) -> float:
        """Rough cost estimate: ~4 bytes per token, output assumed to be ~10% of the input volume."""
        estimatedTokens = contentSize / 4
        inputCost = (estimatedTokens / 1000) * modelInfo["costPer1kTokens"]
        outputCost = (estimatedTokens / 1000) * modelInfo["costPer1kTokensOutput"] * 0.1
        return inputCost + outputCost

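    # Worked example of the estimate above (pure arithmetic, no API call):
    # an 8,000-byte prompt+context counts as ~2,000 tokens, so for
    # "openai_callAiBasic" (0.03 input / 0.06 output per 1k tokens):
    #   input:  (2000 / 1000) * 0.03       = 0.060
    #   output: (2000 / 1000) * 0.06 * 0.1 = 0.012
    #   total ≈ 0.072 — the figure _selectModel compares against options.maxCost.
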
    def _selectModel(self, prompt: str, context: str, options: AiCallOptions) -> str:
        """Select the best model based on operation type, tags, and requirements."""
        totalSize = len(prompt.encode("utf-8")) + len(context.encode("utf-8"))
        candidates: Dict[str, Dict[str, Any]] = {}

        # Determine required tags from operation type
        requiredTags = options.requiredTags
        if not requiredTags:
            requiredTags = OPERATION_TAG_MAPPING.get(options.operationType, [ModelTags.TEXT, ModelTags.CHAT])

        # Override priority based on processing mode if not explicitly set
        effectivePriority = options.priority
        if options.priority == Priority.BALANCED:
            effectivePriority = PROCESSING_MODE_PRIORITY_MAPPING.get(options.processingMode, Priority.BALANCED)

        logger.info(f"Model selection - Operation: {options.operationType}, Required tags: {requiredTags}, Priority: {effectivePriority}")

        for name, info in aiModels.items():
            # Check context length (byte count used as a rough token proxy, with a 20% safety margin)
            if info["contextLength"] > 0 and totalSize > info["contextLength"] * 0.8:
                continue

            # Check cost constraints
            if options.maxCost is not None:
                if self._estimateCost(info, totalSize) > options.maxCost:
                    continue

            # Check required tags/capabilities
            modelTags = info.get("tags", [])
            if requiredTags and not any(tag in modelTags for tag in requiredTags):
                continue

            # Check processing mode requirements
            if options.processingMode == ProcessingMode.DETAILED and ModelTags.FAST in modelTags:
                # Skip fast models for detailed processing
                continue

            candidates[name] = info

        if not candidates:
            # Fallback based on operation type
            if options.operationType == OperationType.IMAGE_ANALYSIS:
                return "openai_callAiImage"
            elif options.operationType == OperationType.IMAGE_GENERATION:
                return "openai_generateImage"
            elif options.operationType == OperationType.WEB_RESEARCH:
                return "perplexity_callAiWithWebSearch"
            else:
                return "openai_callAiBasic_gpt35"

        # Select based on priority
        if effectivePriority == Priority.SPEED:
            return max(candidates, key=lambda k: candidates[k]["speedRating"])
        elif effectivePriority == Priority.QUALITY:
            return max(candidates, key=lambda k: candidates[k]["qualityRating"])
        elif effectivePriority == Priority.COST:
            return min(candidates, key=lambda k: candidates[k]["costPer1kTokens"])
        else:  # BALANCED
            def balancedScore(name: str) -> float:
                info = candidates[name]
                return info["qualityRating"] * 0.4 + info["speedRating"] * 0.3 + (10 - info["costPer1kTokens"] * 1000) * 0.3

            return max(candidates, key=balancedScore)

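    # Selection sketch (illustrative; it assumes AiCallOptions accepts these fields as
    # constructor arguments, which is how the rest of this module reads them):
    #
    #     options = AiCallOptions(
    #         operationType=OperationType.WEB_RESEARCH,
    #         priority=Priority.SPEED,
    #         maxCost=0.01,
    #     )
    #     modelName = ai._selectModel("Summarise today's AI policy news", "", options)
    #
    # The candidate set is filtered by OPERATION_TAG_MAPPING[WEB_RESEARCH] and the
    # fastest surviving entry wins because priority is SPEED.
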
    def _connectorFor(self, modelName: str):
        """Get the appropriate connector for the model."""
        connectorType = aiModels[modelName]["connector"]
        if connectorType == "openai":
            return self.openaiService
        elif connectorType == "anthropic":
            return self.anthropicService
        elif connectorType == "perplexity":
            return self.perplexityService
        elif connectorType == "tavily":
            return self.tavilyService
        else:
            raise ValueError(f"Unknown connector type: {connectorType}")

    async def call(self, request: AiCallRequest) -> AiCallResponse:
        """Call AI model for text generation."""
        prompt = request.prompt
        context = request.context or ""
        options = request.options

        # Compress optionally (prompt/context) - simple truncation fallback kept here
        def maybeTruncate(text: str, limit: int) -> str:
            data = text.encode("utf-8")
            if len(data) <= limit:
                return text
            return data[:limit].decode("utf-8", errors="ignore") + "... [truncated]"

        if options.compressPrompt and len(prompt.encode("utf-8")) > 2000:
            prompt = maybeTruncate(prompt, 2000)
        if options.compressContext and len(context.encode("utf-8")) > 70000:
            context = maybeTruncate(context, 70000)

        # Select model for text generation
        modelName = self._selectModel(prompt, context, options)

        messages: List[Dict[str, Any]] = []
        if context:
            messages.append({"role": "system", "content": f"Context from documents:\n{context}"})
        messages.append({"role": "user", "content": prompt})

        connector = self._connectorFor(modelName)
        functionName = aiModels[modelName]["function"]

        # Call the appropriate function
        if functionName == "callAiBasic":
            if aiModels[modelName]["connector"] in ("openai", "perplexity"):
                # These connectors return the message content directly
                content = await connector.callAiBasic(messages)
            else:
                # The remaining callAiBasic connector (Anthropic in this registry)
                # returns an OpenAI-style response dict
                response = await connector.callAiBasic(messages)
                content = response["choices"][0]["message"]["content"]
        elif functionName == "callAiWithWebSearch":
            # Perplexity web search function
            query = prompt
            if context:
                query = f"Context: {context}\n\nQuery: {prompt}"
            content = await connector.callAiWithWebSearch(query)
        elif functionName == "researchTopic":
            # Perplexity research function
            content = await connector.researchTopic(prompt)
        elif functionName == "answerQuestion":
            # Perplexity question answering function
            content = await connector.answerQuestion(prompt, context)
        elif functionName == "getCurrentNews":
            # Perplexity news function
            content = await connector.getCurrentNews(prompt)
        else:
            raise ValueError(f"Function {functionName} not supported for text generation")

        # Estimate cost/tokens
        totalSize = len((prompt + context).encode("utf-8"))
        cost = self._estimateCost(aiModels[modelName], totalSize)
        usedTokens = int(totalSize / 4)

        return AiCallResponse(content=content, modelName=modelName, usedTokens=usedTokens, costEstimate=cost)

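    # End-to-end sketch (illustrative; the keyword arguments mirror how this module
    # reads AiCallRequest/AiCallOptions, and the OperationType member shown is an
    # assumption rather than a confirmed enum value):
    #
    #     ai = await AiObjects.create()
    #     response = await ai.call(AiCallRequest(
    #         prompt="Summarise the quarterly report in five bullet points.",
    #         context=reportText,
    #         options=AiCallOptions(operationType=OperationType.TEXT_GENERATION),
    #     ))
    #     print(response.modelName, response.usedTokens, response.costEstimate)
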
    async def callImage(self, prompt: str, imageData: Union[str, bytes], mimeType: Optional[str] = None, options: Optional[AiCallOptions] = None) -> str:
        """Call AI model for image analysis."""
        if options is None:
            options = AiCallOptions(operationType=OperationType.IMAGE_ANALYSIS)

        # Select model for image analysis
        modelName = self._selectModel(prompt, "", options)

        connector = self._connectorFor(modelName)
        functionName = aiModels[modelName]["function"]

        if functionName == "callAiImage":
            return await connector.callAiImage(prompt, imageData, mimeType)
        else:
            raise ValueError(f"Function {functionName} not supported for image analysis")

    async def generateImage(self, prompt: str, size: str = "1024x1024", quality: str = "standard", style: str = "vivid", options: Optional[AiCallOptions] = None) -> Dict[str, Any]:
        """Generate an image using AI."""
        if options is None:
            options = AiCallOptions(operationType=OperationType.IMAGE_GENERATION)

        # Select model for image generation
        modelName = self._selectModel(prompt, "", options)

        connector = self._connectorFor(modelName)
        functionName = aiModels[modelName]["function"]

        if functionName == "generateImage":
            return await connector.generateImage(prompt, size, quality, style)
        elif functionName == "generateImageWithVariations":
            results = await connector.generateImageWithVariations(prompt, 1, size, quality, style)
            return results[0] if results else {}
        elif functionName == "generateImageWithChat":
            content = await connector.generateImageWithChat(prompt, size, quality, style)
            return {"content": content, "success": True}
        else:
            raise ValueError(f"Function {functionName} not supported for image generation")

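    # Sketch (illustrative only): analyse an image, then generate one with the defaults above.
    # `ai` and `chartBytes` are assumed to exist in the calling code.
    #
    #     description = await ai.callImage("Describe this chart", chartBytes, "image/png")
    #     image = await ai.generateImage("A minimalist redesign of the described chart")
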
    # Web functionality methods - simple interface to the Tavily connector
    async def search_websites(self, query: str, max_results: int = 5, **kwargs) -> List[WebSearchResultItem]:
        """Search for websites using Tavily."""
        request = WebSearchRequest(
            query=query,
            max_results=max_results,
            **kwargs
        )
        result = await self.tavilyService.search(request)

        if result.success and result.documents:
            return result.documents[0].documentData.results
        return []

    async def crawl_websites(self, urls: List[str], extract_depth: str = "advanced", format: str = "markdown") -> List[WebCrawlResultItem]:
        """Crawl websites using Tavily."""
        from pydantic import HttpUrl
        from urllib.parse import urlparse

        # Safely create HttpUrl objects with proper scheme handling
        http_urls = []
        for url in urls:
            try:
                # Ensure URL has a scheme
                parsed = urlparse(url)
                if not parsed.scheme:
                    url = f"https://{url}"

                # Wrap in HttpUrl so the request model receives validated URLs
                http_urls.append(HttpUrl(url, scheme="https"))

            except Exception as e:
                logger.warning(f"Skipping invalid URL {url}: {e}")
                continue

        if not http_urls:
            return []

        request = WebCrawlRequest(
            urls=http_urls,
            extract_depth=extract_depth,
            format=format
        )
        result = await self.tavilyService.crawl(request)

        if result.success and result.documents:
            return result.documents[0].documentData.results
        return []

    async def extract_content(self, urls: List[str], extract_depth: str = "advanced", format: str = "markdown") -> Dict[str, str]:
        """Extract content from URLs and return a URL -> content dictionary."""
        crawl_results = await self.crawl_websites(urls, extract_depth, format)
        return {str(result.url): result.content for result in crawl_results}

    # Core Web Tools - clean interface for web operations
    async def readPage(self, url: str, extract_depth: str = "advanced") -> Optional[str]:
        """Read a single web page and return its content (HTML/Markdown)."""
        logger.debug(f"Reading page: {url}")
        try:
            # URL-encode the URL to handle spaces and special characters
            from urllib.parse import quote, urlparse, urlunparse
            parsed = urlparse(url)
            encoded_url = urlunparse((
                parsed.scheme,
                parsed.netloc,
                parsed.path,
                parsed.params,
                parsed.query,
                parsed.fragment
            ))

            # Manually encode query parameters to handle spaces
            if parsed.query:
                encoded_query = quote(parsed.query, safe='=&')
                encoded_url = urlunparse((
                    parsed.scheme,
                    parsed.netloc,
                    parsed.path,
                    parsed.params,
                    encoded_query,
                    parsed.fragment
                ))

            logger.debug(f"URL encoded: {url} -> {encoded_url}")

            content = await self.extract_content([encoded_url], extract_depth, "markdown")
            result = content.get(encoded_url)
            if result:
                logger.debug(f"Successfully read page {encoded_url}: {len(result)} chars")
            else:
                logger.warning(f"No content returned for page {encoded_url}")
            return result
        except Exception as e:
            logger.warning(f"Failed to read page {url}: {e}")
            return None

    async def getUrlsFromPage(self, url: str, extract_depth: str = "advanced") -> List[str]:
        """Get all URLs from a web page, with redundancies removed."""
        try:
            content = await self.readPage(url, extract_depth)
            if not content:
                return []

            links = self._extractLinksFromContent(content, url)
            # Remove duplicates while preserving order
            seen = set()
            unique_links = []
            for link in links:
                if link not in seen:
                    seen.add(link)
                    unique_links.append(link)

            logger.debug(f"Extracted {len(unique_links)} unique URLs from {url}")
            return unique_links

        except Exception as e:
            logger.warning(f"Failed to get URLs from page {url}: {e}")
            return []

    def filterUrlsOnlyPages(self, urls: List[str], max_per_domain: int = 10) -> List[str]:
        """Filter URLs to keep only page links worth following (no images, media, or static assets)."""
        from urllib.parse import urlparse

        def _isHtmlCandidate(url: str) -> bool:
            lower = url.lower()
            blocked = ('.jpg', '.jpeg', '.png', '.gif', '.svg', '.webp', '.ico', '.bmp',
                       '.mp4', '.mp3', '.avi', '.mov', '.mkv',
                       '.pdf', '.zip', '.rar', '.7z', '.tar', '.gz',
                       '.css', '.js', '.woff', '.woff2', '.ttf', '.eot')
            return not lower.endswith(blocked)

        # Group by domain
        domain_links = {}
        for link in urls:
            domain = urlparse(link).netloc
            if domain not in domain_links:
                domain_links[domain] = []
            domain_links[domain].append(link)

        # Filter and cap per domain
        filtered_links = []
        for domain, domain_link_list in domain_links.items():
            seen = set()
            domain_filtered = []

            for link in domain_link_list:
                if link in seen:
                    continue
                if not _isHtmlCandidate(link):
                    continue
                seen.add(link)
                domain_filtered.append(link)
                if len(domain_filtered) >= max_per_domain:
                    break

            filtered_links.extend(domain_filtered)
            logger.debug(f"Domain {domain}: {len(domain_link_list)} -> {len(domain_filtered)} links")

        return filtered_links

    def _extractLinksFromContent(self, content: str, base_url: str) -> List[str]:
        """Extract links from HTML/Markdown content."""
        try:
            import re
            from urllib.parse import urljoin, urlparse, quote, urlunparse

            def _cleanUrl(url: str) -> str:
                """Clean and encode URL to remove spaces and invalid characters."""
                # Remove quotes and extra spaces
                url = url.strip().strip('"\'')

                # If it's a relative URL, make it absolute first
                if not url.startswith(('http://', 'https://')):
                    url = urljoin(base_url, url)

                # Parse and re-encode the URL properly
                parsed = urlparse(url)
                if parsed.query:
                    # Encode query parameters properly
                    encoded_query = quote(parsed.query, safe='=&')
                    url = urlunparse((
                        parsed.scheme,
                        parsed.netloc,
                        parsed.path,
                        parsed.params,
                        encoded_query,
                        parsed.fragment
                    ))

                return url

            links = []

            # Extract HTML links: <a href="url"> format
            html_link_pattern = r'<a[^>]+href=["\']([^"\']+)["\'][^>]*>'
            html_links = re.findall(html_link_pattern, content, re.IGNORECASE)

            for url in html_links:
                if url and not url.startswith('#') and not url.startswith('javascript:'):
                    try:
                        cleaned_url = _cleanUrl(url)
                        links.append(cleaned_url)
                        logger.debug(f"Extracted HTML link: {url} -> {cleaned_url}")
                    except Exception as e:
                        logger.debug(f"Failed to clean HTML link {url}: {e}")

            # Extract markdown links: [text](url) format
            markdown_link_pattern = r'\[([^\]]+)\]\(([^)]+)\)'
            markdown_links = re.findall(markdown_link_pattern, content)

            for text, url in markdown_links:
                if url and not url.startswith('#'):
                    try:
                        cleaned_url = _cleanUrl(url)
                        # Only keep URLs from the same domain
                        if urlparse(cleaned_url).netloc == urlparse(base_url).netloc:
                            links.append(cleaned_url)
                            logger.debug(f"Extracted markdown link: {url} -> {cleaned_url}")
                    except Exception as e:
                        logger.debug(f"Failed to clean markdown link {url}: {e}")

            # Extract plain URLs in the text
            url_pattern = r'https?://[^\s\)]+'
            plain_urls = re.findall(url_pattern, content)

            for url in plain_urls:
                try:
                    clean_url = url.rstrip('.,;!?')
                    cleaned_url = _cleanUrl(clean_url)
                    if urlparse(cleaned_url).netloc == urlparse(base_url).netloc:
                        if cleaned_url not in links:  # Avoid duplicates
                            links.append(cleaned_url)
                            logger.debug(f"Extracted plain URL: {url} -> {cleaned_url}")
                except Exception as e:
                    logger.debug(f"Failed to clean plain URL {url}: {e}")

            logger.debug(f"Total links extracted and cleaned: {len(links)}")
            return links

        except Exception as e:
            logger.warning(f"Failed to extract links from content: {e}")
            return []

    async def crawlRecursively(self, urls: List[str], max_depth: int, extract_depth: str = "advanced", max_per_domain: int = 10) -> Dict[str, str]:
        """
        Recursively crawl URLs up to the specified depth.

        Args:
            urls: List of starting URLs to crawl
            max_depth: Maximum depth to crawl (1=main pages only, 2=main+sub-pages, etc.)
            extract_depth: Tavily extract depth setting
            max_per_domain: Maximum URLs per domain per level

        Returns:
            Dictionary mapping URL -> content for all crawled pages
        """
        logger.info(f"Starting recursive crawl: {len(urls)} starting URLs, max_depth={max_depth}")

        # URL index to track all processed URLs
        processed_urls = set()
        all_content = {}

        # Current level URLs to process
        current_level_urls = urls.copy()

        for depth in range(1, max_depth + 1):
            logger.info(f"=== DEPTH LEVEL {depth}/{max_depth} ===")
            logger.info(f"Processing {len(current_level_urls)} URLs at depth {depth}")

            # URLs found at this level (for next iteration)
            next_level_urls = []

            for url in current_level_urls:
                if url in processed_urls:
                    logger.debug(f"URL {url} already processed, skipping")
                    continue

                try:
                    logger.info(f"Processing URL at depth {depth}: {url}")

                    # Read page content
                    content = await self.readPage(url, extract_depth)
                    if content:
                        all_content[url] = content
                        processed_urls.add(url)
                        logger.info(f"✓ Successfully processed {url}: {len(content)} chars")

                        # Get URLs from this page for next level
                        page_urls = await self.getUrlsFromPage(url, extract_depth)
                        logger.info(f"Found {len(page_urls)} URLs on {url}")

                        # Filter URLs and add to next level
                        filtered_urls = self.filterUrlsOnlyPages(page_urls, max_per_domain)
                        logger.info(f"Filtered to {len(filtered_urls)} valid URLs")

                        # Add new URLs to next level (avoiding already processed ones)
                        new_urls_count = 0
                        for new_url in filtered_urls:
                            if new_url not in processed_urls:
                                next_level_urls.append(new_url)
                                new_urls_count += 1

                        logger.info(f"Added {new_urls_count} new URLs to next level from {url}")
                    else:
                        logger.warning(f"✗ No content extracted from {url}")
                        processed_urls.add(url)  # Mark as processed to avoid retry

                except Exception as e:
                    logger.warning(f"✗ Failed to process URL {url} at depth {depth}: {e}")
                    processed_urls.add(url)  # Mark as processed to avoid retry

            # Prepare for next iteration
            current_level_urls = next_level_urls
            logger.info(f"Depth {depth} completed. Found {len(next_level_urls)} URLs for next level")

            # Stop if no more URLs to process
            if not current_level_urls:
                logger.info(f"No more URLs found at depth {depth}, stopping recursion")
                break

        logger.info(f"Recursive crawl completed: {len(all_content)} total pages crawled")
        return all_content

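    # Sketch: crawl a documentation site two levels deep and stitch the pages together
    # (illustrative only; `ai` is an assumed AiObjects instance and the URL is a placeholder):
    #
    #     pages = await ai.crawlRecursively(["https://example.com/docs"], max_depth=2)
    #     combined = "\n\n".join(f"# {u}\n{text}" for u, text in pages.items())
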
    async def webQuery(self, query: str, context: str = "", options: Optional[AiCallOptions] = None) -> str:
        """Use Perplexity AI to answer web-related queries."""
        if options is None:
            options = AiCallOptions(operationType=OperationType.WEB_RESEARCH)

        # Create a comprehensive prompt for web queries
        webPrompt = f"""You are an expert web researcher and information analyst. Please provide a comprehensive and accurate answer to the following web-related query.

Query: {query}

{f"Additional Context: {context}" if context else ""}

Please provide:
1. A clear, well-structured answer to the query
2. Key points and important details
3. Relevant insights and analysis
4. Any important considerations or caveats
5. Suggestions for further research if applicable

Format your response in a clear, professional manner that would be helpful for someone researching this topic."""

        try:
            # Use Perplexity for web research with search capabilities
            response = await self.perplexityService.callAiWithWebSearch(webPrompt)
            return response
        except Exception as e:
            logger.error(f"Perplexity web query failed: {str(e)}")
            raise Exception(f"Failed to process web query: {str(e)}") from e

    # Utility methods
    async def listAvailableModels(self, connectorType: Optional[str] = None) -> List[Dict[str, Any]]:
        """List available models, optionally filtered by connector type."""
        if connectorType:
            return [info for name, info in aiModels.items() if info["connector"] == connectorType]
        return list(aiModels.values())

    async def getModelInfo(self, modelName: str) -> Dict[str, Any]:
        """Get information about a specific model."""
        if modelName not in aiModels:
            raise ValueError(f"Model {modelName} not found")
        return aiModels[modelName]

    async def getModelsByCapability(self, capability: str) -> List[str]:
        """Get model names that support a specific capability."""
        return [name for name, info in aiModels.items() if capability in info.get("capabilities", [])]

    async def getModelsByTag(self, tag: str) -> List[str]:
        """Get model names that have a specific tag."""
        return [name for name, info in aiModels.items() if tag in info.get("tags", [])]

    async def selectRelevantWebsites(self, websites: List[str], userQuestion: str) -> Tuple[List[str], str]:
        """Select the most relevant websites using AI analysis. Returns (selected_websites, ai_response)."""
        if len(websites) <= 1:
            return websites, "Only one website available, no selection needed"

        try:
            from urllib.parse import urlparse

            # Create website summaries for AI analysis
            websiteSummaries = []
            for i, url in enumerate(websites, 1):
                domain = urlparse(url).netloc
                summary = f"{i}. {url} (Domain: {domain})"
                websiteSummaries.append(summary)

            selectionPrompt = f"""
Based on this user request: "{userQuestion}"

I found {len(websites)} candidate websites. Please select the most relevant website(s) for this request.

Available websites:
{chr(10).join(websiteSummaries)}

Please respond with the website number(s) (1, 2, 3, etc.) that are most relevant.
Format: 1,3,5 (or just 1 for single selection)
"""

            # Use Perplexity to select the best websites
            response = await self.webQuery(selectionPrompt)

            # Parse the selection
            import re
            numbers = re.findall(r'\d+', response)
            if numbers:
                selectedWebsites = []
                for num in numbers:
                    index = int(num) - 1
                    if 0 <= index < len(websites):
                        selectedWebsites.append(websites[index])

                if selectedWebsites:
                    logger.info(f"AI selected {len(selectedWebsites)} websites")
                    return selectedWebsites, response

            # Fallback to first website
            logger.warning("AI selection failed, using first website")
            return websites[:1], f"AI selection failed, fallback to first website. AI response: {response}"

        except Exception as e:
            logger.error(f"Error in website selection: {str(e)}")
            return websites[:1], f"Error in website selection: {str(e)}"

    async def analyzeContentWithChunking(self, allContent: Dict[str, str], userQuestion: str) -> str:
        """Analyze content using AI with chunking for large content."""
        logger.info(f"Analyzing {len(allContent)} websites with AI")

        # Process content in chunks to avoid token limits
        chunkSize = 50000  # 50k chars per chunk
        allChunks = []

        for url, content in allContent.items():
            filteredContent = self._filterContent(content)
            if len(filteredContent) <= chunkSize:
                allChunks.append((url, filteredContent))
                logger.info(f"Content from {url}: {len(filteredContent)} chars (single chunk)")
            else:
                # Split large content into chunks
                chunkCount = (len(filteredContent) + chunkSize - 1) // chunkSize
                logger.info(f"Content from {url}: {len(filteredContent)} chars (split into {chunkCount} chunks)")
                for i in range(0, len(filteredContent), chunkSize):
                    chunk = filteredContent[i:i + chunkSize]
                    chunkNum = i // chunkSize + 1
                    allChunks.append((f"{url} (part {chunkNum})", chunk))

        logger.info(f"Processing {len(allChunks)} content chunks")

        # Analyze each chunk
        chunkAnalyses = []
        for i, (url, chunk) in enumerate(allChunks, 1):
            logger.info(f"Analyzing chunk {i}/{len(allChunks)}: {url}")

            try:
                analysisPrompt = f"""
Analyze this web content and extract relevant information for: {userQuestion}

Source: {url}
Content: {chunk}

Please extract key information relevant to the query.
"""

                analysis = await self.webQuery(analysisPrompt)
                chunkAnalyses.append(analysis)
                logger.info(f"Chunk {i}/{len(allChunks)} analyzed successfully")

            except Exception as e:
                logger.error(f"Chunk {i}/{len(allChunks)} error: {e}")

        # Combine all chunk analyses
        if chunkAnalyses:
            logger.info(f"Combining {len(chunkAnalyses)} chunk analyses")
            combinedAnalysis = "\n\n".join(chunkAnalyses)

            # Final synthesis
            try:
                logger.info("Performing final synthesis of all analyses")
                synthesisPrompt = f"""
Based on these partial analyses, provide a comprehensive answer to: {userQuestion}

Partial analyses:
{combinedAnalysis}

Please provide a clear, well-structured answer to the query.
"""

                finalAnalysis = await self.webQuery(synthesisPrompt)
                logger.info("Final synthesis completed successfully")
                return finalAnalysis

            except Exception as e:
                logger.error(f"Synthesis error: {e}")
                return combinedAnalysis
        else:
            logger.error("No content could be analyzed")
            return "No content could be analyzed"

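    # Sketch of the research pipeline these methods are built for (illustrative; the
    # `.url` attribute on search result items is an assumption about WebSearchResultItem,
    # which is defined in datamodelWeb, and `question` is assumed caller-supplied):
    #
    #     hits = await ai.search_websites("kubernetes operator best practices", max_results=5)
    #     urls = [str(item.url) for item in hits]
    #     chosen, _ = await ai.selectRelevantWebsites(urls, question)
    #     content = await ai.extract_content(chosen)
    #     answer = await ai.analyzeContentWithChunking(content, question)
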
    def _filterContent(self, content: str) -> str:
        """Filter out navigation, ads, and other boilerplate content."""
        lines = content.split('\n')
        filteredLines = []

        for line in lines:
            line = line.strip()
            # Skip empty lines
            if not line:
                continue
            # Skip navigation elements
            if any(skip in line.lower() for skip in [
                'toggle navigation', 'log in', 'sign up', 'cookies', 'privacy policy',
                'terms of service', 'subscribe', 'newsletter', 'follow us', 'share this',
                'advertisement', 'sponsored', 'banner', 'popup', 'modal'
            ]):
                continue
            # Skip image references without context
            if line.startswith('![') and line.endswith(')') and '---' in line:
                continue
            # Keep meaningful content
            if len(line) > 10:  # Skip very short lines
                filteredLines.append(line)

        return '\n'.join(filteredLines)