End-to-end implementation of all AI models
This commit is contained in:
parent daddf417be
commit 8d25ed6fc3
7 changed files with 1715 additions and 1095 deletions
@@ -317,10 +317,17 @@ class AiOpenai(BaseConnectorAi):
messages = modelCall.messages
model = modelCall.model
options = modelCall.options
prompt = messages[0]["content"] if messages else ""
size = options.get("size", "1024x1024")
quality = options.get("quality", "standard")
style = options.get("style", "vivid")

# Parse unified prompt JSON format
promptContent = messages[0]["content"] if messages else ""
import json
promptData = json.loads(promptContent)

# Extract parameters from unified prompt JSON
prompt = promptData.get("prompt", promptContent)
size = promptData.get("size", "1024x1024")
quality = promptData.get("quality", "standard")
style = promptData.get("style", "vivid")

logger.debug(f"Starting image generation with prompt: '{prompt[:100]}...'")
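For reference, a minimal sketch of the unified prompt JSON that the image-generation branch above unpacks. The field names (`prompt`, `size`, `quality`, `style`) mirror the `promptData.get(...)` calls in the hunk; the plain-text fallback in the parser is an assumption, since the diff calls `json.loads` directly and would raise on a non-JSON prompt.

```python
import json

def buildImagePrompt(prompt: str, size: str = "1024x1024",
                     quality: str = "standard", style: str = "vivid") -> str:
    """Serialize the unified image-generation prompt parsed in the hunk above."""
    return json.dumps({"prompt": prompt, "size": size, "quality": quality, "style": style})

def parseImagePrompt(promptContent: str) -> dict:
    """Parse the unified prompt; falling back to a plain-text prompt is an assumption."""
    try:
        data = json.loads(promptContent)
    except json.JSONDecodeError:
        data = {}
    return {
        "prompt": data.get("prompt", promptContent),
        "size": data.get("size", "1024x1024"),
        "quality": data.get("quality", "standard"),
        "style": data.get("style", "vivid"),
    }

content = buildImagePrompt("a watercolor fox")
print(parseImagePrompt(content)["size"])  # 1024x1024
```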
@@ -80,7 +80,7 @@ class AiPerplexity(BaseConnectorAi):
speedRating=6, # Slower due to AI analysis
qualityRating=10, # Best AI analysis quality
# capabilities removed (not used in business logic)
functionCall=self.callAiWithWebSearch,
functionCall=self.callWebOperation,
priority=PriorityEnum.QUALITY,
processingMode=ProcessingModeEnum.DETAILED,
operationTypes=createOperationTypeRatings(
@@ -106,7 +106,7 @@ class AiPerplexity(BaseConnectorAi):
speedRating=9, # Fast for basic AI tasks
qualityRating=7, # Good but not premium quality
# capabilities removed (not used in business logic)
functionCall=self.researchTopic,
functionCall=self.callWebOperation,
priority=PriorityEnum.COST,
processingMode=ProcessingModeEnum.BASIC,
operationTypes=createOperationTypeRatings(
@@ -132,7 +132,7 @@ class AiPerplexity(BaseConnectorAi):
speedRating=9, # Fast for Q&A tasks
qualityRating=7, # Good but not premium quality
# capabilities removed (not used in business logic)
functionCall=self.answerQuestion,
functionCall=self.callWebOperation,
priority=PriorityEnum.COST,
processingMode=ProcessingModeEnum.BASIC,
operationTypes=createOperationTypeRatings(
@@ -158,7 +158,7 @@ class AiPerplexity(BaseConnectorAi):
speedRating=9, # Fast for news tasks
qualityRating=7, # Good but not premium quality
# capabilities removed (not used in business logic)
functionCall=self.getCurrentNews,
functionCall=self.callWebOperation,
priority=PriorityEnum.COST,
processingMode=ProcessingModeEnum.BASIC,
operationTypes=createOperationTypeRatings(
@@ -254,9 +254,48 @@ class AiPerplexity(BaseConnectorAi):
temperature = options.get("temperature", model.temperature)
maxTokens = model.maxTokens

# Parse unified prompt JSON format
promptContent = messages[0]["content"] if messages else ""
import json
promptData = json.loads(promptContent)

# Create a more specific prompt for Perplexity based on the unified format
searchPrompt = promptData.get("searchPrompt", promptContent)
maxResults = promptData.get("maxResults", 5)
timeRange = promptData.get("timeRange")
country = promptData.get("country")
language = promptData.get("language")

# Create enhanced prompt for Perplexity
enhancedPrompt = f"""Search the web for: {searchPrompt}

Please provide a comprehensive response with relevant URLs and information.
Focus on finding {maxResults} most relevant results.
{f"Limit results to the last {timeRange}" if timeRange else ""}
{f"Focus on {country}" if country else ""}
{f"Provide results in {language}" if language else ""}

Please format your response as a JSON object with the following structure:
{{
"query": "{searchPrompt}",
"results": [
{{
"title": "Result title",
"url": "https://example.com",
"content": "Brief description or excerpt"
}}
],
"total_count": number_of_results
}}

Include actual URLs in your response."""

# Update the messages with the enhanced prompt
enhancedMessages = [{"role": "user", "content": enhancedPrompt}]

payload = {
"model": model.name,
"messages": messages,
"messages": enhancedMessages,
"temperature": temperature,
"max_tokens": maxTokens
}
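A small sketch of how a caller might assemble the unified search prompt that the hunk above unpacks (`searchPrompt`, `maxResults`, `timeRange`, `country`, `language`); the helper name is illustrative, not part of the diff.

```python
import json
from typing import Optional

def buildSearchPrompt(searchPrompt: str, maxResults: int = 5,
                      timeRange: Optional[str] = None, country: Optional[str] = None,
                      language: Optional[str] = None) -> str:
    """Build the unified web-search prompt JSON (keys match the diff above)."""
    payload = {"searchPrompt": searchPrompt, "maxResults": maxResults}
    # Optional filters are only included when set, so promptData.get(...) stays None otherwise.
    for key, value in (("timeRange", timeRange), ("country", country), ("language", language)):
        if value is not None:
            payload[key] = value
    return json.dumps(payload)

messages = [{"role": "user", "content": buildSearchPrompt("latest FastAPI release", maxResults=3, timeRange="m")}]
```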
@@ -472,6 +511,423 @@ class AiPerplexity(BaseConnectorAi):
|
|||
logger.error(f"Error getting current news: {str(e)}")
|
||||
raise HTTPException(status_code=500, detail=f"Error getting current news: {str(e)}")
|
||||
|
||||
async def crawl(self, modelCall: AiModelCall) -> AiModelResponse:
|
||||
"""
|
||||
Crawl URLs using Perplexity's web search capabilities for content extraction.
|
||||
|
||||
Args:
|
||||
modelCall: AiModelCall with messages and options
|
||||
|
||||
Returns:
|
||||
AiModelResponse with content and metadata
|
||||
"""
|
||||
try:
|
||||
# Extract parameters from modelCall
|
||||
messages = modelCall.messages
|
||||
model = modelCall.model
|
||||
options = modelCall.options
|
||||
temperature = options.get("temperature", model.temperature)
|
||||
maxTokens = model.maxTokens
|
||||
|
||||
# Parse unified prompt JSON format
|
||||
promptContent = messages[0]["content"] if messages else ""
|
||||
import json
|
||||
promptData = json.loads(promptContent)
|
||||
|
||||
# Extract parameters from unified prompt JSON
|
||||
urls = promptData.get("urls", [])
|
||||
extractDepth = promptData.get("extractDepth", "advanced")
|
||||
formatType = promptData.get("format", "markdown")
|
||||
|
||||
if not urls:
|
||||
return AiModelResponse(
|
||||
content="No URLs provided for crawling",
|
||||
success=False,
|
||||
error="No URLs found in prompt data"
|
||||
)
|
||||
|
||||
# Create enhanced prompt for Perplexity to crawl URLs
|
||||
urlsList = ", ".join(urls)
|
||||
enhancedPrompt = f"""Please extract and analyze content from these URLs: {urlsList}
|
||||
|
||||
Extraction requirements:
|
||||
- Extract depth: {extractDepth}
|
||||
- Output format: {formatType}
|
||||
- Focus on main content, not navigation or ads
|
||||
- Preserve important structure and formatting
|
||||
|
||||
Please format your response as a JSON object with the following structure:
|
||||
{{
|
||||
"urls": {json.dumps(urls)},
|
||||
"results": [
|
||||
{{
|
||||
"url": "https://example.com",
|
||||
"title": "Page title",
|
||||
"content": "Extracted content in {formatType} format",
|
||||
"extractedAt": "2024-01-01T00:00:00Z"
|
||||
}}
|
||||
],
|
||||
"total_count": number_of_urls_processed
|
||||
}}
|
||||
|
||||
Extract content from each URL and provide detailed analysis."""
|
||||
|
||||
# Update the messages with the enhanced prompt
|
||||
enhancedMessages = [{"role": "user", "content": enhancedPrompt}]
|
||||
|
||||
payload = {
|
||||
"model": model.name,
|
||||
"messages": enhancedMessages,
|
||||
"temperature": temperature,
|
||||
"max_tokens": maxTokens
|
||||
}
|
||||
|
||||
response = await self.httpClient.post(
|
||||
model.apiUrl,
|
||||
json=payload
|
||||
)
|
||||
|
||||
if response.status_code != 200:
|
||||
error_detail = f"Perplexity Crawl API error: {response.status_code} - {response.text}"
|
||||
logger.error(error_detail)
|
||||
|
||||
if response.status_code == 429:
|
||||
error_message = "Rate limit exceeded for crawl. Please wait before making another request."
|
||||
elif response.status_code == 401:
|
||||
error_message = "Invalid API key for crawl. Please check your Perplexity API configuration."
|
||||
elif response.status_code == 400:
|
||||
error_message = f"Invalid request to Perplexity Crawl API: {response.text}"
|
||||
else:
|
||||
error_message = f"Perplexity Crawl API error ({response.status_code}): {response.text}"
|
||||
|
||||
raise HTTPException(status_code=500, detail=error_message)
|
||||
|
||||
responseJson = response.json()
|
||||
content = responseJson["choices"][0]["message"]["content"]
|
||||
|
||||
return AiModelResponse(
|
||||
content=content,
|
||||
success=True,
|
||||
modelId=model.name,
|
||||
metadata={"response_id": responseJson.get("id", "")}
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error calling Perplexity Crawl API: {str(e)}")
|
||||
raise HTTPException(status_code=500, detail=f"Error calling Perplexity Crawl API: {str(e)}")
|
||||
|
||||
async def callWebOperation(self, modelCall: AiModelCall) -> AiModelResponse:
|
||||
"""
|
||||
Universal web operation handler that distributes to the correct method
|
||||
based on the operationType from AiCallOptions.
|
||||
"""
|
||||
try:
|
||||
options = modelCall.options
|
||||
operationType = options.get("operationType")
|
||||
|
||||
if operationType == "WEB_SEARCH":
|
||||
return await self.callAiWithWebSearch(modelCall)
|
||||
elif operationType == "WEB_CRAWL":
|
||||
return await self.crawl(modelCall)
|
||||
elif operationType == "WEB_RESEARCH":
|
||||
return await self.research(modelCall)
|
||||
elif operationType == "WEB_QUESTIONS":
|
||||
return await self.questions(modelCall)
|
||||
elif operationType == "WEB_NEWS":
|
||||
return await self.news(modelCall)
|
||||
else:
|
||||
# Fallback to research for unknown operation types
|
||||
return await self.research(modelCall)
|
||||
|
||||
except Exception as e:
|
||||
return AiModelResponse(
|
||||
content="",
|
||||
success=False,
|
||||
error=str(e)
|
||||
)
|
||||
|
||||
async def research(self, modelCall: AiModelCall) -> AiModelResponse:
|
||||
"""
|
||||
Research topics using Perplexity's web search capabilities.
|
||||
|
||||
Args:
|
||||
modelCall: AiModelCall with messages and options
|
||||
|
||||
Returns:
|
||||
AiModelResponse with research content and metadata
|
||||
"""
|
||||
try:
|
||||
# Extract parameters from modelCall
|
||||
messages = modelCall.messages
|
||||
model = modelCall.model
|
||||
options = modelCall.options
|
||||
temperature = options.get("temperature", model.temperature)
|
||||
maxTokens = model.maxTokens
|
||||
|
||||
# Parse unified prompt JSON format
|
||||
promptContent = messages[0]["content"] if messages else ""
|
||||
import json
|
||||
promptData = json.loads(promptContent)
|
||||
|
||||
# Extract parameters from unified prompt JSON
|
||||
researchPrompt = promptData.get("researchPrompt", promptContent)
|
||||
maxResults = promptData.get("maxResults", 8)
|
||||
timeRange = promptData.get("timeRange")
|
||||
country = promptData.get("country")
|
||||
language = promptData.get("language")
|
||||
|
||||
# Create enhanced prompt for research
|
||||
enhancedPrompt = f"""Conduct comprehensive research on: {researchPrompt}
|
||||
|
||||
Research requirements:
|
||||
- Provide detailed analysis and insights
|
||||
- Include multiple perspectives and sources
|
||||
- Focus on finding {maxResults} most relevant sources
|
||||
{f"Limit results to the last {timeRange}" if timeRange else ""}
|
||||
{f"Focus on {country}" if country else ""}
|
||||
{f"Provide results in {language}" if language else ""}
|
||||
|
||||
Please format your response as a JSON object with the following structure:
|
||||
{{
|
||||
"query": "{researchPrompt}",
|
||||
"research_results": [
|
||||
{{
|
||||
"title": "Source title",
|
||||
"url": "https://example.com",
|
||||
"summary": "Brief summary",
|
||||
"content": "Detailed content",
|
||||
"extractedAt": "2024-01-01T00:00:00Z"
|
||||
}}
|
||||
],
|
||||
"total_count": number_of_sources,
|
||||
"operation_type": "research"
|
||||
}}
|
||||
|
||||
Provide comprehensive research with detailed analysis."""
|
||||
|
||||
# Update the messages with the enhanced prompt
|
||||
enhancedMessages = [{"role": "user", "content": enhancedPrompt}]
|
||||
|
||||
payload = {
|
||||
"model": model.name,
|
||||
"messages": enhancedMessages,
|
||||
"temperature": temperature,
|
||||
"max_tokens": maxTokens
|
||||
}
|
||||
|
||||
response = await self.httpClient.post(
|
||||
model.apiUrl,
|
||||
json=payload
|
||||
)
|
||||
|
||||
if response.status_code != 200:
|
||||
error_detail = f"Perplexity Research API error: {response.status_code} - {response.text}"
|
||||
logger.error(error_detail)
|
||||
raise HTTPException(status_code=500, detail=error_detail)
|
||||
|
||||
responseJson = response.json()
|
||||
content = responseJson["choices"][0]["message"]["content"]
|
||||
|
||||
return AiModelResponse(
|
||||
content=content,
|
||||
success=True,
|
||||
modelId=model.name,
|
||||
metadata={"response_id": responseJson.get("id", "")}
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error calling Perplexity Research API: {str(e)}")
|
||||
raise HTTPException(status_code=500, detail=f"Error calling Perplexity Research API: {str(e)}")
|
||||
|
||||
async def questions(self, modelCall: AiModelCall) -> AiModelResponse:
|
||||
"""
|
||||
Answer questions using Perplexity's web search capabilities.
|
||||
|
||||
Args:
|
||||
modelCall: AiModelCall with messages and options
|
||||
|
||||
Returns:
|
||||
AiModelResponse with answer and supporting sources
|
||||
"""
|
||||
try:
|
||||
# Extract parameters from modelCall
|
||||
messages = modelCall.messages
|
||||
model = modelCall.model
|
||||
options = modelCall.options
|
||||
temperature = options.get("temperature", model.temperature)
|
||||
maxTokens = model.maxTokens
|
||||
|
||||
# Parse unified prompt JSON format
|
||||
promptContent = messages[0]["content"] if messages else ""
|
||||
import json
|
||||
promptData = json.loads(promptContent)
|
||||
|
||||
# Extract parameters from unified prompt JSON
|
||||
question = promptData.get("question", promptContent)
|
||||
context = promptData.get("context", "")
|
||||
maxResults = promptData.get("maxResults", 6)
|
||||
timeRange = promptData.get("timeRange")
|
||||
country = promptData.get("country")
|
||||
language = promptData.get("language")
|
||||
|
||||
# Create enhanced prompt for questions
|
||||
contextText = f"\nAdditional context: {context}" if context else ""
|
||||
enhancedPrompt = f"""Answer this question using web research: {question}{contextText}
|
||||
|
||||
Answer requirements:
|
||||
- Provide a comprehensive answer with supporting evidence
|
||||
- Include {maxResults} most relevant sources
|
||||
- Cite sources with URLs
|
||||
{f"Focus on recent information (last {timeRange})" if timeRange else ""}
|
||||
{f"Focus on {country}" if country else ""}
|
||||
{f"Provide answer in {language}" if language else ""}
|
||||
|
||||
Please format your response as a JSON object with the following structure:
|
||||
{{
|
||||
"question": "{question}",
|
||||
"answer": "Comprehensive answer to the question",
|
||||
"answer_sources": [
|
||||
{{
|
||||
"title": "Source title",
|
||||
"url": "https://example.com",
|
||||
"summary": "Brief summary",
|
||||
"content": "Relevant content excerpt",
|
||||
"relevance": "Why this source is relevant"
|
||||
}}
|
||||
],
|
||||
"total_count": number_of_sources,
|
||||
"operation_type": "questions"
|
||||
}}
|
||||
|
||||
Provide a detailed answer with well-cited sources."""
|
||||
|
||||
# Update the messages with the enhanced prompt
|
||||
enhancedMessages = [{"role": "user", "content": enhancedPrompt}]
|
||||
|
||||
payload = {
|
||||
"model": model.name,
|
||||
"messages": enhancedMessages,
|
||||
"temperature": temperature,
|
||||
"max_tokens": maxTokens
|
||||
}
|
||||
|
||||
response = await self.httpClient.post(
|
||||
model.apiUrl,
|
||||
json=payload
|
||||
)
|
||||
|
||||
if response.status_code != 200:
|
||||
error_detail = f"Perplexity Questions API error: {response.status_code} - {response.text}"
|
||||
logger.error(error_detail)
|
||||
raise HTTPException(status_code=500, detail=error_detail)
|
||||
|
||||
responseJson = response.json()
|
||||
content = responseJson["choices"][0]["message"]["content"]
|
||||
|
||||
return AiModelResponse(
|
||||
content=content,
|
||||
success=True,
|
||||
modelId=model.name,
|
||||
metadata={"response_id": responseJson.get("id", "")}
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error calling Perplexity Questions API: {str(e)}")
|
||||
raise HTTPException(status_code=500, detail=f"Error calling Perplexity Questions API: {str(e)}")
|
||||
|
||||
async def news(self, modelCall: AiModelCall) -> AiModelResponse:
|
||||
"""
|
||||
Search and analyze news using Perplexity's web search capabilities.
|
||||
|
||||
Args:
|
||||
modelCall: AiModelCall with messages and options
|
||||
|
||||
Returns:
|
||||
AiModelResponse with news articles and analysis
|
||||
"""
|
||||
try:
|
||||
# Extract parameters from modelCall
|
||||
messages = modelCall.messages
|
||||
model = modelCall.model
|
||||
options = modelCall.options
|
||||
temperature = options.get("temperature", model.temperature)
|
||||
maxTokens = model.maxTokens
|
||||
|
||||
# Parse unified prompt JSON format
|
||||
promptContent = messages[0]["content"] if messages else ""
|
||||
import json
|
||||
promptData = json.loads(promptContent)
|
||||
|
||||
# Extract parameters from unified prompt JSON
|
||||
newsPrompt = promptData.get("newsPrompt", promptContent)
|
||||
maxResults = promptData.get("maxResults", 10)
|
||||
timeRange = promptData.get("timeRange", "w") # Default to week for news
|
||||
country = promptData.get("country")
|
||||
language = promptData.get("language")
|
||||
|
||||
# Create enhanced prompt for news
|
||||
enhancedPrompt = f"""Find and analyze recent news about: {newsPrompt}
|
||||
|
||||
News requirements:
|
||||
- Find {maxResults} most recent and relevant news articles
|
||||
- Focus on the last {timeRange} (recent news)
|
||||
- Include diverse sources and perspectives
|
||||
{f"Focus on news from {country}" if country else ""}
|
||||
{f"Provide news in {language}" if language else ""}
|
||||
|
||||
Please format your response as a JSON object with the following structure:
|
||||
{{
|
||||
"news_query": "{newsPrompt}",
|
||||
"articles": [
|
||||
{{
|
||||
"title": "Article title",
|
||||
"url": "https://example.com",
|
||||
"content": "Article content",
|
||||
"date": "2024-01-01",
|
||||
"source": "News source name",
|
||||
"summary": "Brief summary of the article"
|
||||
}}
|
||||
],
|
||||
"total_count": number_of_articles,
|
||||
"operation_type": "news"
|
||||
}}
|
||||
|
||||
Provide comprehensive news coverage with analysis."""
|
||||
|
||||
# Update the messages with the enhanced prompt
|
||||
enhancedMessages = [{"role": "user", "content": enhancedPrompt}]
|
||||
|
||||
payload = {
|
||||
"model": model.name,
|
||||
"messages": enhancedMessages,
|
||||
"temperature": temperature,
|
||||
"max_tokens": maxTokens
|
||||
}
|
||||
|
||||
response = await self.httpClient.post(
|
||||
model.apiUrl,
|
||||
json=payload
|
||||
)
|
||||
|
||||
if response.status_code != 200:
|
||||
error_detail = f"Perplexity News API error: {response.status_code} - {response.text}"
|
||||
logger.error(error_detail)
|
||||
raise HTTPException(status_code=500, detail=error_detail)
|
||||
|
||||
responseJson = response.json()
|
||||
content = responseJson["choices"][0]["message"]["content"]
|
||||
|
||||
return AiModelResponse(
|
||||
content=content,
|
||||
success=True,
|
||||
modelId=model.name,
|
||||
metadata={"response_id": responseJson.get("id", "")}
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error calling Perplexity News API: {str(e)}")
|
||||
raise HTTPException(status_code=500, detail=f"Error calling Perplexity News API: {str(e)}")
|
||||
|
||||
async def _testConnection(self) -> bool:
|
||||
"""
|
||||
Tests the connection to the Perplexity API.
|
||||
|
|
|
|||
|
|
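The `callWebOperation` dispatcher added to AiPerplexity in the hunk above routes on `options["operationType"]`. A minimal, table-driven sketch of the same routing, assuming only the handler names that appear in the diff (`callAiWithWebSearch`, `crawl`, `research`, `questions`, `news`); the helper itself is hypothetical.

```python
# Hypothetical, table-driven equivalent of the if/elif dispatch shown above.
OPERATION_HANDLERS = {
    "WEB_SEARCH": "callAiWithWebSearch",
    "WEB_CRAWL": "crawl",
    "WEB_RESEARCH": "research",
    "WEB_QUESTIONS": "questions",
    "WEB_NEWS": "news",
}

async def dispatchWebOperation(connector, modelCall):
    """Resolve the handler by operationType, defaulting to research (as in the diff)."""
    operationType = modelCall.options.get("operationType")
    handlerName = OPERATION_HANDLERS.get(operationType, "research")
    handler = getattr(connector, handlerName)
    return await handler(modelCall)
```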
@@ -5,7 +5,7 @@ import logging
import asyncio
import re
from dataclasses import dataclass
from typing import Optional, List
from typing import Optional, List, Dict
from tavily import AsyncTavilyClient
from modules.shared.configuration import APP_CONFIG
from modules.aicore.aicoreBase import BaseConnectorAi
@@ -88,6 +88,251 @@ class ConnectorWeb(BaseConnectorAi):

return unique_urls

def _intelligentUrlFiltering(self, searchResults: List[WebSearchResult], query: str, maxResults: int) -> List[WebSearchResult]:
"""
Intelligent URL filtering with de-duplication and relevance scoring.

Args:
searchResults: Raw search results from Tavily
query: Original search query for relevance scoring
maxResults: Maximum number of results to return

Returns:
Filtered and deduplicated list of search results
"""
if not searchResults:
return []

# Step 1: Basic de-duplication by URL
seenUrls = set()
uniqueResults = []

for result in searchResults:
# Normalize URL for better deduplication
normalizedUrl = self._normalizeUrl(result.url)
if normalizedUrl not in seenUrls:
seenUrls.add(normalizedUrl)
uniqueResults.append(result)

logger.info(f"After basic deduplication: {len(uniqueResults)} unique URLs from {len(searchResults)} original")

# Step 2: Relevance scoring and filtering
scoredResults = []
queryWords = set(query.lower().split())

for result in uniqueResults:
score = self._calculateRelevanceScore(result, queryWords)
scoredResults.append((score, result))

# Step 3: Sort by relevance score (higher is better)
scoredResults.sort(key=lambda x: x[0], reverse=True)

# Step 4: Take top results
filteredResults = [result for score, result in scoredResults[:maxResults]]

logger.info(f"After intelligent filtering: {len(filteredResults)} results selected from {len(uniqueResults)} unique")

return filteredResults

def _normalizeUrl(self, url: str) -> str:
"""
Normalize URL for better deduplication.
Removes common variations that represent the same content.
"""
if not url:
return url

# Remove trailing slashes
url = url.rstrip('/')

# Remove common query parameters that don't affect content
import urllib.parse
parsed = urllib.parse.urlparse(url)

# Remove common tracking parameters
queryParams = urllib.parse.parse_qs(parsed.query)
filteredParams = {}

for key, values in queryParams.items():
# Keep important parameters, remove tracking ones
if key.lower() not in ['utm_source', 'utm_medium', 'utm_campaign', 'utm_term', 'utm_content',
'fbclid', 'gclid', 'ref', 'source', 'campaign']:
filteredParams[key] = values

# Rebuild query string
filteredQuery = urllib.parse.urlencode(filteredParams, doseq=True)

# Reconstruct URL
normalized = urllib.parse.urlunparse((
parsed.scheme,
parsed.netloc,
parsed.path,
parsed.params,
filteredQuery,
parsed.fragment
))

return normalized
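An illustration of what `_normalizeUrl` does to a tracking-heavy link, as a standalone function with the same allow/deny rules as the method above; the example URL is made up.

```python
from urllib.parse import urlparse, parse_qs, urlencode, urlunparse

TRACKING_PARAMS = {"utm_source", "utm_medium", "utm_campaign", "utm_term",
                   "utm_content", "fbclid", "gclid", "ref", "source", "campaign"}

def normalizeUrl(url: str) -> str:
    """Standalone version of the normalization shown above (same parameter rules)."""
    parsed = urlparse(url.rstrip("/"))
    kept = {k: v for k, v in parse_qs(parsed.query).items() if k.lower() not in TRACKING_PARAMS}
    return urlunparse(parsed._replace(query=urlencode(kept, doseq=True)))

print(normalizeUrl("https://example.com/post/?id=7&utm_source=news&gclid=abc"))
# -> https://example.com/post/?id=7
```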
def _calculateRelevanceScore(self, result: WebSearchResult, queryWords: set) -> float:
"""
Calculate relevance score for a search result.
Higher score means more relevant to the query.
"""
score = 0.0

# Title relevance (most important)
titleWords = set(result.title.lower().split())
titleMatches = len(queryWords.intersection(titleWords))
score += titleMatches * 3.0 # Weight title matches heavily

# URL relevance
urlWords = set(result.url.lower().split('/'))
urlMatches = len(queryWords.intersection(urlWords))
score += urlMatches * 1.5

# Content relevance (if available)
if hasattr(result, 'raw_content') and result.raw_content:
contentWords = set(result.raw_content.lower().split())
contentMatches = len(queryWords.intersection(contentWords))
score += contentMatches * 0.1 # Lower weight for content matches

# Domain authority bonus (simple heuristic)
domain = result.url.split('/')[2] if '/' in result.url else result.url
if any(auth_domain in domain.lower() for auth_domain in
['wikipedia.org', 'github.com', 'stackoverflow.com', 'reddit.com', 'medium.com']):
score += 1.0

# Penalty for very long URLs (often less relevant)
if len(result.url) > 100:
score -= 0.5

return score
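A quick worked example of how the weights above combine, assuming a result titled "FastAPI WebSocket tutorial" hosted at https://stackoverflow.com/questions/fastapi and the query "fastapi websocket" (no raw content, URL under 100 characters).

```python
# Title weight 3.0, URL weight 1.5, content weight 0.1, +1.0 domain bonus, -0.5 long-URL penalty.
query_words = {"fastapi", "websocket"}
title_words = {"fastapi", "websocket", "tutorial"}                        # 2 matches -> 2 * 3.0 = 6.0
url_words = {"https:", "", "stackoverflow.com", "questions", "fastapi"}   # 1 match  -> 1 * 1.5 = 1.5
score = 2 * 3.0 + 1 * 1.5 + 1.0                                           # +1.0 stackoverflow.com bonus
print(score)  # 8.5
```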
async def _optimizeSearchQuery(self, query: str, timeRange: str = None, country: str = None, language: str = None) -> tuple[str, dict]:
"""
Use AI to optimize search query and parameters (from old SubWebResearch).

Args:
query: Original search query
timeRange: Time range filter
country: Country filter
language: Language filter

Returns:
Tuple of (optimized_query, optimized_parameters)
"""
try:
# Create AI prompt for query optimization (from old code)
queryOptimizerPrompt = f"""You are a search query optimizer.

USER QUERY: {query}

Your task: Create a search query and parameters for the USER QUERY given.

RULES:
1. The search query MUST be related to the user query above
2. Extract key terms from the user query
3. Determine appropriate country/language based on the query context
4. Keep search query short (2-6 words)

Return ONLY this JSON format:
{{
"user_prompt": "search query based on user query above",
"country": "Full English country name (ISO-3166; map codes via pycountry/i18n-iso-countries)",
"language": "language_code_or_null",
"topic": "general|news|academic_or_null",
"time_range": "d|w|m|y_or_null",
"selection_strategy": "single|multiple|specific_page",
"selection_criteria": "what URLs to prioritize",
"expected_url_patterns": ["pattern1", "pattern2"],
"estimated_result_count": number
}}"""

# Use AI to optimize the query
from modules.datamodels.datamodelAi import AiCallRequest, AiCallOptions
aiRequest = AiCallRequest(
prompt=queryOptimizerPrompt,
options=AiCallOptions()
)

# Get AI response (this would need to be called through the AI interface)
# For now, return the original query with basic optimization
logger.info(f"AI query optimization requested for: '{query}'")

# Basic optimization fallback
optimizedQuery = query
optimizedParams = {
"time_range": timeRange,
"country": country,
"language": language,
"topic": "general"
}

return optimizedQuery, optimizedParams

except Exception as e:
logger.warning(f"Query optimization failed: {str(e)}, using original query")
return query, {"time_range": timeRange, "country": country, "language": language}
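The method above currently returns the original query as a fallback. A hedged sketch of how the optimizer's JSON answer (the schema in the prompt above) could be folded back over those fallback parameters once the AI call is wired in; the helper name and merge policy are assumptions.

```python
import json

def applyOptimizerResponse(aiResponse: str, query: str, fallback: dict) -> tuple[str, dict]:
    """Merge the optimizer's JSON (schema from the prompt above) over the fallback params."""
    try:
        data = json.loads(aiResponse)
    except json.JSONDecodeError:
        return query, fallback
    optimizedQuery = data.get("user_prompt", query)
    params = dict(fallback)
    for key in ("country", "language", "topic", "time_range"):
        if data.get(key):
            params[key] = data[key]
    return optimizedQuery, params
```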
async def _aiBasedUrlSelection(self, searchResults: List[WebSearchResult], originalQuery: str, maxResults: int) -> List[WebSearchResult]:
"""
Use AI to select the most relevant URLs from search results (from old SubWebResearch).

Args:
searchResults: Raw search results from Tavily
originalQuery: Original user query for context
maxResults: Maximum number of results to return

Returns:
AI-selected and filtered list of search results
"""
try:
if not searchResults:
return []

# Step 1: Basic de-duplication
seenUrls = set()
uniqueResults = []

for result in searchResults:
normalizedUrl = self._normalizeUrl(result.url)
if normalizedUrl not in seenUrls:
seenUrls.add(normalizedUrl)
uniqueResults.append(result)

logger.info(f"After basic deduplication: {len(uniqueResults)} unique URLs from {len(searchResults)} original")

if len(uniqueResults) <= maxResults:
return uniqueResults

# Step 2: AI-based URL selection (from old code)
logger.info(f"AI selecting most relevant {maxResults} URLs from {len(uniqueResults)} unique results")

# Create AI prompt for URL selection (from old code)
urlList = "\n".join([f"{i+1}. {result.url}" for i, result in enumerate(uniqueResults)])
aiPrompt = f"""Select the most relevant URLs from these search results:

{urlList}

Return only the URLs that are most relevant for the user's query: "{originalQuery}"
One URL per line.
"""

# For now, use intelligent filtering as fallback
# In a full implementation, this would call the AI interface
logger.info("Using intelligent filtering as AI selection fallback")

# Use the existing intelligent filtering
filteredResults = self._intelligentUrlFiltering(uniqueResults, originalQuery, maxResults)

logger.info(f"AI-based selection completed: {len(filteredResults)} results selected")
return filteredResults

except Exception as e:
logger.warning(f"AI-based URL selection failed: {str(e)}, using intelligent filtering")
return self._intelligentUrlFiltering(searchResults, originalQuery, maxResults)
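Today the method falls back to `_intelligentUrlFiltering`. If the AI call is added later, a response of one URL per line (as the prompt above requests) could be mapped back onto the result objects roughly like this; illustrative only, the helper is not part of the diff.

```python
def mapSelectedUrls(aiResponse: str, uniqueResults, maxResults: int):
    """Keep the results whose URLs the AI listed, preserving the AI's order."""
    selected = [line.strip() for line in aiResponse.splitlines() if line.strip().startswith("http")]
    byUrl = {result.url: result for result in uniqueResults}
    return [byUrl[url] for url in selected if url in byUrl][:maxResults]
```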
def getModels(self) -> List[AiModel]:
"""Get all available Tavily models."""
return [
@@ -104,7 +349,7 @@ class ConnectorWeb(BaseConnectorAi):
speedRating=9, # Very fast for URL discovery
qualityRating=9, # Excellent URL discovery quality
# capabilities removed (not used in business logic)
functionCall=self.search,
functionCall=self.callWebOperation,
priority=PriorityEnum.BALANCED,
processingMode=ProcessingModeEnum.BASIC,
operationTypes=createOperationTypeRatings(
@@ -130,12 +375,12 @@ class ConnectorWeb(BaseConnectorAi):
speedRating=7, # Good for content extraction
qualityRating=9, # Excellent content extraction quality
# capabilities removed (not used in business logic)
functionCall=self.crawl,
functionCall=self.callWebOperation,
priority=PriorityEnum.BALANCED,
processingMode=ProcessingModeEnum.BASIC,
operationTypes=createOperationTypeRatings(
(OperationTypeEnum.WEB_RESEARCH, 3),
(OperationTypeEnum.WEB_CRAWL, 10),
(OperationTypeEnum.WEB_RESEARCH, 3),
(OperationTypeEnum.WEB_NEWS, 3),
(OperationTypeEnum.WEB_QUESTIONS, 2)
),
@@ -155,7 +400,7 @@ class ConnectorWeb(BaseConnectorAi):
speedRating=7, # Good for combined search+extract
qualityRating=8, # Good quality for structured data
# capabilities removed (not used in business logic)
functionCall=self.scrape,
functionCall=self.callWebOperation,
priority=PriorityEnum.BALANCED,
processingMode=ProcessingModeEnum.BASIC,
operationTypes=createOperationTypeRatings(
@@ -190,28 +435,73 @@ class ConnectorWeb(BaseConnectorAi):

# Standardized method using AiModelCall/AiModelResponse pattern

async def callWebOperation(self, modelCall) -> "AiModelResponse":
"""
Universal web operation handler that distributes to the correct method
based on the operationType from AiCallOptions.
"""
try:
options = modelCall.options
operationType = options.get("operationType")

if operationType == "WEB_SEARCH":
return await self.search(modelCall)
elif operationType == "WEB_CRAWL":
return await self.crawl(modelCall)
elif operationType in ["WEB_RESEARCH", "WEB_QUESTIONS", "WEB_NEWS"]:
return await self.research(modelCall)
else:
# Fallback to search for unknown operation types
return await self.search(modelCall)

except Exception as e:
return AiModelResponse(
content="",
success=False,
error=str(e)
)

async def search(self, modelCall) -> "AiModelResponse":
"""Search using standardized AiModelCall/AiModelResponse pattern"""
try:
# Extract parameters from modelCall
query = modelCall.messages[0]["content"] if modelCall.messages else ""
prompt_content = modelCall.messages[0]["content"] if modelCall.messages else ""
options = modelCall.options

raw_results = await self._search(
query=query,
max_results=options.get("max_results", 5),
search_depth=options.get("search_depth"),
time_range=options.get("time_range"),
topic=options.get("topic"),
include_domains=options.get("include_domains"),
exclude_domains=options.get("exclude_domains"),
language=options.get("language"),
include_answer=options.get("include_answer"),
include_raw_content=options.get("include_raw_content"),
# Parse unified prompt JSON format
import json
promptData = json.loads(prompt_content)

# Extract parameters from unified prompt JSON
query = promptData.get("searchPrompt", prompt_content)
maxResults = promptData.get("maxResults", 5)
timeRange = promptData.get("timeRange")
country = promptData.get("country")
language = promptData.get("language")

# Use basic search depth for web search operations
searchDepth = "basic"

# Step 1: AI Query Optimization (from old SubWebResearch)
optimizedQuery, optimizedParams = await self._optimizeSearchQuery(query, timeRange, country, language)

# Step 2: Get more results than requested to allow for intelligent filtering
searchResults = await self._search(
query=optimizedQuery,
max_results=min(maxResults * 3, 30), # Get more results for better AI selection
search_depth=searchDepth,
time_range=optimizedParams.get("time_range", timeRange),
country=optimizedParams.get("country", country),
language=optimizedParams.get("language", language),
include_answer=options.get("include_answer", True),
include_raw_content=options.get("include_raw_content", True),
)

# Step 3: AI-based URL selection and intelligent filtering
filteredResults = await self._aiBasedUrlSelection(searchResults, query, maxResults)

# Convert to JSON string
results_json = {
resultsJson = {
"query": query,
"results": [
{
@@ -219,20 +509,22 @@ class ConnectorWeb(BaseConnectorAi):
"url": result.url,
"content": getattr(result, 'raw_content', None)
}
for result in raw_results
for result in filteredResults
],
"total_count": len(raw_results)
"total_count": len(filteredResults),
"original_count": len(searchResults),
"filtered_count": len(searchResults) - len(filteredResults)
}

import json
content = json.dumps(results_json, indent=2)
content = json.dumps(resultsJson, indent=2)

return AiModelResponse(
content=content,
success=True,
metadata={
"total_count": len(raw_results),
"search_depth": options.get("search_depth", "basic")
"total_count": len(filteredResults),
"search_depth": searchDepth
}
)
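A minimal sketch of driving the rewritten `ConnectorWeb.search` above through the standardized call objects. `AiModelCall(messages=..., options=...)` follows the usage elsewhere in this commit; whether additional fields (such as a model reference) are required is an assumption, and the query is made up.

```python
import json
from modules.datamodels.datamodelAi import AiModelCall

async def runSearch(connector):
    """Build a unified search prompt and route it through callWebOperation."""
    prompt = json.dumps({"searchPrompt": "python 3.13 release notes", "maxResults": 3})
    modelCall = AiModelCall(messages=[{"role": "user", "content": prompt}],
                            options={"operationType": "WEB_SEARCH"})
    response = await connector.callWebOperation(modelCall)
    if response.success:
        print(json.loads(response.content)["total_count"])
```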
@@ -247,49 +539,214 @@ class ConnectorWeb(BaseConnectorAi):
|
|||
"""Crawl using standardized AiModelCall/AiModelResponse pattern"""
|
||||
try:
|
||||
# Extract parameters from modelCall
|
||||
promptContent = modelCall.messages[0]["content"] if modelCall.messages else ""
|
||||
options = modelCall.options
|
||||
urls = options.get("urls", [])
|
||||
|
||||
# If no URLs provided, try to extract URLs from the prompt
|
||||
if not urls and modelCall.messages:
|
||||
prompt = modelCall.messages[0]["content"] if modelCall.messages else ""
|
||||
urls = self._extractUrlsFromPrompt(prompt)
|
||||
# Parse unified prompt JSON format
|
||||
import json
|
||||
promptData = json.loads(promptContent)
|
||||
|
||||
# Extract parameters from unified prompt JSON
|
||||
urls = promptData.get("urls", [])
|
||||
extractDepth = promptData.get("extractDepth", "advanced")
|
||||
formatType = promptData.get("format", "markdown")
|
||||
|
||||
if not urls:
|
||||
return AiModelResponse(
|
||||
content="No URLs provided for crawling",
|
||||
success=False,
|
||||
error="No URLs found in options or prompt"
|
||||
error="No URLs found in prompt data"
|
||||
)
|
||||
|
||||
raw_results = await self._crawl(
|
||||
rawResults = await self._crawl(
|
||||
urls,
|
||||
extract_depth=options.get("extract_depth"),
|
||||
format=options.get("format"),
|
||||
extract_depth=extractDepth,
|
||||
format=formatType,
|
||||
)
|
||||
|
||||
# Convert to JSON string
|
||||
results_json = {
|
||||
resultsJson = {
|
||||
"urls": urls,
|
||||
"results": [
|
||||
{
|
||||
"url": result.url,
|
||||
"content": result.content
|
||||
"title": getattr(result, 'title', ''),
|
||||
"content": result.content,
|
||||
"extractedAt": getattr(result, 'extracted_at', '')
|
||||
}
|
||||
for result in raw_results
|
||||
for result in rawResults
|
||||
],
|
||||
"total_count": len(raw_results)
|
||||
"total_count": len(rawResults)
|
||||
}
|
||||
|
||||
import json
|
||||
content = json.dumps(results_json, indent=2)
|
||||
content = json.dumps(resultsJson, indent=2)
|
||||
|
||||
return AiModelResponse(
|
||||
content=content,
|
||||
success=True,
|
||||
metadata={
|
||||
"total_count": len(raw_results),
|
||||
"extract_depth": options.get("extract_depth", "basic")
|
||||
"total_count": len(rawResults),
|
||||
"urls_processed": len(urls)
|
||||
}
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
return AiModelResponse(
|
||||
content="",
|
||||
success=False,
|
||||
error=str(e)
|
||||
)
|
||||
|
||||
async def research(self, modelCall) -> "AiModelResponse":
|
||||
"""
|
||||
Handle WEB_RESEARCH, WEB_QUESTIONS, WEB_NEWS operations using search + crawl combination.
|
||||
Single method for all three operation types with different standard settings.
|
||||
"""
|
||||
try:
|
||||
# Extract parameters from modelCall
|
||||
promptContent = modelCall.messages[0]["content"] if modelCall.messages else ""
|
||||
options = modelCall.options
|
||||
operationType = options.get("operationType")
|
||||
|
||||
# Parse unified prompt JSON format
|
||||
import json
|
||||
promptData = json.loads(promptContent)
|
||||
|
||||
# Extract parameters based on operation type
|
||||
if operationType == "WEB_RESEARCH":
|
||||
query = promptData.get("researchPrompt", promptContent)
|
||||
maxResults = promptData.get("maxResults", 8)
|
||||
searchDepth = "basic"
|
||||
timeRange = promptData.get("timeRange")
|
||||
country = promptData.get("country")
|
||||
language = promptData.get("language")
|
||||
topic = "general"
|
||||
|
||||
elif operationType == "WEB_QUESTIONS":
|
||||
query = promptData.get("question", promptContent)
|
||||
maxResults = promptData.get("maxResults", 6)
|
||||
searchDepth = "basic"
|
||||
timeRange = promptData.get("timeRange")
|
||||
country = promptData.get("country")
|
||||
language = promptData.get("language")
|
||||
topic = "general"
|
||||
|
||||
elif operationType == "WEB_NEWS":
|
||||
query = promptData.get("newsPrompt", promptContent)
|
||||
maxResults = promptData.get("maxResults", 10)
|
||||
searchDepth = "basic"
|
||||
timeRange = promptData.get("timeRange", "w") # Default to week for news
|
||||
country = promptData.get("country")
|
||||
language = promptData.get("language")
|
||||
topic = "news"
|
||||
|
||||
else:
|
||||
# Fallback to research settings
|
||||
query = promptData.get("researchPrompt", promptContent)
|
||||
maxResults = promptData.get("maxResults", 5)
|
||||
searchDepth = "basic"
|
||||
timeRange = promptData.get("timeRange")
|
||||
country = promptData.get("country")
|
||||
language = promptData.get("language")
|
||||
topic = "general"
|
||||
|
||||
logger.info(f"Tavily {operationType} operation: query='{query}', maxResults={maxResults}, topic={topic}")
|
||||
|
||||
# Step 1: Search for relevant URLs
|
||||
searchResults = await self._search(
|
||||
query=query,
|
||||
max_results=maxResults * 2, # Get more for better selection
|
||||
search_depth=searchDepth,
|
||||
time_range=timeRange,
|
||||
country=country,
|
||||
language=language,
|
||||
topic=topic,
|
||||
include_answer=True,
|
||||
include_raw_content=True
|
||||
)
|
||||
|
||||
if not searchResults:
|
||||
return AiModelResponse(
|
||||
content="No search results found",
|
||||
success=False,
|
||||
error="No relevant URLs found for the query"
|
||||
)
|
||||
|
||||
# Step 2: AI-based URL selection
|
||||
selectedResults = await self._aiBasedUrlSelection(searchResults, query, maxResults)
|
||||
|
||||
if not selectedResults:
|
||||
return AiModelResponse(
|
||||
content="No relevant URLs selected",
|
||||
success=False,
|
||||
error="AI could not select relevant URLs"
|
||||
)
|
||||
|
||||
# Step 3: Crawl selected URLs for content
|
||||
urlsToCrawl = [result.url for result in selectedResults]
|
||||
crawlResults = await self._crawl(
|
||||
urls=urlsToCrawl,
|
||||
extract_depth="advanced",
|
||||
format="markdown"
|
||||
)
|
||||
|
||||
# Step 4: Combine search and crawl results
|
||||
combinedResults = []
|
||||
for searchResult in selectedResults:
|
||||
# Find corresponding crawl result
|
||||
crawlResult = next((cr for cr in crawlResults if cr.url == searchResult.url), None)
|
||||
|
||||
combinedResult = {
|
||||
"title": searchResult.title,
|
||||
"url": searchResult.url,
|
||||
"summary": getattr(searchResult, 'raw_content', ''),
|
||||
"content": crawlResult.content if crawlResult else '',
|
||||
"extractedAt": getattr(crawlResult, 'extracted_at', '') if crawlResult else ''
|
||||
}
|
||||
combinedResults.append(combinedResult)
|
||||
|
||||
# Step 5: Format response based on operation type
|
||||
if operationType == "WEB_RESEARCH":
|
||||
responseData = {
|
||||
"query": query,
|
||||
"research_results": combinedResults,
|
||||
"total_count": len(combinedResults),
|
||||
"operation_type": "research"
|
||||
}
|
||||
elif operationType == "WEB_QUESTIONS":
|
||||
responseData = {
|
||||
"question": query,
|
||||
"answer_sources": combinedResults,
|
||||
"total_count": len(combinedResults),
|
||||
"operation_type": "questions"
|
||||
}
|
||||
elif operationType == "WEB_NEWS":
|
||||
responseData = {
|
||||
"news_query": query,
|
||||
"articles": combinedResults,
|
||||
"total_count": len(combinedResults),
|
||||
"operation_type": "news"
|
||||
}
|
||||
else:
|
||||
responseData = {
|
||||
"query": query,
|
||||
"results": combinedResults,
|
||||
"total_count": len(combinedResults),
|
||||
"operation_type": operationType
|
||||
}
|
||||
|
||||
import json
|
||||
content = json.dumps(responseData, indent=2)
|
||||
|
||||
return AiModelResponse(
|
||||
content=content,
|
||||
success=True,
|
||||
metadata={
|
||||
"total_count": len(combinedResults),
|
||||
"urls_searched": len(searchResults),
|
||||
"urls_crawled": len(crawlResults),
|
||||
"operation_type": operationType
|
||||
}
|
||||
)
|
||||
|
||||
|
|
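The `research()` path above returns the per-operation JSON inside `AiModelResponse.content`. A short sketch of unpacking it on the caller side for a WEB_RESEARCH call; key names are taken from the diff, the helper itself is illustrative.

```python
import json

def summarizeResearchResponse(response) -> list[str]:
    """List the source URLs from a WEB_RESEARCH response produced above."""
    if not response.success:
        return []
    data = json.loads(response.content)
    return [item["url"] for item in data.get("research_results", [])]
```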
@@ -576,3 +1033,262 @@ class ConnectorWeb(BaseConnectorAi):
|
|||
await asyncio.sleep(retryDelay)
|
||||
else:
|
||||
raise Exception(f"Crawl failed after {maxRetries + 1} attempts: {str(e)}")
|
||||
|
||||
async def comprehensiveWebResearch(self, request: WebResearchRequest) -> WebResearchResult:
|
||||
"""
|
||||
Perform comprehensive web research using Tavily's search and extract capabilities.
|
||||
This method orchestrates the full web research workflow.
|
||||
"""
|
||||
try:
|
||||
logger.info(f"COMPREHENSIVE WEB RESEARCH STARTED")
|
||||
logger.info(f"User Query: {request.user_prompt}")
|
||||
logger.info(f"Max Results: {request.max_results}, Max Pages: {request.max_pages}")
|
||||
|
||||
# Global URL index to track all processed URLs across the entire research session
|
||||
global_processed_urls = set()
|
||||
|
||||
# Step 1: Find relevant websites - either provided URLs or AI-determined main URLs
|
||||
logger.info(f"=== STEP 1: INITIAL MAIN URLS LIST ===")
|
||||
|
||||
if request.urls:
|
||||
# Use provided URLs as initial main URLs
|
||||
websites = request.urls
|
||||
logger.info(f"Using provided URLs ({len(websites)}):")
|
||||
for i, url in enumerate(websites, 1):
|
||||
logger.info(f" {i}. {url}")
|
||||
else:
|
||||
# Use AI to determine main URLs based on user's intention
|
||||
logger.info(f"AI analyzing user intent: '{request.user_prompt}'")
|
||||
|
||||
# Use basic search parameters
|
||||
search_query = request.user_prompt
|
||||
search_depth = request.search_depth or "basic"
|
||||
time_range = request.time_range
|
||||
topic = request.topic
|
||||
country = request.country
|
||||
language = request.language
|
||||
max_results = request.max_results
|
||||
|
||||
logger.info(f"Using search parameters: query='{search_query}', depth={search_depth}, time_range={time_range}, topic={topic}")
|
||||
|
||||
# Perform web search
|
||||
search_results = await self._search(
|
||||
query=search_query,
|
||||
max_results=max_results,
|
||||
search_depth=search_depth,
|
||||
time_range=time_range,
|
||||
topic=topic,
|
||||
country=country,
|
||||
language=language,
|
||||
include_answer=True,
|
||||
include_raw_content=True
|
||||
)
|
||||
|
||||
# Extract URLs from search results
|
||||
websites = [result.url for result in search_results]
|
||||
logger.info(f"Found {len(websites)} URLs from search")
|
||||
|
||||
# AI-based URL selection and deduplication
|
||||
if len(websites) > request.max_pages:
|
||||
logger.info(f"AI selecting most relevant {request.max_pages} URLs from {len(websites)} found")
|
||||
|
||||
# For now, just take the first max_pages URLs
|
||||
selected_indices = list(range(min(request.max_pages, len(websites))))
|
||||
selected_websites = [websites[i] for i in selected_indices]
|
||||
|
||||
# Remove duplicates while preserving order
|
||||
seen = set()
|
||||
unique_websites = []
|
||||
for url in selected_websites:
|
||||
if url not in seen:
|
||||
seen.add(url)
|
||||
unique_websites.append(url)
|
||||
|
||||
websites = unique_websites
|
||||
|
||||
logger.info(f"After AI selection deduplication: {len(websites)} unique URLs")
|
||||
logger.info(f"AI selected {len(websites)} main URLs (after deduplication):")
|
||||
for i, url in enumerate(websites, 1):
|
||||
logger.info(f" {i}. {url}")
|
||||
|
||||
# Step 2: Smart website selection using AI interface
|
||||
logger.info(f"=== STEP 2: FILTERED URL LIST BY USER PROMPT'S INTENTION ===")
|
||||
logger.info(f"AI analyzing {len(websites)} URLs for relevance to: '{request.user_prompt}'")
|
||||
|
||||
# For now, just use all websites
|
||||
selected_websites = websites
|
||||
|
||||
logger.debug(f"AI selected {len(selected_websites)} most relevant URLs:")
|
||||
for i, url in enumerate(selected_websites, 1):
|
||||
logger.debug(f" {i}. {url}")
|
||||
|
||||
# Step 3+4+5: Recursive crawling with configurable depth
|
||||
# Get configuration parameters
|
||||
max_depth = int(APP_CONFIG.get("Web_Research_MAX_DEPTH", "2"))
|
||||
max_links_per_domain = int(APP_CONFIG.get("Web_Research_MAX_LINKS_PER_DOMAIN", "4"))
|
||||
crawl_timeout_minutes = int(APP_CONFIG.get("Web_Research_CRAWL_TIMEOUT_MINUTES", "10"))
|
||||
|
||||
# Use the configured max_depth or the request's search_depth, whichever is smaller
|
||||
effective_depth = min(max_depth, request.search_depth if isinstance(request.search_depth, int) else 2)
|
||||
|
||||
logger.info(f"=== STEP 3+4+5: RECURSIVE CRAWLING ===")
|
||||
logger.info(f"Starting recursive crawl with depth {effective_depth}")
|
||||
logger.info(f"Max links per domain: {max_links_per_domain}")
|
||||
logger.info(f"Crawl timeout: {crawl_timeout_minutes} minutes")
|
||||
|
||||
# Perform recursive crawling
|
||||
all_content = await self._crawlRecursively(
|
||||
urls=selected_websites,
|
||||
max_depth=effective_depth,
|
||||
extract_depth=request.extract_depth,
|
||||
max_per_domain=max_links_per_domain,
|
||||
global_processed_urls=global_processed_urls
|
||||
)
|
||||
|
||||
logger.info(f"Recursive crawl completed: {len(all_content)} total pages crawled")
|
||||
|
||||
# Step 6: AI analysis of all collected content
|
||||
logger.info(f"=== STEP 6: AI ANALYSIS ===")
|
||||
logger.info(f"Analyzing {len(all_content)} websites with AI")
|
||||
|
||||
# Create a basic analysis result
|
||||
analysis_result = f"Web research completed for: {request.user_prompt}\n\n"
|
||||
analysis_result += f"Analyzed {len(all_content)} websites:\n"
|
||||
for url, content in all_content.items():
|
||||
analysis_result += f"- {url}: {len(content)} characters\n"
|
||||
|
||||
# Create result documents
|
||||
import time
|
||||
result_documents = []
|
||||
|
||||
# Main research result
|
||||
main_document = {
|
||||
"documentName": f"web_research_{int(time.time())}.json",
|
||||
"documentData": {
|
||||
"user_prompt": request.user_prompt,
|
||||
"websites_analyzed": len(all_content),
|
||||
"additional_links_found": 0, # Would be calculated from crawl results
|
||||
"analysis_result": analysis_result,
|
||||
"sources": [{"title": f"Website {i+1}", "url": url} for i, url in enumerate(all_content.keys())],
|
||||
"additional_links": [],
|
||||
"debug_info": {
|
||||
"total_urls_processed": len(global_processed_urls),
|
||||
"crawl_depth": effective_depth,
|
||||
"extract_depth": request.extract_depth
|
||||
}
|
||||
},
|
||||
"mimeType": "application/json"
|
||||
}
|
||||
result_documents.append(main_document)
|
||||
|
||||
# Individual website content documents
|
||||
for i, (url, content) in enumerate(all_content.items()):
|
||||
content_document = {
|
||||
"documentName": f"website_content_{i+1}.md",
|
||||
"documentData": content,
|
||||
"mimeType": "text/markdown"
|
||||
}
|
||||
result_documents.append(content_document)
|
||||
|
||||
logger.info(f"WEB RESEARCH COMPLETED SUCCESSFULLY")
|
||||
logger.info(f"Generated {len(result_documents)} result documents")
|
||||
|
||||
return WebResearchResult(
|
||||
success=True,
|
||||
documents=result_documents
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error in comprehensive web research: {str(e)}")
|
||||
return WebResearchResult(
|
||||
success=False,
|
||||
error=str(e),
|
||||
documents=[]
|
||||
)
|
||||
|
||||
async def _crawlRecursively(self, urls: List[str], max_depth: int, extract_depth: str = "advanced", max_per_domain: int = 10, global_processed_urls: Optional[set] = None) -> Dict[str, str]:
|
||||
"""
|
||||
Recursively crawl URLs up to specified depth.
|
||||
This is a simplified version of the recursive crawling logic.
|
||||
"""
|
||||
logger.info(f"Starting recursive crawl: {len(urls)} starting URLs, max_depth={max_depth}")
|
||||
|
||||
# URL index to track all processed URLs (local + global)
|
||||
processed_urls = set()
|
||||
if global_processed_urls is not None:
|
||||
processed_urls = global_processed_urls
|
||||
logger.info(f"Using global URL index with {len(processed_urls)} already processed URLs")
|
||||
else:
|
||||
logger.info("Using local URL index for this crawl session")
|
||||
|
||||
all_content = {}
|
||||
current_level_urls = urls.copy()
|
||||
|
||||
try:
|
||||
for depth in range(1, max_depth + 1):
|
||||
logger.info(f"=== DEPTH LEVEL {depth}/{max_depth} ===")
|
||||
logger.info(f"Processing {len(current_level_urls)} URLs at depth {depth}")
|
||||
|
||||
# URLs found at this level (for next iteration)
|
||||
next_level_urls = []
|
||||
|
||||
for url in current_level_urls:
|
||||
# Normalize URL for duplicate checking
|
||||
normalized_url = self._normalizeUrl(url)
|
||||
if normalized_url in processed_urls:
|
||||
logger.debug(f"URL {url} (normalized: {normalized_url}) already processed, skipping")
|
||||
continue
|
||||
|
||||
try:
|
||||
logger.info(f"Processing URL at depth {depth}: {url}")
|
||||
|
||||
# Extract content from URL
|
||||
crawl_results = await self._crawl([url], extract_depth=extract_depth, format="markdown")
|
||||
|
||||
if crawl_results and crawl_results[0].content:
|
||||
content = crawl_results[0].content
|
||||
all_content[url] = content
|
||||
processed_urls.add(normalized_url)
|
||||
logger.info(f"✓ Successfully processed {url}: {len(content)} chars")
|
||||
|
||||
# For simplicity, we'll skip finding sub-links in this implementation
|
||||
# In a full implementation, you would extract links and add them to next_level_urls
|
||||
|
||||
else:
|
||||
logger.warning(f"✗ No content extracted from {url}")
|
||||
processed_urls.add(normalized_url)
|
||||
|
||||
except Exception as e:
|
||||
logger.warning(f"✗ Failed to process URL {url} at depth {depth}: {e}")
|
||||
processed_urls.add(normalized_url)
|
||||
|
||||
# Prepare for next iteration
|
||||
current_level_urls = next_level_urls
|
||||
logger.info(f"Depth {depth} completed. Found {len(next_level_urls)} URLs for next level")
|
||||
|
||||
# Stop if no more URLs to process
|
||||
if not current_level_urls:
|
||||
logger.info(f"No more URLs found at depth {depth}, stopping recursion")
|
||||
break
|
||||
|
||||
logger.info(f"Recursive crawl completed: {len(all_content)} total pages crawled")
|
||||
return all_content
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Crawling failed with error: {e}, returning partial results: {len(all_content)} pages crawled so far")
|
||||
return all_content
|
||||
|
||||
def _normalizeUrl(self, url: str) -> str:
|
||||
"""Normalize URL to handle variations that should be considered duplicates."""
|
||||
if not url:
|
||||
return url
|
||||
|
||||
# Remove trailing slashes and fragments
|
||||
url = url.rstrip('/')
|
||||
if '#' in url:
|
||||
url = url.split('#')[0]
|
||||
|
||||
# Handle common URL variations
|
||||
url = url.replace('http://', 'https://') # Normalize protocol
|
||||
|
||||
return url
|
||||
|
|
|
|||
|
|
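The recursive crawl in the hunk above reads its limits from APP_CONFIG (Web_Research_MAX_DEPTH, Web_Research_MAX_LINKS_PER_DOMAIN, Web_Research_CRAWL_TIMEOUT_MINUTES). A small sketch of centralizing those reads with the same defaults as the diff; the dataclass and helper are assumptions, not part of the commit.

```python
from dataclasses import dataclass
from modules.shared.configuration import APP_CONFIG

@dataclass
class CrawlLimits:
    maxDepth: int = 2
    maxLinksPerDomain: int = 4
    timeoutMinutes: int = 10

def loadCrawlLimits() -> CrawlLimits:
    """Read the crawl limits used by comprehensiveWebResearch, with the diff's defaults."""
    return CrawlLimits(
        maxDepth=int(APP_CONFIG.get("Web_Research_MAX_DEPTH", "2")),
        maxLinksPerDomain=int(APP_CONFIG.get("Web_Research_MAX_LINKS_PER_DOMAIN", "4")),
        timeoutMinutes=int(APP_CONFIG.get("Web_Research_CRAWL_TIMEOUT_MINUTES", "10")),
    )
```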
@ -649,427 +649,6 @@ class AiObjects:
|
|||
errorCount=1
|
||||
)
|
||||
|
||||
# Web functionality methods - Now use standardized AiModelCall/AiModelResponse pattern
|
||||
async def searchWebsites(self, query: str, maxResults: int = 5, **kwargs) -> str:
|
||||
"""Search for websites using Tavily with standardized pattern."""
|
||||
from modules.datamodels.datamodelAi import AiModelCall
|
||||
|
||||
modelCall = AiModelCall(
|
||||
messages=[{"role": "user", "content": query}],
|
||||
options={"max_results": maxResults, **kwargs}
|
||||
)
|
||||
|
||||
# Get Tavily connector from registry
|
||||
tavilyConnector = modelRegistry.getConnectorForModel("tavily_search")
|
||||
if not tavilyConnector:
|
||||
raise ValueError("Tavily connector not available")
|
||||
|
||||
result = await tavilyConnector.search(modelCall)
|
||||
return result.content if result.success else ""
|
||||
|
||||
async def crawlWebsites(self, urls: List[str], extractDepth: str = "advanced", format: str = "markdown") -> str:
|
||||
"""Crawl websites using Tavily with standardized pattern."""
|
||||
from modules.datamodels.datamodelAi import AiModelCall
|
||||
|
||||
modelCall = AiModelCall(
|
||||
messages=[{"role": "user", "content": "crawl websites"}],
|
||||
options={"urls": urls, "extract_depth": extractDepth, "format": format}
|
||||
)
|
||||
|
||||
# Get Tavily connector from registry
|
||||
tavilyConnector = modelRegistry.getConnectorForModel("tavily_crawl")
|
||||
if not tavilyConnector:
|
||||
raise ValueError("Tavily connector not available")
|
||||
|
||||
result = await tavilyConnector.crawl(modelCall)
|
||||
return result.content if result.success else ""
|
||||
|
||||
async def extractContent(self, urls: List[str], extractDepth: str = "advanced", format: str = "markdown") -> Dict[str, str]:
|
||||
"""Extract content from URLs and return as dictionary."""
|
||||
import json
|
||||
crawlResults = await self.crawlWebsites(urls, extractDepth, format)
|
||||
|
||||
# Parse JSON response and extract content
|
||||
try:
|
||||
data = json.loads(crawlResults)
|
||||
return {result["url"]: result["content"] for result in data.get("results", [])}
|
||||
except (json.JSONDecodeError, KeyError):
|
||||
return {}
|
||||
|
||||
# Core Web Tools - Clean interface for web operations
|
||||
async def readPage(self, url: str, extractDepth: str = "advanced") -> Optional[str]:
|
||||
"""Read a single web page and return its content (HTML/Markdown)."""
|
||||
logger.debug(f"Reading page: {url}")
|
||||
try:
|
||||
# URL encode the URL to handle spaces and special characters
|
||||
from urllib.parse import quote, urlparse, urlunparse
|
||||
parsed = urlparse(url)
|
||||
encodedUrl = urlunparse((
|
||||
parsed.scheme,
|
||||
parsed.netloc,
|
||||
parsed.path,
|
||||
parsed.params,
|
||||
parsed.query,
|
||||
parsed.fragment
|
||||
))
|
||||
|
||||
# Manually encode query parameters to handle spaces
|
||||
if parsed.query:
|
||||
encodedQuery = quote(parsed.query, safe='=&')
|
||||
encodedUrl = urlunparse((
|
||||
parsed.scheme,
|
||||
parsed.netloc,
|
||||
parsed.path,
|
||||
parsed.params,
|
||||
encodedQuery,
|
||||
parsed.fragment
|
||||
))
|
||||
|
||||
logger.debug(f"URL encoded: {url} -> {encodedUrl}")
|
||||
|
||||
content = await self.extractContent([encodedUrl], extractDepth, "markdown")
|
||||
result = content.get(encodedUrl)
|
||||
if result:
|
||||
logger.debug(f"Successfully read page {encodedUrl}: {len(result)} chars")
|
||||
else:
|
||||
logger.warning(f"No content returned for page {encodedUrl}")
|
||||
return result
|
||||
except Exception as e:
|
||||
logger.warning(f"Failed to read page {url}: {e}")
|
||||
return None
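# Usage sketch (illustrative): readPage re-encodes query strings with spaces
# before crawling and returns None on failure. The URL below is hypothetical.
#
#     async def demo_read(aiObjects):
#         content = await aiObjects.readPage("https://example.com/search?q=solar panels")
#         if content is None:
#             print("page could not be read")
#         else:
#             print(content[:300])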
|
||||
|
||||
async def getUrlsFromPage(self, url: str, extractDepth: str = "advanced") -> List[str]:
|
||||
"""Get all URLs from a web page, with redundancies removed."""
|
||||
try:
|
||||
content = await self.readPage(url, extractDepth)
|
||||
if not content:
|
||||
return []
|
||||
|
||||
links = self._extractLinksFromContent(content, url)
|
||||
# Remove duplicates while preserving order
|
||||
seen = set()
|
||||
uniqueLinks = []
|
||||
for link in links:
|
||||
if link not in seen:
|
||||
seen.add(link)
|
||||
uniqueLinks.append(link)
|
||||
|
||||
logger.debug(f"Extracted {len(uniqueLinks)} unique URLs from {url}")
|
||||
return uniqueLinks
|
||||
|
||||
except Exception as e:
|
||||
logger.warning(f"Failed to get URLs from page {url}: {e}")
|
||||
return []
|
||||
|
||||
def filterUrlsOnlyPages(self, urls: List[str], maxPerDomain: int = 10) -> List[str]:
|
||||
"""Filter URLs to get only links for pages to follow (no images, etc.)."""
|
||||
from urllib.parse import urlparse
|
||||
|
||||
def _isHtmlCandidate(url: str) -> bool:
|
||||
lower = url.lower()
|
||||
blocked = ('.jpg', '.jpeg', '.png', '.gif', '.svg', '.webp', '.ico', '.bmp',
|
||||
'.mp4', '.mp3', '.avi', '.mov', '.mkv',
|
||||
'.pdf', '.zip', '.rar', '.7z', '.tar', '.gz',
|
||||
'.css', '.js', '.woff', '.woff2', '.ttf', '.eot')
|
||||
return not lower.endswith(blocked)
|
||||
|
||||
# Group by domain
|
||||
domainLinks = {}
|
||||
for link in urls:
|
||||
domain = urlparse(link).netloc
|
||||
if domain not in domainLinks:
|
||||
domainLinks[domain] = []
|
||||
domainLinks[domain].append(link)
|
||||
|
||||
# Filter and cap per domain
|
||||
filteredLinks = []
|
||||
for domain, domainLinkList in domainLinks.items():
|
||||
seen = set()
|
||||
domainFiltered = []
|
||||
|
||||
for link in domainLinkList:
|
||||
if link in seen:
|
||||
continue
|
||||
if not _isHtmlCandidate(link):
|
||||
continue
|
||||
seen.add(link)
|
||||
domainFiltered.append(link)
|
||||
if len(domainFiltered) >= maxPerDomain:
|
||||
break
|
||||
|
||||
filteredLinks.extend(domainFiltered)
|
||||
logger.debug(f"Domain {domain}: {len(domainLinkList)} -> {len(domainFiltered)} links")
|
||||
|
||||
return filteredLinks
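# Example (illustrative): filterUrlsOnlyPages drops asset links and caps the
# number of URLs kept per domain. The input list is made up.
#
#     urls = [
#         "https://example.com/docs", "https://example.com/logo.png",
#         "https://example.com/blog", "https://other.org/paper.pdf",
#     ]
#     kept = aiObjects.filterUrlsOnlyPages(urls, maxPerDomain=10)
#     # kept -> ["https://example.com/docs", "https://example.com/blog"]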
|
||||
|
||||
def _extractLinksFromContent(self, content: str, baseUrl: str) -> List[str]:
|
||||
"""Extract links from HTML/Markdown content."""
|
||||
try:
|
||||
import re
|
||||
from urllib.parse import urljoin, urlparse, quote, urlunparse
|
||||
|
||||
def _cleanUrl(url: str) -> str:
|
||||
"""Clean and encode URL to remove spaces and invalid characters."""
|
||||
# Remove quotes and extra spaces
|
||||
url = url.strip().strip('"\'')
|
||||
|
||||
# If it's a relative URL, make it absolute first
|
||||
if not url.startswith(('http://', 'https://')):
|
||||
url = urljoin(baseUrl, url)
|
||||
|
||||
# Parse and re-encode the URL properly
|
||||
parsed = urlparse(url)
|
||||
if parsed.query:
|
||||
# Encode query parameters properly
|
||||
encodedQuery = quote(parsed.query, safe='=&')
|
||||
url = urlunparse((
|
||||
parsed.scheme,
|
||||
parsed.netloc,
|
||||
parsed.path,
|
||||
parsed.params,
|
||||
encodedQuery,
|
||||
parsed.fragment
|
||||
))
|
||||
|
||||
return url
|
||||
|
||||
links = []
|
||||
|
||||
# Extract HTML links: <a href="url"> format
|
||||
htmlLinkPattern = r'<a[^>]+href=["\']([^"\']+)["\'][^>]*>'
|
||||
htmlLinks = re.findall(htmlLinkPattern, content, re.IGNORECASE)
|
||||
|
||||
for url in htmlLinks:
|
||||
if url and not url.startswith('#') and not url.startswith('javascript:'):
|
||||
try:
|
||||
cleanedUrl = _cleanUrl(url)
|
||||
links.append(cleanedUrl)
|
||||
logger.debug(f"Extracted HTML link: {url} -> {cleanedUrl}")
|
||||
except Exception as e:
|
||||
logger.debug(f"Failed to clean HTML link {url}: {e}")
|
||||
|
||||
# Extract markdown links: [text](url) format
|
||||
markdownLinkPattern = r'\[([^\]]+)\]\(([^)]+)\)'
|
||||
markdownLinks = re.findall(markdownLinkPattern, content)
|
||||
|
||||
for text, url in markdownLinks:
|
||||
if url and not url.startswith('#'):
|
||||
try:
|
||||
cleanedUrl = _cleanUrl(url)
|
||||
# Only keep URLs from the same domain
|
||||
if urlparse(cleanedUrl).netloc == urlparse(baseUrl).netloc:
|
||||
links.append(cleanedUrl)
|
||||
logger.debug(f"Extracted markdown link: {url} -> {cleanedUrl}")
|
||||
except Exception as e:
|
||||
logger.debug(f"Failed to clean markdown link {url}: {e}")
|
||||
|
||||
# Extract plain URLs in the text
|
||||
urlPattern = r'https?://[^\s\)]+'
|
||||
plainUrls = re.findall(urlPattern, content)
|
||||
|
||||
for url in plainUrls:
|
||||
try:
|
||||
cleanUrl = url.rstrip('.,;!?')
|
||||
cleanedUrl = _cleanUrl(cleanUrl)
|
||||
if urlparse(cleanedUrl).netloc == urlparse(baseUrl).netloc:
|
||||
if cleanedUrl not in links: # Avoid duplicates
|
||||
links.append(cleanedUrl)
|
||||
logger.debug(f"Extracted plain URL: {url} -> {cleanedUrl}")
|
||||
except Exception as e:
|
||||
logger.debug(f"Failed to clean plain URL {url}: {e}")
|
||||
|
||||
logger.debug(f"Total links extracted and cleaned: {len(links)}")
|
||||
return links
|
||||
|
||||
except Exception as e:
|
||||
logger.warning(f"Failed to extract links from content: {e}")
|
||||
return []
|
||||
|
||||
def _normalizeUrl(self, url: str) -> str:
|
||||
"""Normalize URL to handle variations that should be considered duplicates."""
|
||||
if not url:
|
||||
return url
|
||||
|
||||
# Remove trailing slashes and fragments
|
||||
url = url.rstrip('/')
|
||||
if '#' in url:
|
||||
url = url.split('#')[0]
|
||||
|
||||
# Handle common URL variations
|
||||
url = url.replace('http://', 'https://') # Normalize protocol
|
||||
|
||||
return url
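# Example (illustrative): _normalizeUrl treats these variants as the same page.
#
#     aiObjects._normalizeUrl("http://example.com/docs/")       # -> "https://example.com/docs"
#     aiObjects._normalizeUrl("https://example.com/docs#intro") # -> "https://example.com/docs"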
|
||||
|
||||
async def crawlRecursively(self, urls: List[str], max_depth: int, extract_depth: str = "advanced", max_per_domain: int = 10, global_processed_urls: Optional[set] = None) -> Dict[str, str]:
|
||||
"""
|
||||
Recursively crawl URLs up to specified depth.
|
||||
|
||||
Args:
|
||||
urls: List of starting URLs to crawl
|
||||
max_depth: Maximum depth to crawl (1=main pages only, 2=main+sub-pages, etc.)
|
||||
extract_depth: Tavily extract depth setting
|
||||
max_per_domain: Maximum URLs per domain per level
|
||||
global_processed_urls: Optional global set to track processed URLs across sessions
|
||||
|
||||
Returns:
|
||||
Dictionary mapping URL -> content for all crawled pages
|
||||
"""
|
||||
logger.info(f"Starting recursive crawl: {len(urls)} starting URLs, max_depth={max_depth}")
|
||||
|
||||
# URL index to track all processed URLs (local + global)
|
||||
processed_urls = set()
|
||||
if global_processed_urls is not None:
|
||||
# Use global index if provided, otherwise create local one
|
||||
processed_urls = global_processed_urls
|
||||
logger.info(f"Using global URL index with {len(processed_urls)} already processed URLs")
|
||||
else:
|
||||
logger.info("Using local URL index for this crawl session")
|
||||
|
||||
all_content = {}
|
||||
|
||||
# Current level URLs to process
|
||||
current_level_urls = urls.copy()
|
||||
|
||||
try:
|
||||
for depth in range(1, max_depth + 1):
|
||||
logger.info(f"=== DEPTH LEVEL {depth}/{max_depth} ===")
|
||||
logger.info(f"Processing {len(current_level_urls)} URLs at depth {depth}")
|
||||
|
||||
# URLs found at this level (for next iteration)
|
||||
next_level_urls = []
|
||||
|
||||
for url in current_level_urls:
|
||||
# Normalize URL for duplicate checking
|
||||
normalized_url = self._normalizeUrl(url)
|
||||
if normalized_url in processed_urls:
|
||||
logger.debug(f"URL {url} (normalized: {normalized_url}) already processed, skipping")
|
||||
continue
|
||||
|
||||
try:
|
||||
logger.info(f"Processing URL at depth {depth}: {url}")
|
||||
logger.debug(f"Total processed URLs so far: {len(processed_urls)}")
|
||||
|
||||
# Read page content
|
||||
content = await self.readPage(url, extract_depth)
|
||||
if content:
|
||||
all_content[url] = content
|
||||
processed_urls.add(normalized_url)
|
||||
logger.info(f"✓ Successfully processed {url}: {len(content)} chars")
|
||||
|
||||
# Get URLs from this page for next level
|
||||
page_urls = await self.getUrlsFromPage(url, extract_depth)
|
||||
logger.info(f"Found {len(page_urls)} URLs on {url}")
|
||||
|
||||
# Filter URLs and add to next level
|
||||
filtered_urls = self.filterUrlsOnlyPages(page_urls, max_per_domain)
|
||||
logger.info(f"Filtered to {len(filtered_urls)} valid URLs")
|
||||
|
||||
# Add new URLs to next level (avoiding already processed ones)
|
||||
new_urls_count = 0
|
||||
for new_url in filtered_urls:
|
||||
normalized_new_url = self._normalizeUrl(new_url)
|
||||
if normalized_new_url not in processed_urls:
|
||||
next_level_urls.append(new_url)
|
||||
new_urls_count += 1
|
||||
else:
|
||||
logger.debug(f"URL {new_url} (normalized: {normalized_new_url}) already processed, skipping")
|
||||
|
||||
logger.info(f"Added {new_urls_count} new URLs to next level from {url}")
|
||||
else:
|
||||
logger.warning(f"✗ No content extracted from {url}")
|
||||
processed_urls.add(normalized_url) # Mark as processed to avoid retry
|
||||
|
||||
except Exception as e:
|
||||
logger.warning(f"✗ Failed to process URL {url} at depth {depth}: {e}")
|
||||
processed_urls.add(normalized_url) # Mark as processed to avoid retry
|
||||
|
||||
# Prepare for next iteration
|
||||
current_level_urls = next_level_urls
|
||||
logger.info(f"Depth {depth} completed. Found {len(next_level_urls)} URLs for next level")
|
||||
|
||||
# Stop if no more URLs to process
|
||||
if not current_level_urls:
|
||||
logger.info(f"No more URLs found at depth {depth}, stopping recursion")
|
||||
break
|
||||
|
||||
logger.info(f"Recursive crawl completed: {len(all_content)} total pages crawled")
|
||||
logger.info(f"Total URLs processed (including skipped): {len(processed_urls)}")
|
||||
logger.info(f"Unique URLs found: {len(all_content)}")
|
||||
return all_content
|
||||
|
||||
except asyncio.TimeoutError:
|
||||
logger.warning(f"Crawling timed out, returning partial results: {len(all_content)} pages crawled so far")
|
||||
return all_content
|
||||
except Exception as e:
|
||||
logger.error(f"Crawling failed with error: {e}, returning partial results: {len(all_content)} pages crawled so far")
|
||||
return all_content
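# Usage sketch (illustrative): a depth-2 crawl sharing one processed-URL index
# across calls so repeated research runs do not re-fetch the same pages.
# The starting URL is a placeholder.
#
#     import asyncio
#
#     async def demo_crawl(aiObjects):
#         seen = set()
#         pages = await aiObjects.crawlRecursively(
#             urls=["https://example.com"], max_depth=2,
#             extract_depth="basic", max_per_domain=4,
#             global_processed_urls=seen,
#         )
#         print(len(pages), "pages,", len(seen), "URLs visited")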
|
||||
|
||||
async def webQuery(self, query: str, context: str = "", options: AiCallOptions = None) -> AiCallResponse:
|
||||
"""Use Perplexity AI to provide the best answers for web-related queries."""
|
||||
|
||||
if options is None:
|
||||
options = AiCallOptions(operationType=OperationTypeEnum.WEB_RESEARCH)
|
||||
|
||||
# Calculate input bytes
|
||||
inputBytes = len((query + context).encode("utf-8"))
|
||||
|
||||
# Create a comprehensive prompt for web queries
|
||||
webPrompt = f"""You are an expert web researcher and information analyst. Please provide a comprehensive and accurate answer to the following web-related query.
|
||||
|
||||
Query: {query}
|
||||
|
||||
{f"Additional Context: {context}" if context else ""}
|
||||
|
||||
Please provide:
|
||||
1. A clear, well-structured answer to the query
|
||||
2. Key points and important details
|
||||
3. Relevant insights and analysis
|
||||
4. Any important considerations or caveats
|
||||
5. Suggestions for further research if applicable
|
||||
|
||||
Format your response in a clear, professional manner that would be helpful for someone researching this topic."""
|
||||
|
||||
try:
|
||||
# Start timing
|
||||
startTime = time.time()
|
||||
|
||||
# Use Perplexity for web research with search capabilities
|
||||
perplexity_connector = modelRegistry.getConnectorForModel("perplexity_callAiWithWebSearch")
|
||||
if not perplexity_connector:
|
||||
raise ValueError("Perplexity connector not available")
|
||||
response = await perplexity_connector.callAiWithWebSearch(webPrompt)
|
||||
|
||||
# Calculate timing and output bytes
|
||||
endTime = time.time()
|
||||
processingTime = endTime - startTime
|
||||
outputBytes = len(response.encode("utf-8"))
|
||||
|
||||
# Calculate price using Perplexity model pricing
|
||||
perplexity_model = modelRegistry.getModel("perplexity_callAiWithWebSearch")
|
||||
priceUsd = perplexity_model.calculatePriceUsd(inputBytes, outputBytes)
|
||||
|
||||
logger.info(f"✅ Web query successful with Perplexity")
|
||||
return AiCallResponse(
|
||||
content=response,
|
||||
modelName="perplexity_callAiWithWebSearch",
|
||||
priceUsd=priceUsd,
|
||||
processingTime=processingTime,
|
||||
bytesSent=inputBytes,
|
||||
bytesReceived=outputBytes,
|
||||
errorCount=0
|
||||
)
|
||||
except Exception as e:
|
||||
logger.error(f"Perplexity web query failed: {str(e)}")
|
||||
return AiCallResponse(
|
||||
content=f"Web query failed: {str(e)}",
|
||||
modelName="perplexity_callAiWithWebSearch",
|
||||
priceUsd=0.0,
|
||||
processingTime=0.0,
|
||||
bytesSent=inputBytes,
|
||||
bytesReceived=0,
|
||||
errorCount=1
|
||||
)
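# Usage sketch (illustrative): webQuery wraps the Perplexity call in an
# AiCallResponse, so callers read .content and can inspect cost/timing fields.
#
#     async def demo_query(aiObjects):
#         resp = await aiObjects.webQuery("current EU AI Act status", context="regulatory summary")
#         print(resp.content[:300])
#         print(resp.modelName, resp.priceUsd, resp.processingTime)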
|
||||
|
||||
# Utility methods
|
||||
async def listAvailableModels(self, connectorType: str = None) -> List[Dict[str, Any]]:
|
||||
"""List available models, optionally filtered by connector type."""
|
||||
|
|
@ -1085,163 +664,8 @@ Format your response in a clear, professional manner that would be helpful for s
|
|||
raise ValueError(f"Model {modelName} not found")
|
||||
return model.model_dump()
|
||||
|
||||
|
||||
async def getModelsByTag(self, tag: str) -> List[str]:
|
||||
"""Get model names that have a specific tag."""
|
||||
models = modelRegistry.getModelsByTag(tag)
|
||||
return [model.name for model in models]
|
||||
|
||||
async def selectRelevantWebsites(self, websites: List[str], userQuestion: str) -> Tuple[List[str], str]:
|
||||
"""Select most relevant websites using AI analysis. Returns (selected_websites, ai_response)."""
|
||||
if len(websites) <= 1:
|
||||
return websites, "Only one website available, no selection needed"
|
||||
|
||||
try:
|
||||
# Create website summaries for AI analysis
|
||||
websiteSummaries = []
|
||||
for i, url in enumerate(websites, 1):
|
||||
from urllib.parse import urlparse
|
||||
domain = urlparse(url).netloc
|
||||
summary = f"{i}. {url} (Domain: {domain})"
|
||||
websiteSummaries.append(summary)
|
||||
|
||||
selectionPrompt = f"""
|
||||
Based on this user request: "{userQuestion}"
|
||||
|
||||
I have {len(websites)} websites found. Please select the most relevant website(s) for this request.
|
||||
|
||||
Available websites:
|
||||
{chr(10).join(websiteSummaries)}
|
||||
|
||||
Please respond with the website number(s) (1, 2, 3, etc.) that are most relevant.
|
||||
Format: 1,3,5 (or just 1 for single selection)
|
||||
"""
|
||||
|
||||
# Use Perplexity to select the best websites
|
||||
# webQuery returns an AiCallResponse; keep only its text content here
response = (await self.webQuery(selectionPrompt)).content
|
||||
|
||||
# Parse the selection
|
||||
import re
|
||||
numbers = re.findall(r'\d+', response)
|
||||
if numbers:
|
||||
selectedWebsites = []
|
||||
for num in numbers:
|
||||
index = int(num) - 1
|
||||
if 0 <= index < len(websites):
|
||||
selectedWebsites.append(websites[index])
|
||||
|
||||
if selectedWebsites:
|
||||
logger.info(f"AI selected {len(selectedWebsites)} websites")
|
||||
return selectedWebsites, response
|
||||
|
||||
# Fallback to first website
|
||||
logger.warning("AI selection failed, using first website")
|
||||
return websites[:1], f"AI selection failed, fallback to first website. AI response: {response}"
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error in website selection: {str(e)}")
|
||||
return websites[:1], f"Error in website selection: {str(e)}"
|
||||
|
||||
async def analyzeContentWithChunking(self, allContent: Dict[str, str], userQuestion: str) -> str:
|
||||
"""Analyze content using AI with chunking for large content."""
|
||||
logger.info(f"Analyzing {len(allContent)} websites with AI")
|
||||
|
||||
# Process content in chunks to avoid token limits
|
||||
chunkSize = 50000 # 50k chars per chunk
|
||||
allChunks = []
|
||||
|
||||
for url, content in allContent.items():
|
||||
filteredContent = self._filterContent(content)
|
||||
if len(filteredContent) <= chunkSize:
|
||||
allChunks.append((url, filteredContent))
|
||||
logger.info(f"Content from {url}: {len(filteredContent)} chars (single chunk)")
|
||||
else:
|
||||
# Split large content into chunks
|
||||
chunkCount = (len(filteredContent) + chunkSize - 1) // chunkSize
|
||||
logger.info(f"Content from {url}: {len(filteredContent)} chars (split into {chunkCount} chunks)")
|
||||
for i in range(0, len(filteredContent), chunkSize):
|
||||
chunk = filteredContent[i:i+chunkSize]
|
||||
chunkNum = i//chunkSize + 1
|
||||
allChunks.append((f"{url} (part {chunkNum})", chunk))
|
||||
|
||||
logger.info(f"Processing {len(allChunks)} content chunks")
|
||||
|
||||
# Analyze each chunk
|
||||
chunkAnalyses = []
|
||||
for i, (url, chunk) in enumerate(allChunks, 1):
|
||||
logger.info(f"Analyzing chunk {i}/{len(allChunks)}: {url}")
|
||||
|
||||
try:
|
||||
analysisPrompt = f"""
|
||||
Analyze this web content and extract relevant information for: {userQuestion}
|
||||
|
||||
Source: {url}
|
||||
Content: {chunk}
|
||||
|
||||
Please extract key information relevant to the query.
|
||||
"""
|
||||
|
||||
analysis = (await self.webQuery(analysisPrompt)).content
|
||||
chunkAnalyses.append(analysis)
|
||||
logger.info(f"Chunk {i}/{len(allChunks)} analyzed successfully")
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Chunk {i}/{len(allChunks)} error: {e}")
|
||||
|
||||
# Combine all chunk analyses
|
||||
if chunkAnalyses:
|
||||
logger.info(f"Combining {len(chunkAnalyses)} chunk analyses")
|
||||
combinedAnalysis = "\n\n".join(chunkAnalyses)
|
||||
|
||||
# Final synthesis
|
||||
try:
|
||||
logger.info("Performing final synthesis of all analyses")
|
||||
synthesisPrompt = f"""
|
||||
Based on these partial analyses, provide a comprehensive answer to: {userQuestion}
|
||||
|
||||
Partial analyses:
|
||||
{combinedAnalysis}
|
||||
|
||||
Please provide a clear, well-structured answer to the query.
|
||||
"""
|
||||
|
||||
finalAnalysis = (await self.webQuery(synthesisPrompt)).content
|
||||
logger.info("Final synthesis completed successfully")
|
||||
return finalAnalysis
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Synthesis error: {e}")
|
||||
return combinedAnalysis
|
||||
else:
|
||||
logger.error("No content could be analyzed")
|
||||
return "No content could be analyzed"
|
||||
|
||||
def _filterContent(self, content: str) -> str:
|
||||
"""Filter out navigation, ads, and other nonsense content."""
|
||||
lines = content.split('\n')
|
||||
filteredLines = []
|
||||
|
||||
for line in lines:
|
||||
line = line.strip()
|
||||
# Skip empty lines
|
||||
if not line:
|
||||
continue
|
||||
# Skip navigation elements
|
||||
if any(skip in line.lower() for skip in [
|
||||
'toggle navigation', 'log in', 'sign up', 'cookies', 'privacy policy',
|
||||
'terms of service', 'subscribe', 'newsletter', 'follow us', 'share this',
|
||||
'advertisement', 'sponsored', 'banner', 'popup', 'modal'
|
||||
]):
|
||||
continue
|
||||
# Skip image references without context
|
||||
if line.startswith('![') and line.endswith(')') and '---' in line:
|
||||
continue
|
||||
# Keep meaningful content
|
||||
if len(line) > 10: # Skip very short lines
|
||||
filteredLines.append(line)
|
||||
|
||||
return '\n'.join(filteredLines)
|
||||
|
||||
|
|
|
|||
|
|
@ -7,7 +7,6 @@ from modules.aicore.aicorePluginTavily import WebResearchRequest, WebResearchRes
|
|||
from modules.interfaces.interfaceAiObjects import AiObjects
|
||||
from modules.services.serviceAi.subCoreAi import SubCoreAi
|
||||
from modules.services.serviceAi.subDocumentProcessing import SubDocumentProcessing
|
||||
from modules.services.serviceAi.subWebResearch import SubWebResearch
|
||||
from modules.services.serviceAi.subDocumentGeneration import SubDocumentGeneration
|
||||
|
||||
|
||||
|
|
@ -19,7 +18,6 @@ class AiService:
|
|||
Manager delegates to specialized sub-modules:
|
||||
- SubCoreAi: Core AI operations (readImage, generateImage, callAi, planning, text calls)
|
||||
- SubDocumentProcessing: Document chunking, processing, and merging logic
|
||||
- SubWebResearch: Web research and crawling functionality
|
||||
- SubDocumentGeneration: Single-file and multi-file document generation
|
||||
|
||||
The main service acts as a coordinator:
|
||||
|
|
@ -40,7 +38,6 @@ class AiService:
|
|||
self._extractionService = None # Lazy initialization
|
||||
self._coreAi = None # Lazy initialization
|
||||
self._documentProcessor = None # Lazy initialization
|
||||
self._webResearch = None # Lazy initialization
|
||||
self._documentGenerator = None # Lazy initialization
|
||||
|
||||
@property
|
||||
|
|
@ -69,13 +66,6 @@ class AiService:
|
|||
self._documentProcessor = SubDocumentProcessing(self.services, self.aiObjects)
|
||||
return self._documentProcessor
|
||||
|
||||
@property
|
||||
def webResearchService(self):
|
||||
"""Lazy initialization of web research service."""
|
||||
if self._webResearch is None:
|
||||
logger.info("Lazy initializing SubWebResearch...")
|
||||
self._webResearch = SubWebResearch(self.services, self.aiObjects)
|
||||
return self._webResearch
|
||||
|
||||
@property
|
||||
def documentGenerator(self):
|
||||
|
|
@ -127,11 +117,6 @@ class AiService:
|
|||
await self._ensureAiObjectsInitialized()
|
||||
return await self.coreAi.generateImage(prompt, size, quality, style, options)
|
||||
|
||||
# Web Research
|
||||
async def webResearch(self, request: WebResearchRequest) -> WebResearchResult:
|
||||
"""Perform web research using interface functions."""
|
||||
await self._ensureAiObjectsInitialized()
|
||||
return await self.webResearchService.webResearch(request)
|
||||
|
||||
# Core AI Methods - Delegating to SubCoreAi
|
||||
async def callAiPlanning(
|
||||
|
|
|
|||
|
|
@ -1,388 +0,0 @@
|
|||
import logging
|
||||
from typing import Optional
|
||||
from modules.aicore.aicorePluginTavily import WebResearchRequest, WebResearchResult
|
||||
from modules.shared.configuration import APP_CONFIG
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class SubWebResearch:
|
||||
"""Web research operations including search, crawling, and analysis."""
|
||||
|
||||
def __init__(self, services, aiObjects):
|
||||
"""Initialize web research service.
|
||||
|
||||
Args:
|
||||
services: Service center instance for accessing other services
|
||||
aiObjects: Initialized AiObjects instance
|
||||
"""
|
||||
self.services = services
|
||||
self.aiObjects = aiObjects
|
||||
|
||||
async def webResearch(self, request: WebResearchRequest) -> WebResearchResult:
|
||||
"""Perform web research using interface functions."""
|
||||
try:
|
||||
logger.info(f"WEB RESEARCH STARTED")
|
||||
logger.info(f"User Query: {request.user_prompt}")
|
||||
logger.info(f"Max Results: {request.max_results}, Max Pages: {request.options.max_pages}")
|
||||
|
||||
# Global URL index to track all processed URLs across the entire research session
|
||||
global_processed_urls = set()
|
||||
|
||||
# Step 1: Find relevant websites - either provided URLs or AI-determined main URLs
|
||||
logger.info(f"=== STEP 1: INITIAL MAIN URLS LIST ===")
|
||||
|
||||
if request.urls:
|
||||
# Use provided URLs as initial main URLs
|
||||
websites = request.urls
|
||||
logger.info(f"Using provided URLs ({len(websites)}):")
|
||||
for i, url in enumerate(websites, 1):
|
||||
logger.info(f" {i}. {url}")
|
||||
else:
|
||||
# Use AI to determine main URLs based on user's intention
|
||||
logger.info(f"AI analyzing user intent: '{request.user_prompt}'")
|
||||
|
||||
# Use AI to generate optimized Tavily search query and search parameters
|
||||
query_optimizer_prompt = f"""You are a search query optimizer.
|
||||
|
||||
USER QUERY: {request.user_prompt}
|
||||
|
||||
Your task: Create a search query and parameters for the USER QUERY given.
|
||||
|
||||
RULES:
|
||||
1. The search query MUST be related to the user query above
|
||||
2. Extract key terms from the user query
|
||||
3. Determine appropriate country/language based on the query context
|
||||
4. Keep search query short (2-6 words)
|
||||
|
||||
Return ONLY this JSON format:
|
||||
{{
|
||||
"user_prompt": "search query based on user query above",
|
||||
"country": "Full English country name (ISO-3166; map codes via pycountry/i18n-iso-countries)",
|
||||
"language": "language_code_or_null",
|
||||
"topic": "general|news|academic_or_null",
|
||||
"time_range": "d|w|m|y_or_null",
|
||||
"selection_strategy": "single|multiple|specific_page",
|
||||
"selection_criteria": "what URLs to prioritize",
|
||||
"expected_url_patterns": ["pattern1", "pattern2"],
|
||||
"estimated_result_count": number
|
||||
}}"""
|
||||
|
||||
# Get AI response for query optimization
|
||||
from modules.datamodels.datamodelAi import AiCallRequest, AiCallOptions
|
||||
ai_request = AiCallRequest(
|
||||
prompt=query_optimizer_prompt,
|
||||
options=AiCallOptions()
|
||||
)
|
||||
|
||||
# Write web research query optimization prompt to debug file
|
||||
self.services.utils.writeDebugFile(query_optimizer_prompt, "web_research_query_optimizer_prompt")
|
||||
|
||||
ai_response_obj = await self.aiObjects.call(ai_request)
|
||||
ai_response = ai_response_obj.content
|
||||
|
||||
# Write web research query optimization response to debug file
|
||||
self.services.utils.writeDebugFile(ai_response, "web_research_query_optimizer_response")
|
||||
logger.debug(f"AI query optimizer response: {ai_response}")
|
||||
|
||||
# Parse AI response to extract search query
|
||||
import json
|
||||
try:
|
||||
# Clean the response by removing markdown code blocks
|
||||
cleaned_response = ai_response.strip()
|
||||
if cleaned_response.startswith('```json'):
|
||||
cleaned_response = cleaned_response[7:] # Remove ```json
|
||||
if cleaned_response.endswith('```'):
|
||||
cleaned_response = cleaned_response[:-3] # Remove ```
|
||||
cleaned_response = cleaned_response.strip()
|
||||
|
||||
query_data = json.loads(cleaned_response)
|
||||
search_query = query_data.get("user_prompt", request.user_prompt)
|
||||
ai_country = query_data.get("country")
|
||||
ai_language = query_data.get("language")
|
||||
ai_topic = query_data.get("topic")
|
||||
ai_time_range = query_data.get("time_range")
|
||||
selection_strategy = query_data.get("selection_strategy", "multiple")
|
||||
selection_criteria = query_data.get("selection_criteria", "relevant URLs")
|
||||
expected_patterns = query_data.get("expected_url_patterns", [])
|
||||
estimated_count = query_data.get("estimated_result_count", request.max_results)
|
||||
|
||||
logger.info(f"AI optimized search query: '{search_query}'")
|
||||
logger.info(f"Selection strategy: {selection_strategy}")
|
||||
logger.info(f"Selection criteria: {selection_criteria}")
|
||||
logger.info(f"Expected URL patterns: {expected_patterns}")
|
||||
logger.info(f"Estimated result count: {estimated_count}")
|
||||
|
||||
except json.JSONDecodeError:
|
||||
logger.warning("Failed to parse AI response as JSON, using original query")
|
||||
search_query = request.user_prompt
|
||||
ai_country = None
|
||||
ai_language = None
|
||||
ai_topic = None
|
||||
ai_time_range = None
|
||||
selection_strategy = "multiple"
|
||||
|
||||
# Perform the web search with AI-determined parameters
|
||||
search_kwargs = {
|
||||
"query": search_query,
|
||||
"max_results": request.max_results,
|
||||
"search_depth": request.options.search_depth,
|
||||
"auto_parameters": False # Use explicit parameters
|
||||
}
|
||||
|
||||
# Add parameters only if they have valid values
|
||||
def _normalizeCountry(c: Optional[str]) -> Optional[str]:
|
||||
if not c:
|
||||
return None
|
||||
s = str(c).strip()
|
||||
if not s or s.lower() in ['null', 'none', 'undefined']:
|
||||
return None
|
||||
# Map common codes to full English names when easy to do without extra deps
|
||||
mapping = {
|
||||
'ch': 'Switzerland', 'che': 'Switzerland',
|
||||
'de': 'Germany', 'ger': 'Germany', 'deu': 'Germany',
|
||||
'at': 'Austria', 'aut': 'Austria',
|
||||
'us': 'United States', 'usa': 'United States', 'united states': 'United States',
|
||||
'uk': 'United Kingdom', 'gb': 'United Kingdom', 'gbr': 'United Kingdom'
|
||||
}
|
||||
key = s.lower()
|
||||
if key in mapping:
|
||||
return mapping[key]
|
||||
# If looks like full name, capitalize first letter only (Tavily accepts English names)
|
||||
return s
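# Example (illustrative): _normalizeCountry maps common ISO codes to the full
# English names Tavily expects and rejects null-like placeholders.
#
#     _normalizeCountry("ch")       # -> "Switzerland"
#     _normalizeCountry("usa")      # -> "United States"
#     _normalizeCountry("null")     # -> None
#     _normalizeCountry("Germany")  # -> "Germany" (already a full name)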
|
||||
|
||||
norm_ai_country = _normalizeCountry(ai_country)
|
||||
norm_req_country = _normalizeCountry(request.options.country)
|
||||
if norm_ai_country:
|
||||
search_kwargs["country"] = norm_ai_country
|
||||
elif norm_req_country:
|
||||
search_kwargs["country"] = norm_req_country
|
||||
|
||||
if ai_language and ai_language not in ['null', '', 'none', 'undefined']:
|
||||
search_kwargs["language"] = ai_language
|
||||
elif request.options.language and request.options.language not in ['null', '', 'none', 'undefined']:
|
||||
search_kwargs["language"] = request.options.language
|
||||
|
||||
if ai_topic and ai_topic in ['general', 'news', 'academic']:
|
||||
search_kwargs["topic"] = ai_topic
|
||||
elif request.options.topic and request.options.topic in ['general', 'news', 'academic']:
|
||||
search_kwargs["topic"] = request.options.topic
|
||||
|
||||
if ai_time_range and ai_time_range in ['d', 'w', 'm', 'y']:
|
||||
search_kwargs["time_range"] = ai_time_range
|
||||
elif request.options.time_range and request.options.time_range in ['d', 'w', 'm', 'y']:
|
||||
search_kwargs["time_range"] = request.options.time_range
|
||||
|
||||
# Constrain by expected domains if provided by AI
|
||||
try:
|
||||
include_domains = []
|
||||
for p in expected_patterns or []:
|
||||
if not isinstance(p, str):
|
||||
continue
|
||||
# Extract bare domain from pattern or URL
|
||||
import re
|
||||
m = re.search(r"(?:https?://)?([^/\s]+)", p.strip())
|
||||
if m:
|
||||
domain = m.group(1).lower()
|
||||
# strip leading www.
|
||||
if domain.startswith('www.'):
|
||||
domain = domain[4:]
|
||||
include_domains.append(domain)
|
||||
# Deduplicate
|
||||
if include_domains:
|
||||
seen = set()
|
||||
uniq = []
|
||||
for d in include_domains:
|
||||
if d not in seen:
|
||||
seen.add(d)
|
||||
uniq.append(d)
|
||||
search_kwargs["include_domains"] = uniq
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
# Log the parameters being used
|
||||
logger.info(f"Search parameters: country={search_kwargs.get('country', 'not_set')}, language={search_kwargs.get('language', 'not_set')}, topic={search_kwargs.get('topic', 'not_set')}, time_range={search_kwargs.get('time_range', 'not_set')}, include_domains={search_kwargs.get('include_domains', [])}")
|
||||
|
||||
search_results = await self.aiObjects.search_websites(**search_kwargs)
|
||||
|
||||
logger.debug(f"Web search returned {len(search_results)} results:")
|
||||
for i, result in enumerate(search_results, 1):
|
||||
logger.debug(f" {i}. {result.url} - {result.title}")
|
||||
|
||||
# Deduplicate while preserving order
|
||||
seen = set()
|
||||
search_urls = []
|
||||
for r in search_results:
|
||||
u = str(r.url)
|
||||
if u not in seen:
|
||||
seen.add(u)
|
||||
search_urls.append(u)
|
||||
|
||||
logger.info(f"After initial deduplication: {len(search_urls)} unique URLs from {len(search_results)} search results")
|
||||
|
||||
if not search_urls:
|
||||
logger.error("No relevant websites found")
|
||||
return WebResearchResult(success=False, error="No relevant websites found")
|
||||
|
||||
# Now use AI to determine the main URLs based on user's intention
|
||||
logger.info(f"AI selecting main URLs from {len(search_urls)} search results based on user intent")
|
||||
|
||||
# Create a prompt for AI to identify main URLs based on user's intention
|
||||
ai_prompt = f"""
|
||||
Select the most relevant URLs from these search results:
|
||||
|
||||
{chr(10).join([f"{i+1}. {url}" for i, url in enumerate(search_urls)])}
|
||||
|
||||
Return only the URLs that are most relevant for the user's query.
|
||||
One URL per line.
|
||||
"""
|
||||
# Create AI call request
|
||||
ai_request = AiCallRequest(
|
||||
prompt=ai_prompt,
|
||||
options=AiCallOptions()
|
||||
)
|
||||
|
||||
# Write web research URL selection prompt to debug file
|
||||
self.services.utils.writeDebugFile(ai_prompt, "web_research_url_selection_prompt")
|
||||
|
||||
ai_response_obj = await self.aiObjects.call(ai_request)
|
||||
ai_response = ai_response_obj.content
|
||||
|
||||
# Write web research URL selection response to debug file
|
||||
self.services.utils.writeDebugFile(ai_response, "web_research_url_selection_response")
|
||||
logger.debug(f"AI response for main URL selection: {ai_response}")
|
||||
|
||||
# Parse AI response to extract URLs
|
||||
websites = []
|
||||
for line in ai_response.strip().split('\n'):
|
||||
line = line.strip()
|
||||
if line and ('http://' in line or 'https://' in line):
|
||||
# Extract URL from the line
|
||||
for word in line.split():
|
||||
if word.startswith('http://') or word.startswith('https://'):
|
||||
websites.append(word.rstrip('.,;'))
|
||||
break
|
||||
|
||||
if not websites:
|
||||
logger.warning("AI did not identify any main URLs, using first few search results")
|
||||
websites = search_urls[:3] # Fallback to first 3 search results
|
||||
|
||||
# Deduplicate while preserving order
|
||||
seen = set()
|
||||
unique_websites = []
|
||||
for url in websites:
|
||||
if url not in seen:
|
||||
seen.add(url)
|
||||
unique_websites.append(url)
|
||||
|
||||
logger.info(f"After AI selection deduplication: {len(unique_websites)} unique URLs from {len(websites)} AI-selected URLs")
websites = unique_websites
|
||||
|
||||
logger.info(f"AI selected {len(websites)} main URLs (after deduplication):")
|
||||
for i, url in enumerate(websites, 1):
|
||||
logger.info(f" {i}. {url}")
|
||||
|
||||
# Step 2: Smart website selection using AI interface
|
||||
logger.info(f"=== STEP 2: FILTERED URL LIST BY USER PROMPT'S INTENTION ===")
|
||||
logger.info(f"AI analyzing {len(websites)} URLs for relevance to: '{request.user_prompt}'")
|
||||
|
||||
selectedWebsites, aiResponse = await self.aiObjects.selectRelevantWebsites(websites, request.user_prompt)
|
||||
|
||||
logger.debug(f"AI Response: {aiResponse}")
|
||||
logger.debug(f"AI selected {len(selectedWebsites)} most relevant URLs:")
|
||||
for i, url in enumerate(selectedWebsites, 1):
|
||||
logger.debug(f" {i}. {url}")
|
||||
|
||||
# Show which were filtered out
|
||||
filtered_out = [url for url in websites if url not in selectedWebsites]
|
||||
if filtered_out:
|
||||
logger.debug(f"Filtered out {len(filtered_out)} less relevant URLs:")
|
||||
for i, url in enumerate(filtered_out, 1):
|
||||
logger.debug(f" {i}. {url}")
|
||||
|
||||
# Step 3+4+5: Recursive crawling with configurable depth
|
||||
# Get configuration parameters
|
||||
max_depth = int(APP_CONFIG.get("Web_Research_MAX_DEPTH", "2"))
|
||||
max_links_per_domain = int(APP_CONFIG.get("Web_Research_MAX_LINKS_PER_DOMAIN", "4"))
|
||||
crawl_timeout_minutes = int(APP_CONFIG.get("Web_Research_CRAWL_TIMEOUT_MINUTES", "10"))
|
||||
crawl_timeout_seconds = crawl_timeout_minutes * 60
|
||||
|
||||
# Use the configured max_depth or the request's pages_search_depth, whichever is smaller
|
||||
effective_depth = min(max_depth, request.options.pages_search_depth)
|
||||
|
||||
logger.info(f"=== STEP 3+4+5: RECURSIVE CRAWLING (DEPTH {effective_depth}) ===")
|
||||
logger.info(f"Starting recursive crawl of {len(selectedWebsites)} main websites...")
|
||||
logger.info(f"Search depth: {effective_depth} levels (max configured: {max_depth})")
|
||||
logger.info(f"Max links per domain: {max_links_per_domain}")
|
||||
logger.info(f"Crawl timeout: {crawl_timeout_minutes} minutes")
|
||||
|
||||
# Use recursive crawling with URL index to avoid duplicates
|
||||
import asyncio
|
||||
try:
|
||||
allContent = await asyncio.wait_for(
|
||||
self.aiObjects.crawlRecursively(
|
||||
urls=selectedWebsites,
|
||||
max_depth=effective_depth,
|
||||
extract_depth=request.options.extract_depth,
|
||||
max_per_domain=max_links_per_domain,
|
||||
global_processed_urls=global_processed_urls
|
||||
),
|
||||
timeout=crawl_timeout_seconds
|
||||
)
|
||||
logger.info(f"Crawling completed within timeout: {len(allContent)} pages crawled")
|
||||
except asyncio.TimeoutError:
|
||||
logger.warning(f"Crawling timed out after {crawl_timeout_minutes} minutes, using partial results")
|
||||
# asyncio.wait_for cancels the crawl task, so its partial results are not
# reachable from here; fall back to an empty result set
allContent = {}
|
||||
|
||||
if not allContent:
|
||||
logger.error("Could not extract content from any websites")
|
||||
return WebResearchResult(success=False, error="Could not extract content from any websites")
|
||||
|
||||
logger.info(f"=== WEB RESEARCH COMPLETED ===")
|
||||
logger.info(f"Successfully crawled {len(allContent)} URLs total")
|
||||
logger.info(f"Crawl depth: {effective_depth} levels")
|
||||
|
||||
# Create simple result with raw content
|
||||
sources = [{"title": url, "url": url} for url in selectedWebsites]
|
||||
|
||||
# Get all additional links (all URLs except main ones)
|
||||
additional_links = [url for url in allContent.keys() if url not in selectedWebsites]
|
||||
|
||||
# Combine all content into a single result
|
||||
combinedContent = ""
|
||||
for url, content in allContent.items():
|
||||
combinedContent += f"\n\n=== {url} ===\n{content}\n"
|
||||
|
||||
# Create simplified document structure
|
||||
document = {
|
||||
"documentName": f"webResearch_{request.user_prompt[:50]}.json",
|
||||
"documentData": {
|
||||
"user_prompt": request.user_prompt,
|
||||
"analysis_result": combinedContent,
|
||||
"sources": sources,
|
||||
"additional_links": additional_links,
|
||||
"metadata": {
|
||||
"websites_analyzed": len(allContent),
|
||||
"additional_links_found": len(additional_links),
|
||||
"crawl_depth": effective_depth,
|
||||
"max_configured_depth": max_depth,
|
||||
"max_links_per_domain": max_links_per_domain,
|
||||
"crawl_timeout_minutes": crawl_timeout_minutes,
|
||||
"total_urls_crawled": len(allContent),
|
||||
"main_urls": len(selectedWebsites),
|
||||
"additional_urls": len(additional_links)
|
||||
}
|
||||
},
|
||||
"mimeType": "application/json"
|
||||
}
|
||||
|
||||
return WebResearchResult(
|
||||
success=True,
|
||||
documents=[document]
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error in web research: {str(e)}")
|
||||
return WebResearchResult(success=False, error=str(e))
|
||||
|
|
@ -10,7 +10,7 @@ from datetime import datetime, UTC
|
|||
|
||||
from modules.workflows.methods.methodBase import MethodBase, action
|
||||
from modules.datamodels.datamodelChat import ActionResult
|
||||
from modules.datamodels.datamodelAi import AiCallOptions
|
||||
from modules.datamodels.datamodelAi import AiCallOptions, OperationTypeEnum
|
||||
from modules.datamodels.datamodelChat import ChatDocument
|
||||
from modules.aicore.aicorePluginTavily import WebResearchRequest
|
||||
|
||||
|
|
@ -28,6 +28,7 @@ class MethodAi(MethodBase):
|
|||
"""Format current timestamp as YYYYMMDD-hhmmss for filenames."""
|
||||
return datetime.now(UTC).strftime("%Y%m%d-%H%M%S")
|
||||
|
||||
|
||||
@action
|
||||
async def process(self, parameters: Dict[str, Any]) -> ActionResult:
|
||||
"""
|
||||
|
|
@ -161,93 +162,512 @@ class MethodAi(MethodBase):
|
|||
error=str(e)
|
||||
)
|
||||
|
||||
|
||||
@action
|
||||
async def webSearch(self, parameters: Dict[str, Any]) -> ActionResult:
|
||||
"""
|
||||
GENERAL:
|
||||
- Purpose: Search the web and return a list of relevant URLs only.
|
||||
- Input requirements: searchPrompt (required); optional maxResults, timeRange, country, language.
|
||||
- Output format: JSON with search results and URLs.
|
||||
|
||||
Parameters:
|
||||
- searchPrompt (str, required): Natural language search prompt describing what to search for.
|
||||
- maxResults (int, optional): Maximum number of search results. Default: 5.
|
||||
- timeRange (str, optional): d | w | m | y for time filtering.
|
||||
- country (str, optional): Country name for localized results.
|
||||
- language (str, optional): Language code (e.g., de, en, fr).
|
||||
"""
|
||||
try:
|
||||
searchPrompt = parameters.get("searchPrompt")
|
||||
if not searchPrompt:
|
||||
return ActionResult.isFailure(error="Search prompt is required")
|
||||
|
||||
# Extract optional parameters
|
||||
maxResults = parameters.get("maxResults", 5)
|
||||
timeRange = parameters.get("timeRange")
|
||||
country = parameters.get("country")
|
||||
language = parameters.get("language")
|
||||
|
||||
# Build AI call options for web search
|
||||
options = AiCallOptions(
|
||||
operationType=OperationTypeEnum.WEB_SEARCH,
|
||||
resultFormat="json"
|
||||
)
|
||||
|
||||
# Create unified prompt JSON that both Tavily and Perplexity can understand
|
||||
promptData = {
|
||||
"searchPrompt": searchPrompt,
|
||||
"maxResults": maxResults,
|
||||
"timeRange": timeRange,
|
||||
"country": country,
|
||||
"language": language,
|
||||
"instructions": "Search the web and return a JSON response with a 'results' array containing objects with 'title', 'url', and optionally 'content' fields. Focus on finding relevant URLs for the search prompt."
|
||||
}
|
||||
|
||||
import json
|
||||
prompt = json.dumps(promptData, indent=2)
|
||||
|
||||
# Call AI service through unified path
|
||||
result = await self.services.ai.callAiDocuments(
|
||||
prompt=prompt,
|
||||
documents=None,
|
||||
options=options,
|
||||
outputFormat="json"
|
||||
)
|
||||
|
||||
# Process result to ensure consistent format
|
||||
processedResult = self._processWebSearchResult(result)
|
||||
|
||||
# Create meaningful filename
|
||||
meaningfulName = self._generateMeaningfulFileName(
|
||||
base_name="web_search",
|
||||
extension="json",
|
||||
action_name="search"
|
||||
)
|
||||
|
||||
from modules.datamodels.datamodelChat import ActionDocument
|
||||
actionDocument = ActionDocument(
|
||||
documentName=meaningfulName,
|
||||
documentData=processedResult,
|
||||
mimeType="application/json"
|
||||
)
|
||||
|
||||
return ActionResult.isSuccess(documents=[actionDocument])
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error in web search: {str(e)}")
|
||||
return ActionResult.isFailure(error=str(e))
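# Usage sketch (illustrative): invoking the webSearch action with its documented
# parameters. The MethodAi instance name and how the action runner awaits it are
# assumptions; parameter values are made up.
#
#     result = await methodAi.webSearch({
#         "searchPrompt": "open-source vector databases comparison",
#         "maxResults": 5,
#         "timeRange": "m",
#         "language": "en",
#     })
#     # result is an ActionResult whose documents carry the web_search_*.json output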
|
||||
|
||||
def _processWebSearchResult(self, result: str) -> str:
|
||||
"""
|
||||
Process web search result to ensure consistent JSON format with URL list.
|
||||
Both Tavily and Perplexity now return proper JSON format.
|
||||
"""
|
||||
try:
|
||||
import json
|
||||
data = json.loads(result)
|
||||
|
||||
# If it's already a proper search result format, return as-is
|
||||
if isinstance(data, dict) and "results" in data:
|
||||
return result
|
||||
|
||||
# If it's a different JSON format, try to extract URLs
|
||||
if isinstance(data, dict):
|
||||
# Look for URL patterns in the JSON
|
||||
urls = self._extractUrlsFromJson(data)
|
||||
if urls:
|
||||
processedData = {
|
||||
"query": data.get("query", "web search"),
|
||||
"results": [{"title": f"Result {i+1}", "url": url} for i, url in enumerate(urls)],
|
||||
"total_count": len(urls)
|
||||
}
|
||||
return json.dumps(processedData, indent=2)
|
||||
|
||||
# No URLs found, return original result in a structured format
|
||||
processedData = {
|
||||
"query": "web search",
|
||||
"results": [],
|
||||
"total_count": 0,
|
||||
"raw_response": result
|
||||
}
|
||||
return json.dumps(processedData, indent=2)
|
||||
|
||||
except Exception as e:
|
||||
logger.warning(f"Error processing web search result: {str(e)}")
|
||||
# Return original result wrapped in error format
|
||||
errorData = {
|
||||
"query": "web search",
|
||||
"results": [],
|
||||
"total_count": 0,
|
||||
"error": f"Failed to process result: {str(e)}",
|
||||
"raw_response": result
|
||||
}
|
||||
return json.dumps(errorData, indent=2)
|
||||
|
||||
def _extractUrlsFromJson(self, data: Dict[str, Any]) -> List[str]:
|
||||
"""Extract URLs from JSON data structure."""
|
||||
urls = []
|
||||
|
||||
def _extractFromValue(value):
|
||||
if isinstance(value, str):
|
||||
# Check if it's a URL
|
||||
if value.startswith(('http://', 'https://')):
|
||||
urls.append(value)
|
||||
elif isinstance(value, dict):
|
||||
for v in value.values():
|
||||
_extractFromValue(v)
|
||||
elif isinstance(value, list):
|
||||
for item in value:
|
||||
_extractFromValue(item)
|
||||
|
||||
_extractFromValue(data)
|
||||
return list(set(urls)) # Remove duplicates
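# Example (illustrative): _extractUrlsFromJson walks nested dicts/lists and
# collects anything that looks like an http(s) URL, deduplicated.
#
#     data = {"answer": "see sources", "sources": [{"url": "https://a.example"},
#             {"links": ["https://b.example", "https://a.example"]}]}
#     self._extractUrlsFromJson(data)
#     # -> ["https://a.example", "https://b.example"] (order not guaranteed)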
|
||||
|
||||
|
||||
@action
|
||||
async def webCrawl(self, parameters: Dict[str, Any]) -> ActionResult:
|
||||
"""
|
||||
GENERAL:
|
||||
- Purpose: Extract content from specific URLs.
|
||||
- Input requirements: urls (required); optional extractDepth, format.
|
||||
- Output format: JSON with extracted content from URLs.
|
||||
|
||||
Parameters:
|
||||
- urls (list, required): List of URLs to crawl and extract content from.
|
||||
- extractDepth (str, optional): basic | advanced. Default: advanced.
|
||||
- format (str, optional): markdown | html | text. Default: markdown.
|
||||
"""
|
||||
try:
|
||||
urls = parameters.get("urls")
|
||||
if not urls or not isinstance(urls, list):
|
||||
return ActionResult.isFailure(error="URLs list is required")
|
||||
|
||||
# Extract optional parameters
|
||||
extractDepth = parameters.get("extractDepth", "advanced")
|
||||
formatType = parameters.get("format", "markdown")
|
||||
|
||||
# Build AI call options for web crawling
|
||||
options = AiCallOptions(
|
||||
operationType=OperationTypeEnum.WEB_CRAWL,
|
||||
resultFormat="json"
|
||||
)
|
||||
|
||||
# Create unified prompt JSON for web crawling
|
||||
promptData = {
|
||||
"urls": urls,
|
||||
"extractDepth": extractDepth,
|
||||
"format": formatType,
|
||||
"instructions": "Extract content from the provided URLs and return a JSON response with 'results' array containing objects with 'url', 'title', 'content', and 'extractedAt' fields."
|
||||
}
|
||||
|
||||
import json
|
||||
prompt = json.dumps(promptData, indent=2)
|
||||
|
||||
# Call AI service through unified path
|
||||
result = await self.services.ai.callAiDocuments(
|
||||
prompt=prompt,
|
||||
documents=None,
|
||||
options=options,
|
||||
outputFormat="json"
|
||||
)
|
||||
|
||||
# Create meaningful filename
|
||||
meaningfulName = self._generateMeaningfulFileName(
|
||||
base_name="web_crawl",
|
||||
extension="json",
|
||||
action_name="crawl"
|
||||
)
|
||||
|
||||
from modules.datamodels.datamodelChat import ActionDocument
|
||||
actionDocument = ActionDocument(
|
||||
documentName=meaningfulName,
|
||||
documentData=result,
|
||||
mimeType="application/json"
|
||||
)
|
||||
|
||||
return ActionResult.isSuccess(documents=[actionDocument])
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error in web crawl: {str(e)}")
|
||||
return ActionResult.isFailure(error=str(e))
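# Usage sketch (illustrative): calling the webCrawl action on two known pages;
# the URLs and the methodAi instance are placeholders.
#
#     result = await methodAi.webCrawl({
#         "urls": ["https://example.com/pricing", "https://example.com/docs"],
#         "extractDepth": "basic",
#         "format": "markdown",
#     })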
|
||||
|
||||
|
||||
@action
|
||||
async def webResearch(self, parameters: Dict[str, Any]) -> ActionResult:
|
||||
"""
|
||||
GENERAL:
|
||||
- Purpose: Web research and information gathering with basic analysis and sources.
|
||||
- Input requirements: user_prompt (required); optional urls, max_results, max_pages, search_depth, extract_depth, pages_search_depth, country, time_range, topic, language.
|
||||
- Output format: JSON with results and sources.
|
||||
- Purpose: Comprehensive web research combining search and content extraction.
|
||||
- Input requirements: researchPrompt (required); optional maxResults, urls, timeRange, country, language.
|
||||
- Output format: JSON with research results, sources, and analysis.
|
||||
|
||||
Parameters:
|
||||
- user_prompt (str, required): Research question or topic.
|
||||
- urls (list, optional): Specific URLs to crawl.
|
||||
- max_results (int, optional): Max search results. Default: 5.
|
||||
- max_pages (int, optional): Max pages to crawl per site. Default: 5.
|
||||
- extract_depth (str, optional): basic | advanced. Default: advanced.
|
||||
- search_depth (int, optional): Crawl depth level - how many times to follow sublinks of a page. Default: 2.
|
||||
- country (str, optional): Full English country name (ISO-3166; map codes via pycountry/i18n-iso-countries).
|
||||
- time_range (str, optional): d | w | m | y.
|
||||
- topic (str, optional): general | news | academic.
|
||||
- researchPrompt (str, required): Natural language research prompt describing what to research.
|
||||
- maxResults (int, optional): Maximum search results. Default: 5.
|
||||
- urls (list, optional): Specific URLs to include in research.
|
||||
- timeRange (str, optional): d | w | m | y for time filtering.
|
||||
- country (str, optional): Country name for localized results.
|
||||
- language (str, optional): Language code (e.g., de, en, fr).
|
||||
"""
|
||||
try:
|
||||
user_prompt = parameters.get("user_prompt")
|
||||
researchPrompt = parameters.get("researchPrompt")
|
||||
if not researchPrompt:
|
||||
return ActionResult.isFailure(error="Research prompt is required")
|
||||
|
||||
# Extract optional parameters
|
||||
maxResults = parameters.get("maxResults", 5)
|
||||
urls = parameters.get("urls")
|
||||
max_results = parameters.get("max_results", 5)
|
||||
max_pages = parameters.get("max_pages", 5)
|
||||
extract_depth = parameters.get("extract_depth", "advanced")
|
||||
search_depth = parameters.get("pages_search_depth", 2)
|
||||
timeRange = parameters.get("timeRange")
|
||||
country = parameters.get("country")
|
||||
time_range = parameters.get("time_range")
|
||||
topic = parameters.get("topic")
|
||||
language = parameters.get("language")
|
||||
|
||||
if not user_prompt:
|
||||
return ActionResult.isFailure(
|
||||
error="Search query is required"
|
||||
# Build AI call options for web research
|
||||
options = AiCallOptions(
|
||||
operationType=OperationTypeEnum.WEB_RESEARCH,
|
||||
resultFormat="json"
|
||||
)
|
||||
|
||||
# Build WebResearchRequest (simplified dataclass)
|
||||
request = WebResearchRequest(
|
||||
user_prompt=user_prompt,
|
||||
urls=urls,
|
||||
max_results=max_results,
|
||||
max_pages=max_pages,
|
||||
search_depth=search_depth,
|
||||
extract_depth=extract_depth,
|
||||
country=country,
|
||||
time_range=time_range,
|
||||
topic=topic,
|
||||
language=language
|
||||
# Create unified prompt JSON for web research
|
||||
promptData = {
|
||||
"researchPrompt": researchPrompt,
|
||||
"maxResults": maxResults,
|
||||
"urls": urls,
|
||||
"timeRange": timeRange,
|
||||
"country": country,
|
||||
"language": language,
|
||||
"instructions": "Conduct comprehensive web research and return a JSON response with 'results' array containing objects with 'title', 'url', 'content', and 'analysis' fields. Provide detailed analysis and insights."
|
||||
}
|
||||
|
||||
import json
|
||||
prompt = json.dumps(promptData, indent=2)
|
||||
|
||||
# Call AI service through unified path
|
||||
result = await self.services.ai.callAiDocuments(
|
||||
prompt=prompt,
|
||||
documents=None,
|
||||
options=options,
|
||||
outputFormat="json"
|
||||
)
|
||||
|
||||
# Call web research service
|
||||
logger.info(f"Performing comprehensive web research for: {user_prompt}")
|
||||
logger.info(f"Max results: {max_results}, Max pages: {max_pages}")
|
||||
if urls:
|
||||
logger.info(f"Using provided URLs: {len(urls)}")
|
||||
|
||||
result = await self.services.ai.webResearch(request)
|
||||
|
||||
if not result.success:
|
||||
return ActionResult.isFailure(error=result.error)
|
||||
|
||||
# Convert WebResearchResult to ActionResult format
|
||||
documents = []
|
||||
for doc in result.documents:
|
||||
documents.append({
|
||||
"documentName": doc.documentName,
|
||||
"documentData": {
|
||||
"user_prompt": doc.documentData.user_prompt,
|
||||
"websites_analyzed": doc.documentData.websites_analyzed,
|
||||
"additional_links_found": doc.documentData.additional_links_found,
|
||||
"analysis_result": doc.documentData.analysis_result,
|
||||
"sources": [{"title": s.title, "url": str(s.url)} for s in doc.documentData.sources],
|
||||
"additional_links": doc.documentData.additional_links,
|
||||
"debug_info": doc.documentData.debug_info
|
||||
},
|
||||
"mimeType": doc.mimeType
|
||||
})
|
||||
|
||||
# Return result in the standard ActionResult format
|
||||
return ActionResult.isSuccess(
|
||||
documents=documents
|
||||
# Create meaningful filename
|
||||
meaningfulName = self._generateMeaningfulFileName(
|
||||
base_name="web_research",
|
||||
extension="json",
|
||||
action_name="research"
|
||||
)
|
||||
|
||||
from modules.datamodels.datamodelChat import ActionDocument
|
||||
actionDocument = ActionDocument(
|
||||
documentName=meaningfulName,
|
||||
documentData=result,
|
||||
mimeType="application/json"
|
||||
)
|
||||
|
||||
return ActionResult.isSuccess(documents=[actionDocument])
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error in web research: {str(e)}")
|
||||
return ActionResult.isFailure(
|
||||
error=str(e)
|
||||
return ActionResult.isFailure(error=str(e))
|
||||
|
||||
|
||||
    @action
    async def webQuestions(self, parameters: Dict[str, Any]) -> ActionResult:
        """
        GENERAL:
        - Purpose: Answer questions using web research and AI analysis.
        - Input requirements: question (required); optional context, maxResults, timeRange, country, language.
        - Output format: JSON with question answer and supporting sources.

        Parameters:
        - question (str, required): Question to be answered using web research.
        - context (str, optional): Additional context for the question.
        - maxResults (int, optional): Maximum search results. Default: 5.
        - timeRange (str, optional): d | w | m | y for time filtering.
        - country (str, optional): Country name for localized results.
        - language (str, optional): Language code (e.g., de, en, fr).
        """
        try:
            question = parameters.get("question")
            if not question:
                return ActionResult.isFailure(error="Question is required")

            # Extract optional parameters
            context = parameters.get("context", "")
            maxResults = parameters.get("maxResults", 5)
            timeRange = parameters.get("timeRange")
            country = parameters.get("country")
            language = parameters.get("language")

            # Build AI call options for web questions
            options = AiCallOptions(
                operationType=OperationTypeEnum.WEB_QUESTIONS,
                resultFormat="json"
            )

            # Create unified prompt JSON for web questions
            promptData = {
                "question": question,
                "context": context,
                "maxResults": maxResults,
                "timeRange": timeRange,
                "country": country,
                "language": language,
                "instructions": "Answer the question using web research and return a JSON response with 'answer', 'sources' array containing objects with 'title', 'url', 'content', and 'relevance' fields."
            }

            import json
            prompt = json.dumps(promptData, indent=2)

            # Call AI service through unified path
            result = await self.services.ai.callAiDocuments(
                prompt=prompt,
                documents=None,
                options=options,
                outputFormat="json"
            )

            # Create meaningful filename
            meaningfulName = self._generateMeaningfulFileName(
                base_name="web_questions",
                extension="json",
                action_name="questions"
            )

            from modules.datamodels.datamodelChat import ActionDocument
            actionDocument = ActionDocument(
                documentName=meaningfulName,
                documentData=result,
                mimeType="application/json"
            )

            return ActionResult.isSuccess(documents=[actionDocument])

        except Exception as e:
            logger.error(f"Error in web questions: {str(e)}")
            return ActionResult.isFailure(error=str(e))

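    # Illustrative usage sketch, not part of this diff: a caller might invoke the action as
    #
    #     result = await connector.webQuestions({
    #         "question": "Which Python version introduced structural pattern matching?",
    #         "maxResults": 3,
    #         "timeRange": "y",
    #         "language": "en",
    #     })
    #
    # `connector` and the example values are placeholders; the parameter names follow the
    # docstring above. On success the action returns an ActionResult carrying one JSON
    # ActionDocument named like web_questions_*.json (exact naming depends on
    # _generateMeaningfulFileName).
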
    @action
    async def webNews(self, parameters: Dict[str, Any]) -> ActionResult:
        """
        GENERAL:
        - Purpose: Search and analyze news articles on specific topics.
        - Input requirements: newsPrompt (required); optional maxResults, timeRange, country, language.
        - Output format: JSON with news articles, summaries, and analysis.

        Parameters:
        - newsPrompt (str, required): Natural language prompt describing what news to search for.
        - maxResults (int, optional): Maximum news articles. Default: 5.
        - timeRange (str, optional): d | w | m | y for time filtering. Default: w.
        - country (str, optional): Country name for localized news.
        - language (str, optional): Language code (e.g., de, en, fr).
        """
        try:
            newsPrompt = parameters.get("newsPrompt")
            if not newsPrompt:
                return ActionResult.isFailure(error="News prompt is required")

            # Extract optional parameters
            maxResults = parameters.get("maxResults", 5)
            timeRange = parameters.get("timeRange", "w")  # Default to week
            country = parameters.get("country")
            language = parameters.get("language")

            # Build AI call options for web news
            options = AiCallOptions(
                operationType=OperationTypeEnum.WEB_NEWS,
                resultFormat="json"
            )

            # Create unified prompt JSON for web news
            promptData = {
                "newsPrompt": newsPrompt,
                "maxResults": maxResults,
                "timeRange": timeRange,
                "country": country,
                "language": language,
                "instructions": "Find and analyze recent news articles and return a JSON response with 'articles' array containing objects with 'title', 'url', 'content', 'date', 'source', and 'summary' fields."
            }

            import json
            prompt = json.dumps(promptData, indent=2)

            # Call AI service through unified path
            result = await self.services.ai.callAiDocuments(
                prompt=prompt,
                documents=None,
                options=options,
                outputFormat="json"
            )

            # Create meaningful filename
            meaningfulName = self._generateMeaningfulFileName(
                base_name="web_news",
                extension="json",
                action_name="news"
            )

            from modules.datamodels.datamodelChat import ActionDocument
            actionDocument = ActionDocument(
                documentName=meaningfulName,
                documentData=result,
                mimeType="application/json"
            )

            return ActionResult.isSuccess(documents=[actionDocument])

        except Exception as e:
            logger.error(f"Error in web news: {str(e)}")
            return ActionResult.isFailure(error=str(e))

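    # Sketch of the unified prompt JSON that webNews hands to callAiDocuments, assuming a
    # call with newsPrompt="Latest EU AI regulation news", maxResults=5 and defaults for
    # the remaining parameters (example values only, not taken from this diff):
    #
    #     {
    #       "newsPrompt": "Latest EU AI regulation news",
    #       "maxResults": 5,
    #       "timeRange": "w",
    #       "country": null,
    #       "language": null,
    #       "instructions": "Find and analyze recent news articles and return a JSON response ..."
    #     }
    #
    # The trailing "..." stands for the full instruction string built in promptData above.
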
    @action
    async def generateImage(self, parameters: Dict[str, Any]) -> ActionResult:
        """
        GENERAL:
        - Purpose: Generate images using AI based on text prompts.
        - Input requirements: prompt (required); optional size, quality, style.
        - Output format: Base64 encoded image data.

        Parameters:
        - prompt (str, required): Text description of the image to generate.
        - size (str, optional): Image size. Options: 1024x1024, 1792x1024, 1024x1792. Default: 1024x1024.
        - quality (str, optional): Image quality. Options: standard, hd. Default: standard.
        - style (str, optional): Image style. Options: vivid, natural. Default: vivid.
        """
        try:
            prompt = parameters.get("prompt")
            if not prompt:
                return ActionResult.isFailure(error="Image prompt is required")

            # Extract optional parameters
            size = parameters.get("size", "1024x1024")
            quality = parameters.get("quality", "standard")
            style = parameters.get("style", "vivid")

            # Build AI call options for image generation
            options = AiCallOptions(
                operationType=OperationTypeEnum.IMAGE_GENERATE,
                resultFormat="base64"
            )

            # Create unified prompt JSON for image generation
            promptData = {
                "prompt": prompt,
                "size": size,
                "quality": quality,
                "style": style,
                "instructions": "Generate an image based on the prompt and return the base64 encoded image data."
            }

            import json
            promptJson = json.dumps(promptData, indent=2)

            # Call AI service through unified path
            result = await self.services.ai.callAiDocuments(
                prompt=promptJson,
                documents=None,
                options=options,
                outputFormat="base64"
            )

            # Create meaningful filename
            meaningfulName = self._generateMeaningfulFileName(
                base_name="generated_image",
                extension="png",
                action_name="generate"
            )

            from modules.datamodels.datamodelChat import ActionDocument
            actionDocument = ActionDocument(
                documentName=meaningfulName,
                documentData=result,
                mimeType="image/png"
            )

            return ActionResult.isSuccess(documents=[actionDocument])

        except Exception as e:
            logger.error(f"Error in image generation: {str(e)}")
            return ActionResult.isFailure(error=str(e))
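
    # Illustrative usage sketch, not part of this diff: requesting a wide, HD image.
    #
    #     result = await connector.generateImage({
    #         "prompt": "A minimalist line drawing of a lighthouse at dusk",
    #         "size": "1792x1024",
    #         "quality": "hd",
    #         "style": "natural",
    #     })
    #
    # `connector` and the example values are placeholders. On success the returned
    # ActionResult carries a single PNG ActionDocument whose documentData is the
    # base64-encoded image returned by callAiDocuments.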