End-to-end implementation of all AI models
Parent: daddf417be
Commit: 8d25ed6fc3
7 changed files with 1715 additions and 1095 deletions
@@ -317,10 +317,17 @@ class AiOpenai(BaseConnectorAi):
            messages = modelCall.messages
            model = modelCall.model
            options = modelCall.options
-           prompt = messages[0]["content"] if messages else ""
-           size = options.get("size", "1024x1024")
-           quality = options.get("quality", "standard")
-           style = options.get("style", "vivid")
+           # Parse unified prompt JSON format
+           promptContent = messages[0]["content"] if messages else ""
+           import json
+           promptData = json.loads(promptContent)
+
+           # Extract parameters from unified prompt JSON
+           prompt = promptData.get("prompt", promptContent)
+           size = promptData.get("size", "1024x1024")
+           quality = promptData.get("quality", "standard")
+           style = promptData.get("style", "vivid")
+
            logger.debug(f"Starting image generation with prompt: '{prompt[:100]}...'")
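For orientation, a minimal caller-side sketch of the unified prompt JSON this image path now parses. The field names (prompt, size, quality, style) are taken from the hunk above; the surrounding request construction is an illustrative assumption, not the repository's actual call site.

    import json

    # Hypothetical construction of the unified prompt payload consumed by
    # the AiOpenai image path (field names taken from the diff above).
    imagePrompt = json.dumps({
        "prompt": "A watercolor painting of a lighthouse at dawn",
        "size": "1024x1024",       # default used when the key is missing
        "quality": "standard",
        "style": "vivid",
    })
    messages = [{"role": "user", "content": imagePrompt}]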
@@ -80,7 +80,7 @@ class AiPerplexity(BaseConnectorAi):
    speedRating=6, # Slower due to AI analysis
    qualityRating=10, # Best AI analysis quality
    # capabilities removed (not used in business logic)
-   functionCall=self.callAiWithWebSearch,
+   functionCall=self.callWebOperation,
    priority=PriorityEnum.QUALITY,
    processingMode=ProcessingModeEnum.DETAILED,
    operationTypes=createOperationTypeRatings(

@@ -106,7 +106,7 @@ class AiPerplexity(BaseConnectorAi):
    speedRating=9, # Fast for basic AI tasks
    qualityRating=7, # Good but not premium quality
    # capabilities removed (not used in business logic)
-   functionCall=self.researchTopic,
+   functionCall=self.callWebOperation,
    priority=PriorityEnum.COST,
    processingMode=ProcessingModeEnum.BASIC,
    operationTypes=createOperationTypeRatings(

@@ -132,7 +132,7 @@ class AiPerplexity(BaseConnectorAi):
    speedRating=9, # Fast for Q&A tasks
    qualityRating=7, # Good but not premium quality
    # capabilities removed (not used in business logic)
-   functionCall=self.answerQuestion,
+   functionCall=self.callWebOperation,
    priority=PriorityEnum.COST,
    processingMode=ProcessingModeEnum.BASIC,
    operationTypes=createOperationTypeRatings(

@@ -158,7 +158,7 @@ class AiPerplexity(BaseConnectorAi):
    speedRating=9, # Fast for news tasks
    qualityRating=7, # Good but not premium quality
    # capabilities removed (not used in business logic)
-   functionCall=self.getCurrentNews,
+   functionCall=self.callWebOperation,
    priority=PriorityEnum.COST,
    processingMode=ProcessingModeEnum.BASIC,
    operationTypes=createOperationTypeRatings(
@@ -254,9 +254,48 @@ class AiPerplexity(BaseConnectorAi):
        temperature = options.get("temperature", model.temperature)
        maxTokens = model.maxTokens

+       # Parse unified prompt JSON format
+       promptContent = messages[0]["content"] if messages else ""
+       import json
+       promptData = json.loads(promptContent)
+
+       # Create a more specific prompt for Perplexity based on the unified format
+       searchPrompt = promptData.get("searchPrompt", promptContent)
+       maxResults = promptData.get("maxResults", 5)
+       timeRange = promptData.get("timeRange")
+       country = promptData.get("country")
+       language = promptData.get("language")
+
+       # Create enhanced prompt for Perplexity
+       enhancedPrompt = f"""Search the web for: {searchPrompt}
+
+Please provide a comprehensive response with relevant URLs and information.
+Focus on finding {maxResults} most relevant results.
+{f"Limit results to the last {timeRange}" if timeRange else ""}
+{f"Focus on {country}" if country else ""}
+{f"Provide results in {language}" if language else ""}
+
+Please format your response as a JSON object with the following structure:
+{{
+    "query": "{searchPrompt}",
+    "results": [
+        {{
+            "title": "Result title",
+            "url": "https://example.com",
+            "content": "Brief description or excerpt"
+        }}
+    ],
+    "total_count": number_of_results
+}}
+
+Include actual URLs in your response."""
+
+       # Update the messages with the enhanced prompt
+       enhancedMessages = [{"role": "user", "content": enhancedPrompt}]
+
        payload = {
            "model": model.name,
-           "messages": messages,
+           "messages": enhancedMessages,
            "temperature": temperature,
            "max_tokens": maxTokens
        }
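Likewise, a minimal sketch of the unified web-search prompt read by callAiWithWebSearch above; the keys mirror the promptData lookups in the hunk, the concrete values are made up.

    import json

    # Hypothetical unified web-search prompt (keys from the diff, values illustrative).
    searchPayload = json.dumps({
        "searchPrompt": "latest EU AI Act implementation guidance",
        "maxResults": 5,
        "timeRange": "m",        # optional
        "country": "Germany",    # optional
        "language": "en",        # optional
    })
    messages = [{"role": "user", "content": searchPayload}]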
@@ -472,6 +511,423 @@ class AiPerplexity(BaseConnectorAi):
            logger.error(f"Error getting current news: {str(e)}")
            raise HTTPException(status_code=500, detail=f"Error getting current news: {str(e)}")

async def crawl(self, modelCall: AiModelCall) -> AiModelResponse:
    """
    Crawl URLs using Perplexity's web search capabilities for content extraction.

    Args:
        modelCall: AiModelCall with messages and options

    Returns:
        AiModelResponse with content and metadata
    """
    try:
        # Extract parameters from modelCall
        messages = modelCall.messages
        model = modelCall.model
        options = modelCall.options
        temperature = options.get("temperature", model.temperature)
        maxTokens = model.maxTokens

        # Parse unified prompt JSON format
        promptContent = messages[0]["content"] if messages else ""
        import json
        promptData = json.loads(promptContent)

        # Extract parameters from unified prompt JSON
        urls = promptData.get("urls", [])
        extractDepth = promptData.get("extractDepth", "advanced")
        formatType = promptData.get("format", "markdown")

        if not urls:
            return AiModelResponse(
                content="No URLs provided for crawling",
                success=False,
                error="No URLs found in prompt data"
            )

        # Create enhanced prompt for Perplexity to crawl URLs
        urlsList = ", ".join(urls)
        enhancedPrompt = f"""Please extract and analyze content from these URLs: {urlsList}

Extraction requirements:
- Extract depth: {extractDepth}
- Output format: {formatType}
- Focus on main content, not navigation or ads
- Preserve important structure and formatting

Please format your response as a JSON object with the following structure:
{{
    "urls": {json.dumps(urls)},
    "results": [
        {{
            "url": "https://example.com",
            "title": "Page title",
            "content": "Extracted content in {formatType} format",
            "extractedAt": "2024-01-01T00:00:00Z"
        }}
    ],
    "total_count": number_of_urls_processed
}}

Extract content from each URL and provide detailed analysis."""

        # Update the messages with the enhanced prompt
        enhancedMessages = [{"role": "user", "content": enhancedPrompt}]

        payload = {
            "model": model.name,
            "messages": enhancedMessages,
            "temperature": temperature,
            "max_tokens": maxTokens
        }

        response = await self.httpClient.post(
            model.apiUrl,
            json=payload
        )

        if response.status_code != 200:
            error_detail = f"Perplexity Crawl API error: {response.status_code} - {response.text}"
            logger.error(error_detail)

            if response.status_code == 429:
                error_message = "Rate limit exceeded for crawl. Please wait before making another request."
            elif response.status_code == 401:
                error_message = "Invalid API key for crawl. Please check your Perplexity API configuration."
            elif response.status_code == 400:
                error_message = f"Invalid request to Perplexity Crawl API: {response.text}"
            else:
                error_message = f"Perplexity Crawl API error ({response.status_code}): {response.text}"

            raise HTTPException(status_code=500, detail=error_message)

        responseJson = response.json()
        content = responseJson["choices"][0]["message"]["content"]

        return AiModelResponse(
            content=content,
            success=True,
            modelId=model.name,
            metadata={"response_id": responseJson.get("id", "")}
        )

    except Exception as e:
        logger.error(f"Error calling Perplexity Crawl API: {str(e)}")
        raise HTTPException(status_code=500, detail=f"Error calling Perplexity Crawl API: {str(e)}")

async def callWebOperation(self, modelCall: AiModelCall) -> AiModelResponse:
    """
    Universal web operation handler that distributes to the correct method
    based on the operationType from AiCallOptions.
    """
    try:
        options = modelCall.options
        operationType = options.get("operationType")

        if operationType == "WEB_SEARCH":
            return await self.callAiWithWebSearch(modelCall)
        elif operationType == "WEB_CRAWL":
            return await self.crawl(modelCall)
        elif operationType == "WEB_RESEARCH":
            return await self.research(modelCall)
        elif operationType == "WEB_QUESTIONS":
            return await self.questions(modelCall)
        elif operationType == "WEB_NEWS":
            return await self.news(modelCall)
        else:
            # Fallback to research for unknown operation types
            return await self.research(modelCall)

    except Exception as e:
        return AiModelResponse(
            content="",
            success=False,
            error=str(e)
        )
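A minimal sketch of how a selector layer might drive this dispatcher. Only the operationType key and the message shape come from the code above; the AiModelCall constructor arguments and the perplexityModel/connector names are assumptions for illustration.

    import json

    # Hypothetical call site: route a news request through the universal handler.
    newsPrompt = json.dumps({"newsPrompt": "semiconductor supply chain", "maxResults": 10})
    modelCall = AiModelCall(
        model=perplexityModel,                     # an AiModel from getModels()
        messages=[{"role": "user", "content": newsPrompt}],
        options={"operationType": "WEB_NEWS"},
    )
    response = await connector.callWebOperation(modelCall)   # dispatches to news()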
async def research(self, modelCall: AiModelCall) -> AiModelResponse:
    """
    Research topics using Perplexity's web search capabilities.

    Args:
        modelCall: AiModelCall with messages and options

    Returns:
        AiModelResponse with research content and metadata
    """
    try:
        # Extract parameters from modelCall
        messages = modelCall.messages
        model = modelCall.model
        options = modelCall.options
        temperature = options.get("temperature", model.temperature)
        maxTokens = model.maxTokens

        # Parse unified prompt JSON format
        promptContent = messages[0]["content"] if messages else ""
        import json
        promptData = json.loads(promptContent)

        # Extract parameters from unified prompt JSON
        researchPrompt = promptData.get("researchPrompt", promptContent)
        maxResults = promptData.get("maxResults", 8)
        timeRange = promptData.get("timeRange")
        country = promptData.get("country")
        language = promptData.get("language")

        # Create enhanced prompt for research
        enhancedPrompt = f"""Conduct comprehensive research on: {researchPrompt}

Research requirements:
- Provide detailed analysis and insights
- Include multiple perspectives and sources
- Focus on finding {maxResults} most relevant sources
{f"Limit results to the last {timeRange}" if timeRange else ""}
{f"Focus on {country}" if country else ""}
{f"Provide results in {language}" if language else ""}

Please format your response as a JSON object with the following structure:
{{
    "query": "{researchPrompt}",
    "research_results": [
        {{
            "title": "Source title",
            "url": "https://example.com",
            "summary": "Brief summary",
            "content": "Detailed content",
            "extractedAt": "2024-01-01T00:00:00Z"
        }}
    ],
    "total_count": number_of_sources,
    "operation_type": "research"
}}

Provide comprehensive research with detailed analysis."""

        # Update the messages with the enhanced prompt
        enhancedMessages = [{"role": "user", "content": enhancedPrompt}]

        payload = {
            "model": model.name,
            "messages": enhancedMessages,
            "temperature": temperature,
            "max_tokens": maxTokens
        }

        response = await self.httpClient.post(
            model.apiUrl,
            json=payload
        )

        if response.status_code != 200:
            error_detail = f"Perplexity Research API error: {response.status_code} - {response.text}"
            logger.error(error_detail)
            raise HTTPException(status_code=500, detail=error_detail)

        responseJson = response.json()
        content = responseJson["choices"][0]["message"]["content"]

        return AiModelResponse(
            content=content,
            success=True,
            modelId=model.name,
            metadata={"response_id": responseJson.get("id", "")}
        )

    except Exception as e:
        logger.error(f"Error calling Perplexity Research API: {str(e)}")
        raise HTTPException(status_code=500, detail=f"Error calling Perplexity Research API: {str(e)}")

async def questions(self, modelCall: AiModelCall) -> AiModelResponse:
    """
    Answer questions using Perplexity's web search capabilities.

    Args:
        modelCall: AiModelCall with messages and options

    Returns:
        AiModelResponse with answer and supporting sources
    """
    try:
        # Extract parameters from modelCall
        messages = modelCall.messages
        model = modelCall.model
        options = modelCall.options
        temperature = options.get("temperature", model.temperature)
        maxTokens = model.maxTokens

        # Parse unified prompt JSON format
        promptContent = messages[0]["content"] if messages else ""
        import json
        promptData = json.loads(promptContent)

        # Extract parameters from unified prompt JSON
        question = promptData.get("question", promptContent)
        context = promptData.get("context", "")
        maxResults = promptData.get("maxResults", 6)
        timeRange = promptData.get("timeRange")
        country = promptData.get("country")
        language = promptData.get("language")

        # Create enhanced prompt for questions
        contextText = f"\nAdditional context: {context}" if context else ""
        enhancedPrompt = f"""Answer this question using web research: {question}{contextText}

Answer requirements:
- Provide a comprehensive answer with supporting evidence
- Include {maxResults} most relevant sources
- Cite sources with URLs
{f"Focus on recent information (last {timeRange})" if timeRange else ""}
{f"Focus on {country}" if country else ""}
{f"Provide answer in {language}" if language else ""}

Please format your response as a JSON object with the following structure:
{{
    "question": "{question}",
    "answer": "Comprehensive answer to the question",
    "answer_sources": [
        {{
            "title": "Source title",
            "url": "https://example.com",
            "summary": "Brief summary",
            "content": "Relevant content excerpt",
            "relevance": "Why this source is relevant"
        }}
    ],
    "total_count": number_of_sources,
    "operation_type": "questions"
}}

Provide a detailed answer with well-cited sources."""

        # Update the messages with the enhanced prompt
        enhancedMessages = [{"role": "user", "content": enhancedPrompt}]

        payload = {
            "model": model.name,
            "messages": enhancedMessages,
            "temperature": temperature,
            "max_tokens": maxTokens
        }

        response = await self.httpClient.post(
            model.apiUrl,
            json=payload
        )

        if response.status_code != 200:
            error_detail = f"Perplexity Questions API error: {response.status_code} - {response.text}"
            logger.error(error_detail)
            raise HTTPException(status_code=500, detail=error_detail)

        responseJson = response.json()
        content = responseJson["choices"][0]["message"]["content"]

        return AiModelResponse(
            content=content,
            success=True,
            modelId=model.name,
            metadata={"response_id": responseJson.get("id", "")}
        )

    except Exception as e:
        logger.error(f"Error calling Perplexity Questions API: {str(e)}")
        raise HTTPException(status_code=500, detail=f"Error calling Perplexity Questions API: {str(e)}")

async def news(self, modelCall: AiModelCall) -> AiModelResponse:
    """
    Search and analyze news using Perplexity's web search capabilities.

    Args:
        modelCall: AiModelCall with messages and options

    Returns:
        AiModelResponse with news articles and analysis
    """
    try:
        # Extract parameters from modelCall
        messages = modelCall.messages
        model = modelCall.model
        options = modelCall.options
        temperature = options.get("temperature", model.temperature)
        maxTokens = model.maxTokens

        # Parse unified prompt JSON format
        promptContent = messages[0]["content"] if messages else ""
        import json
        promptData = json.loads(promptContent)

        # Extract parameters from unified prompt JSON
        newsPrompt = promptData.get("newsPrompt", promptContent)
        maxResults = promptData.get("maxResults", 10)
        timeRange = promptData.get("timeRange", "w")  # Default to week for news
        country = promptData.get("country")
        language = promptData.get("language")

        # Create enhanced prompt for news
        enhancedPrompt = f"""Find and analyze recent news about: {newsPrompt}

News requirements:
- Find {maxResults} most recent and relevant news articles
- Focus on the last {timeRange} (recent news)
- Include diverse sources and perspectives
{f"Focus on news from {country}" if country else ""}
{f"Provide news in {language}" if language else ""}

Please format your response as a JSON object with the following structure:
{{
    "news_query": "{newsPrompt}",
    "articles": [
        {{
            "title": "Article title",
            "url": "https://example.com",
            "content": "Article content",
            "date": "2024-01-01",
            "source": "News source name",
            "summary": "Brief summary of the article"
        }}
    ],
    "total_count": number_of_articles,
    "operation_type": "news"
}}

Provide comprehensive news coverage with analysis."""

        # Update the messages with the enhanced prompt
        enhancedMessages = [{"role": "user", "content": enhancedPrompt}]

        payload = {
            "model": model.name,
            "messages": enhancedMessages,
            "temperature": temperature,
            "max_tokens": maxTokens
        }

        response = await self.httpClient.post(
            model.apiUrl,
            json=payload
        )

        if response.status_code != 200:
            error_detail = f"Perplexity News API error: {response.status_code} - {response.text}"
            logger.error(error_detail)
            raise HTTPException(status_code=500, detail=error_detail)

        responseJson = response.json()
        content = responseJson["choices"][0]["message"]["content"]

        return AiModelResponse(
            content=content,
            success=True,
            modelId=model.name,
            metadata={"response_id": responseJson.get("id", "")}
        )

    except Exception as e:
        logger.error(f"Error calling Perplexity News API: {str(e)}")
        raise HTTPException(status_code=500, detail=f"Error calling Perplexity News API: {str(e)}")

async def _testConnection(self) -> bool:
    """
    Tests the connection to the Perplexity API.
@@ -5,7 +5,7 @@ import logging
import asyncio
import re
from dataclasses import dataclass
-from typing import Optional, List
+from typing import Optional, List, Dict
from tavily import AsyncTavilyClient
from modules.shared.configuration import APP_CONFIG
from modules.aicore.aicoreBase import BaseConnectorAi
@@ -88,6 +88,251 @@ class ConnectorWeb(BaseConnectorAi):
        return unique_urls

def _intelligentUrlFiltering(self, searchResults: List[WebSearchResult], query: str, maxResults: int) -> List[WebSearchResult]:
    """
    Intelligent URL filtering with de-duplication and relevance scoring.

    Args:
        searchResults: Raw search results from Tavily
        query: Original search query for relevance scoring
        maxResults: Maximum number of results to return

    Returns:
        Filtered and deduplicated list of search results
    """
    if not searchResults:
        return []

    # Step 1: Basic de-duplication by URL
    seenUrls = set()
    uniqueResults = []

    for result in searchResults:
        # Normalize URL for better deduplication
        normalizedUrl = self._normalizeUrl(result.url)
        if normalizedUrl not in seenUrls:
            seenUrls.add(normalizedUrl)
            uniqueResults.append(result)

    logger.info(f"After basic deduplication: {len(uniqueResults)} unique URLs from {len(searchResults)} original")

    # Step 2: Relevance scoring and filtering
    scoredResults = []
    queryWords = set(query.lower().split())

    for result in uniqueResults:
        score = self._calculateRelevanceScore(result, queryWords)
        scoredResults.append((score, result))

    # Step 3: Sort by relevance score (higher is better)
    scoredResults.sort(key=lambda x: x[0], reverse=True)

    # Step 4: Take top results
    filteredResults = [result for score, result in scoredResults[:maxResults]]

    logger.info(f"After intelligent filtering: {len(filteredResults)} results selected from {len(uniqueResults)} unique")

    return filteredResults
def _normalizeUrl(self, url: str) -> str:
    """
    Normalize URL for better deduplication.
    Removes common variations that represent the same content.
    """
    if not url:
        return url

    # Remove trailing slashes
    url = url.rstrip('/')

    # Remove common query parameters that don't affect content
    import urllib.parse
    parsed = urllib.parse.urlparse(url)

    # Remove common tracking parameters
    queryParams = urllib.parse.parse_qs(parsed.query)
    filteredParams = {}

    for key, values in queryParams.items():
        # Keep important parameters, remove tracking ones
        if key.lower() not in ['utm_source', 'utm_medium', 'utm_campaign', 'utm_term', 'utm_content',
                               'fbclid', 'gclid', 'ref', 'source', 'campaign']:
            filteredParams[key] = values

    # Rebuild query string
    filteredQuery = urllib.parse.urlencode(filteredParams, doseq=True)

    # Reconstruct URL
    normalized = urllib.parse.urlunparse((
        parsed.scheme,
        parsed.netloc,
        parsed.path,
        parsed.params,
        filteredQuery,
        parsed.fragment
    ))

    return normalized
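To make the effect of _normalizeUrl concrete, a small standalone sketch of the same tracking-parameter stripping using only urllib.parse from the standard library; the example URL is invented.

    import urllib.parse

    url = "https://example.com/article?utm_source=newsletter&page=2"
    parsed = urllib.parse.urlparse(url.rstrip('/'))
    # Keep everything except the tracking keys listed in _normalizeUrl
    params = {k: v for k, v in urllib.parse.parse_qs(parsed.query).items()
              if k.lower() not in {'utm_source', 'utm_medium', 'utm_campaign', 'utm_term',
                                   'utm_content', 'fbclid', 'gclid', 'ref', 'source', 'campaign'}}
    print(urllib.parse.urlunparse(parsed._replace(query=urllib.parse.urlencode(params, doseq=True))))
    # prints: https://example.com/article?page=2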
def _calculateRelevanceScore(self, result: WebSearchResult, queryWords: set) -> float:
    """
    Calculate relevance score for a search result.
    Higher score means more relevant to the query.
    """
    score = 0.0

    # Title relevance (most important)
    titleWords = set(result.title.lower().split())
    titleMatches = len(queryWords.intersection(titleWords))
    score += titleMatches * 3.0  # Weight title matches heavily

    # URL relevance
    urlWords = set(result.url.lower().split('/'))
    urlMatches = len(queryWords.intersection(urlWords))
    score += urlMatches * 1.5

    # Content relevance (if available)
    if hasattr(result, 'raw_content') and result.raw_content:
        contentWords = set(result.raw_content.lower().split())
        contentMatches = len(queryWords.intersection(contentWords))
        score += contentMatches * 0.1  # Lower weight for content matches

    # Domain authority bonus (simple heuristic)
    domain = result.url.split('/')[2] if '/' in result.url else result.url
    if any(auth_domain in domain.lower() for auth_domain in
           ['wikipedia.org', 'github.com', 'stackoverflow.com', 'reddit.com', 'medium.com']):
        score += 1.0

    # Penalty for very long URLs (often less relevant)
    if len(result.url) > 100:
        score -= 0.5

    return score
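As a worked example of the weights above (assuming a WebSearchResult with only a title and URL): for the query "python asyncio tutorial" and a result titled "Asyncio Tutorial" at https://docs.python.org/3/library/asyncio.html with no raw_content, the title contributes 2 matches x 3.0 = 6.0; splitting the URL on "/" yields no exact token match ("asyncio.html" is not equal to "asyncio"), adding 0; docs.python.org is not in the authority list, so no bonus; the URL is under 100 characters, so no penalty. Total score: 6.0.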
async def _optimizeSearchQuery(self, query: str, timeRange: str = None, country: str = None, language: str = None) -> tuple[str, dict]:
    """
    Use AI to optimize search query and parameters (from old SubWebResearch).

    Args:
        query: Original search query
        timeRange: Time range filter
        country: Country filter
        language: Language filter

    Returns:
        Tuple of (optimized_query, optimized_parameters)
    """
    try:
        # Create AI prompt for query optimization (from old code)
        queryOptimizerPrompt = f"""You are a search query optimizer.

USER QUERY: {query}

Your task: Create a search query and parameters for the USER QUERY given.

RULES:
1. The search query MUST be related to the user query above
2. Extract key terms from the user query
3. Determine appropriate country/language based on the query context
4. Keep search query short (2-6 words)

Return ONLY this JSON format:
{{
    "user_prompt": "search query based on user query above",
    "country": "Full English country name (ISO-3166; map codes via pycountry/i18n-iso-countries)",
    "language": "language_code_or_null",
    "topic": "general|news|academic_or_null",
    "time_range": "d|w|m|y_or_null",
    "selection_strategy": "single|multiple|specific_page",
    "selection_criteria": "what URLs to prioritize",
    "expected_url_patterns": ["pattern1", "pattern2"],
    "estimated_result_count": number
}}"""

        # Use AI to optimize the query
        from modules.datamodels.datamodelAi import AiCallRequest, AiCallOptions
        aiRequest = AiCallRequest(
            prompt=queryOptimizerPrompt,
            options=AiCallOptions()
        )

        # Get AI response (this would need to be called through the AI interface)
        # For now, return the original query with basic optimization
        logger.info(f"AI query optimization requested for: '{query}'")

        # Basic optimization fallback
        optimizedQuery = query
        optimizedParams = {
            "time_range": timeRange,
            "country": country,
            "language": language,
            "topic": "general"
        }

        return optimizedQuery, optimizedParams

    except Exception as e:
        logger.warning(f"Query optimization failed: {str(e)}, using original query")
        return query, {"time_range": timeRange, "country": country, "language": language}
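A possible completion of the optimization step, purely as a sketch: callAi stands in for the project's AI interface, which is not part of this diff, and the parsing simply consumes the JSON the prompt requests, falling back exactly as the current code does.

    import json

    # Hypothetical continuation inside _optimizeSearchQuery; `callAi` is a placeholder.
    rawReply = await callAi(aiRequest)
    try:
        optimizerJson = json.loads(rawReply)
        optimizedQuery = optimizerJson.get("user_prompt", query)
        optimizedParams = {
            "time_range": optimizerJson.get("time_range") or timeRange,
            "country": optimizerJson.get("country") or country,
            "language": optimizerJson.get("language") or language,
            "topic": optimizerJson.get("topic") or "general",
        }
    except (json.JSONDecodeError, TypeError):
        # Fall back to the untouched query, as the current implementation does
        optimizedQuery, optimizedParams = query, {"time_range": timeRange, "country": country, "language": language}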
async def _aiBasedUrlSelection(self, searchResults: List[WebSearchResult], originalQuery: str, maxResults: int) -> List[WebSearchResult]:
    """
    Use AI to select the most relevant URLs from search results (from old SubWebResearch).

    Args:
        searchResults: Raw search results from Tavily
        originalQuery: Original user query for context
        maxResults: Maximum number of results to return

    Returns:
        AI-selected and filtered list of search results
    """
    try:
        if not searchResults:
            return []

        # Step 1: Basic de-duplication
        seenUrls = set()
        uniqueResults = []

        for result in searchResults:
            normalizedUrl = self._normalizeUrl(result.url)
            if normalizedUrl not in seenUrls:
                seenUrls.add(normalizedUrl)
                uniqueResults.append(result)

        logger.info(f"After basic deduplication: {len(uniqueResults)} unique URLs from {len(searchResults)} original")

        if len(uniqueResults) <= maxResults:
            return uniqueResults

        # Step 2: AI-based URL selection (from old code)
        logger.info(f"AI selecting most relevant {maxResults} URLs from {len(uniqueResults)} unique results")

        # Create AI prompt for URL selection (from old code)
        urlList = "\n".join([f"{i+1}. {result.url}" for i, result in enumerate(uniqueResults)])
        aiPrompt = f"""Select the most relevant URLs from these search results:

{urlList}

Return only the URLs that are most relevant for the user's query: "{originalQuery}"
One URL per line.
"""

        # For now, use intelligent filtering as fallback
        # In a full implementation, this would call the AI interface
        logger.info("Using intelligent filtering as AI selection fallback")

        # Use the existing intelligent filtering
        filteredResults = self._intelligentUrlFiltering(uniqueResults, originalQuery, maxResults)

        logger.info(f"AI-based selection completed: {len(filteredResults)} results selected")
        return filteredResults

    except Exception as e:
        logger.warning(f"AI-based URL selection failed: {str(e)}, using intelligent filtering")
        return self._intelligentUrlFiltering(searchResults, originalQuery, maxResults)

def getModels(self) -> List[AiModel]:
    """Get all available Tavily models."""
    return [
@@ -104,7 +349,7 @@ class ConnectorWeb(BaseConnectorAi):
    speedRating=9, # Very fast for URL discovery
    qualityRating=9, # Excellent URL discovery quality
    # capabilities removed (not used in business logic)
-   functionCall=self.search,
+   functionCall=self.callWebOperation,
    priority=PriorityEnum.BALANCED,
    processingMode=ProcessingModeEnum.BASIC,
    operationTypes=createOperationTypeRatings(

@@ -130,12 +375,12 @@ class ConnectorWeb(BaseConnectorAi):
    speedRating=7, # Good for content extraction
    qualityRating=9, # Excellent content extraction quality
    # capabilities removed (not used in business logic)
-   functionCall=self.crawl,
+   functionCall=self.callWebOperation,
    priority=PriorityEnum.BALANCED,
    processingMode=ProcessingModeEnum.BASIC,
    operationTypes=createOperationTypeRatings(
-       (OperationTypeEnum.WEB_RESEARCH, 3),
        (OperationTypeEnum.WEB_CRAWL, 10),
+       (OperationTypeEnum.WEB_RESEARCH, 3),
        (OperationTypeEnum.WEB_NEWS, 3),
        (OperationTypeEnum.WEB_QUESTIONS, 2)
    ),

@@ -155,7 +400,7 @@ class ConnectorWeb(BaseConnectorAi):
    speedRating=7, # Good for combined search+extract
    qualityRating=8, # Good quality for structured data
    # capabilities removed (not used in business logic)
-   functionCall=self.scrape,
+   functionCall=self.callWebOperation,
    priority=PriorityEnum.BALANCED,
    processingMode=ProcessingModeEnum.BASIC,
    operationTypes=createOperationTypeRatings(
@@ -190,28 +435,73 @@ class ConnectorWeb(BaseConnectorAi):
    # Standardized method using AiModelCall/AiModelResponse pattern

+   async def callWebOperation(self, modelCall) -> "AiModelResponse":
+       """
+       Universal web operation handler that distributes to the correct method
+       based on the operationType from AiCallOptions.
+       """
+       try:
+           options = modelCall.options
+           operationType = options.get("operationType")
+
+           if operationType == "WEB_SEARCH":
+               return await self.search(modelCall)
+           elif operationType == "WEB_CRAWL":
+               return await self.crawl(modelCall)
+           elif operationType in ["WEB_RESEARCH", "WEB_QUESTIONS", "WEB_NEWS"]:
+               return await self.research(modelCall)
+           else:
+               # Fallback to search for unknown operation types
+               return await self.search(modelCall)
+
+       except Exception as e:
+           return AiModelResponse(
+               content="",
+               success=False,
+               error=str(e)
+           )
+
    async def search(self, modelCall) -> "AiModelResponse":
        """Search using standardized AiModelCall/AiModelResponse pattern"""
        try:
            # Extract parameters from modelCall
-           query = modelCall.messages[0]["content"] if modelCall.messages else ""
+           prompt_content = modelCall.messages[0]["content"] if modelCall.messages else ""
            options = modelCall.options

-           raw_results = await self._search(
-               query=query,
-               max_results=options.get("max_results", 5),
-               search_depth=options.get("search_depth"),
-               time_range=options.get("time_range"),
-               topic=options.get("topic"),
-               include_domains=options.get("include_domains"),
-               exclude_domains=options.get("exclude_domains"),
-               language=options.get("language"),
-               include_answer=options.get("include_answer"),
-               include_raw_content=options.get("include_raw_content"),
+           # Parse unified prompt JSON format
+           import json
+           promptData = json.loads(prompt_content)
+
+           # Extract parameters from unified prompt JSON
+           query = promptData.get("searchPrompt", prompt_content)
+           maxResults = promptData.get("maxResults", 5)
+           timeRange = promptData.get("timeRange")
+           country = promptData.get("country")
+           language = promptData.get("language")
+
+           # Use basic search depth for web search operations
+           searchDepth = "basic"
+
+           # Step 1: AI Query Optimization (from old SubWebResearch)
+           optimizedQuery, optimizedParams = await self._optimizeSearchQuery(query, timeRange, country, language)
+
+           # Step 2: Get more results than requested to allow for intelligent filtering
+           searchResults = await self._search(
+               query=optimizedQuery,
+               max_results=min(maxResults * 3, 30),  # Get more results for better AI selection
+               search_depth=searchDepth,
+               time_range=optimizedParams.get("time_range", timeRange),
+               country=optimizedParams.get("country", country),
+               language=optimizedParams.get("language", language),
+               include_answer=options.get("include_answer", True),
+               include_raw_content=options.get("include_raw_content", True),
            )
+
+           # Step 3: AI-based URL selection and intelligent filtering
+           filteredResults = await self._aiBasedUrlSelection(searchResults, query, maxResults)

            # Convert to JSON string
-           results_json = {
+           resultsJson = {
                "query": query,
                "results": [
                    {
@@ -219,20 +509,22 @@ class ConnectorWeb(BaseConnectorAi):
                        "url": result.url,
                        "content": getattr(result, 'raw_content', None)
                    }
-                   for result in raw_results
+                   for result in filteredResults
                ],
-               "total_count": len(raw_results)
+               "total_count": len(filteredResults),
+               "original_count": len(searchResults),
+               "filtered_count": len(searchResults) - len(filteredResults)
            }

            import json
-           content = json.dumps(results_json, indent=2)
+           content = json.dumps(resultsJson, indent=2)

            return AiModelResponse(
                content=content,
                success=True,
                metadata={
-                   "total_count": len(raw_results),
-                   "search_depth": options.get("search_depth", "basic")
+                   "total_count": len(filteredResults),
+                   "search_depth": searchDepth
                }
            )
@@ -247,49 +539,214 @@ class ConnectorWeb(BaseConnectorAi):
        """Crawl using standardized AiModelCall/AiModelResponse pattern"""
        try:
            # Extract parameters from modelCall
-           options = modelCall.options
-           urls = options.get("urls", [])
-
-           # If no URLs provided, try to extract URLs from the prompt
-           if not urls and modelCall.messages:
-               prompt = modelCall.messages[0]["content"] if modelCall.messages else ""
-               urls = self._extractUrlsFromPrompt(prompt)
-
-           if not urls:
-               return AiModelResponse(
-                   content="No URLs provided for crawling",
-                   success=False,
-                   error="No URLs found in options or prompt"
-               )
-
-           raw_results = await self._crawl(
-               urls,
-               extract_depth=options.get("extract_depth"),
-               format=options.get("format"),
-           )
-
-           # Convert to JSON string
-           results_json = {
-               "urls": urls,
-               "results": [
-                   {
-                       "url": result.url,
-                       "content": result.content
-                   }
-                   for result in raw_results
-               ],
-               "total_count": len(raw_results)
-           }
-
-           import json
-           content = json.dumps(results_json, indent=2)
-
-           return AiModelResponse(
-               content=content,
-               success=True,
-               metadata={
-                   "total_count": len(raw_results),
-                   "extract_depth": options.get("extract_depth", "basic")
+           promptContent = modelCall.messages[0]["content"] if modelCall.messages else ""
+           options = modelCall.options
+
+           # Parse unified prompt JSON format
+           import json
+           promptData = json.loads(promptContent)
+
+           # Extract parameters from unified prompt JSON
+           urls = promptData.get("urls", [])
+           extractDepth = promptData.get("extractDepth", "advanced")
+           formatType = promptData.get("format", "markdown")
+
+           if not urls:
+               return AiModelResponse(
+                   content="No URLs provided for crawling",
+                   success=False,
+                   error="No URLs found in prompt data"
+               )
+
+           rawResults = await self._crawl(
+               urls,
+               extract_depth=extractDepth,
+               format=formatType,
+           )
+
+           # Convert to JSON string
+           resultsJson = {
+               "urls": urls,
+               "results": [
+                   {
+                       "url": result.url,
+                       "title": getattr(result, 'title', ''),
+                       "content": result.content,
+                       "extractedAt": getattr(result, 'extracted_at', '')
+                   }
+                   for result in rawResults
+               ],
+               "total_count": len(rawResults)
+           }
+
+           import json
+           content = json.dumps(resultsJson, indent=2)
+
+           return AiModelResponse(
+               content=content,
+               success=True,
+               metadata={
+                   "total_count": len(rawResults),
+                   "urls_processed": len(urls)
+               }
+           )
+
+       except Exception as e:
+           return AiModelResponse(
+               content="",
+               success=False,
+               error=str(e)
+           )
+
+   async def research(self, modelCall) -> "AiModelResponse":
+       """
+       Handle WEB_RESEARCH, WEB_QUESTIONS, WEB_NEWS operations using search + crawl combination.
+       Single method for all three operation types with different standard settings.
+       """
+       try:
+           # Extract parameters from modelCall
+           promptContent = modelCall.messages[0]["content"] if modelCall.messages else ""
+           options = modelCall.options
+           operationType = options.get("operationType")
+
+           # Parse unified prompt JSON format
+           import json
+           promptData = json.loads(promptContent)
+
+           # Extract parameters based on operation type
+           if operationType == "WEB_RESEARCH":
+               query = promptData.get("researchPrompt", promptContent)
+               maxResults = promptData.get("maxResults", 8)
+               searchDepth = "basic"
+               timeRange = promptData.get("timeRange")
+               country = promptData.get("country")
+               language = promptData.get("language")
+               topic = "general"
+
+           elif operationType == "WEB_QUESTIONS":
+               query = promptData.get("question", promptContent)
+               maxResults = promptData.get("maxResults", 6)
+               searchDepth = "basic"
+               timeRange = promptData.get("timeRange")
+               country = promptData.get("country")
+               language = promptData.get("language")
+               topic = "general"
+
+           elif operationType == "WEB_NEWS":
+               query = promptData.get("newsPrompt", promptContent)
+               maxResults = promptData.get("maxResults", 10)
+               searchDepth = "basic"
+               timeRange = promptData.get("timeRange", "w")  # Default to week for news
+               country = promptData.get("country")
+               language = promptData.get("language")
+               topic = "news"
+
+           else:
+               # Fallback to research settings
+               query = promptData.get("researchPrompt", promptContent)
+               maxResults = promptData.get("maxResults", 5)
+               searchDepth = "basic"
+               timeRange = promptData.get("timeRange")
+               country = promptData.get("country")
+               language = promptData.get("language")
+               topic = "general"
+
+           logger.info(f"Tavily {operationType} operation: query='{query}', maxResults={maxResults}, topic={topic}")
+
+           # Step 1: Search for relevant URLs
+           searchResults = await self._search(
+               query=query,
+               max_results=maxResults * 2,  # Get more for better selection
+               search_depth=searchDepth,
+               time_range=timeRange,
+               country=country,
+               language=language,
+               topic=topic,
+               include_answer=True,
+               include_raw_content=True
+           )
+
+           if not searchResults:
+               return AiModelResponse(
+                   content="No search results found",
+                   success=False,
+                   error="No relevant URLs found for the query"
+               )
+
+           # Step 2: AI-based URL selection
+           selectedResults = await self._aiBasedUrlSelection(searchResults, query, maxResults)
+
+           if not selectedResults:
+               return AiModelResponse(
+                   content="No relevant URLs selected",
+                   success=False,
+                   error="AI could not select relevant URLs"
+               )
+
+           # Step 3: Crawl selected URLs for content
+           urlsToCrawl = [result.url for result in selectedResults]
+           crawlResults = await self._crawl(
+               urls=urlsToCrawl,
+               extract_depth="advanced",
+               format="markdown"
+           )
+
+           # Step 4: Combine search and crawl results
+           combinedResults = []
+           for searchResult in selectedResults:
+               # Find corresponding crawl result
+               crawlResult = next((cr for cr in crawlResults if cr.url == searchResult.url), None)
+
+               combinedResult = {
+                   "title": searchResult.title,
+                   "url": searchResult.url,
+                   "summary": getattr(searchResult, 'raw_content', ''),
+                   "content": crawlResult.content if crawlResult else '',
+                   "extractedAt": getattr(crawlResult, 'extracted_at', '') if crawlResult else ''
+               }
+               combinedResults.append(combinedResult)
+
+           # Step 5: Format response based on operation type
+           if operationType == "WEB_RESEARCH":
+               responseData = {
+                   "query": query,
+                   "research_results": combinedResults,
+                   "total_count": len(combinedResults),
+                   "operation_type": "research"
+               }
+           elif operationType == "WEB_QUESTIONS":
+               responseData = {
+                   "question": query,
+                   "answer_sources": combinedResults,
+                   "total_count": len(combinedResults),
+                   "operation_type": "questions"
+               }
+           elif operationType == "WEB_NEWS":
+               responseData = {
+                   "news_query": query,
+                   "articles": combinedResults,
+                   "total_count": len(combinedResults),
+                   "operation_type": "news"
+               }
+           else:
+               responseData = {
+                   "query": query,
+                   "results": combinedResults,
+                   "total_count": len(combinedResults),
+                   "operation_type": operationType
+               }
+
+           import json
+           content = json.dumps(responseData, indent=2)
+
+           return AiModelResponse(
+               content=content,
+               success=True,
+               metadata={
+                   "total_count": len(combinedResults),
+                   "urls_searched": len(searchResults),
+                   "urls_crawled": len(crawlResults),
+                   "operation_type": operationType
                }
            )
@ -576,3 +1033,262 @@ class ConnectorWeb(BaseConnectorAi):
|
||||||
await asyncio.sleep(retryDelay)
|
await asyncio.sleep(retryDelay)
|
||||||
else:
|
else:
|
||||||
raise Exception(f"Crawl failed after {maxRetries + 1} attempts: {str(e)}")
|
raise Exception(f"Crawl failed after {maxRetries + 1} attempts: {str(e)}")
|
||||||
|
|
||||||
|
async def comprehensiveWebResearch(self, request: WebResearchRequest) -> WebResearchResult:
|
||||||
|
"""
|
||||||
|
Perform comprehensive web research using Tavily's search and extract capabilities.
|
||||||
|
This method orchestrates the full web research workflow.
|
||||||
|
"""
|
||||||
|
try:
|
||||||
|
logger.info(f"COMPREHENSIVE WEB RESEARCH STARTED")
|
||||||
|
logger.info(f"User Query: {request.user_prompt}")
|
||||||
|
logger.info(f"Max Results: {request.max_results}, Max Pages: {request.max_pages}")
|
||||||
|
|
||||||
|
# Global URL index to track all processed URLs across the entire research session
|
||||||
|
global_processed_urls = set()
|
||||||
|
|
||||||
|
# Step 1: Find relevant websites - either provided URLs or AI-determined main URLs
|
||||||
|
logger.info(f"=== STEP 1: INITIAL MAIN URLS LIST ===")
|
||||||
|
|
||||||
|
if request.urls:
|
||||||
|
# Use provided URLs as initial main URLs
|
||||||
|
websites = request.urls
|
||||||
|
logger.info(f"Using provided URLs ({len(websites)}):")
|
||||||
|
for i, url in enumerate(websites, 1):
|
||||||
|
logger.info(f" {i}. {url}")
|
||||||
|
else:
|
||||||
|
# Use AI to determine main URLs based on user's intention
|
||||||
|
logger.info(f"AI analyzing user intent: '{request.user_prompt}'")
|
||||||
|
|
||||||
|
# Use basic search parameters
|
||||||
|
search_query = request.user_prompt
|
||||||
|
search_depth = request.search_depth or "basic"
|
||||||
|
time_range = request.time_range
|
||||||
|
topic = request.topic
|
||||||
|
country = request.country
|
||||||
|
language = request.language
|
||||||
|
max_results = request.max_results
|
||||||
|
|
||||||
|
logger.info(f"Using search parameters: query='{search_query}', depth={search_depth}, time_range={time_range}, topic={topic}")
|
||||||
|
|
||||||
|
# Perform web search
|
||||||
|
search_results = await self._search(
|
||||||
|
query=search_query,
|
||||||
|
max_results=max_results,
|
||||||
|
search_depth=search_depth,
|
||||||
|
time_range=time_range,
|
||||||
|
topic=topic,
|
||||||
|
country=country,
|
||||||
|
language=language,
|
||||||
|
include_answer=True,
|
||||||
|
include_raw_content=True
|
||||||
|
)
|
||||||
|
|
||||||
|
# Extract URLs from search results
|
||||||
|
websites = [result.url for result in search_results]
|
||||||
|
logger.info(f"Found {len(websites)} URLs from search")
|
||||||
|
|
||||||
|
# AI-based URL selection and deduplication
|
||||||
|
if len(websites) > request.max_pages:
|
||||||
|
logger.info(f"AI selecting most relevant {request.max_pages} URLs from {len(websites)} found")
|
||||||
|
|
||||||
|
# For now, just take the first max_pages URLs
|
||||||
|
selected_indices = list(range(min(request.max_pages, len(websites))))
|
||||||
|
selected_websites = [websites[i] for i in selected_indices]
|
||||||
|
|
||||||
|
# Remove duplicates while preserving order
|
||||||
|
seen = set()
|
||||||
|
unique_websites = []
|
||||||
|
for url in selected_websites:
|
||||||
|
if url not in seen:
|
||||||
|
seen.add(url)
|
||||||
|
unique_websites.append(url)
|
||||||
|
|
||||||
|
websites = unique_websites
|
||||||
|
|
||||||
|
logger.info(f"After AI selection deduplication: {len(websites)} unique URLs")
|
||||||
|
logger.info(f"AI selected {len(websites)} main URLs (after deduplication):")
|
||||||
|
for i, url in enumerate(websites, 1):
|
||||||
|
logger.info(f" {i}. {url}")
|
||||||
|
|
||||||
|
# Step 2: Smart website selection using AI interface
|
||||||
|
logger.info(f"=== STEP 2: FILTERED URL LIST BY USER PROMPT'S INTENTION ===")
|
||||||
|
logger.info(f"AI analyzing {len(websites)} URLs for relevance to: '{request.user_prompt}'")
|
||||||
|
|
||||||
|
# For now, just use all websites
|
||||||
|
selected_websites = websites
|
||||||
|
|
||||||
|
logger.debug(f"AI selected {len(selected_websites)} most relevant URLs:")
|
||||||
|
for i, url in enumerate(selected_websites, 1):
|
||||||
|
logger.debug(f" {i}. {url}")
|
||||||
|
|
||||||
|
# Step 3+4+5: Recursive crawling with configurable depth
|
||||||
|
# Get configuration parameters
|
||||||
|
max_depth = int(APP_CONFIG.get("Web_Research_MAX_DEPTH", "2"))
|
||||||
|
max_links_per_domain = int(APP_CONFIG.get("Web_Research_MAX_LINKS_PER_DOMAIN", "4"))
|
||||||
|
crawl_timeout_minutes = int(APP_CONFIG.get("Web_Research_CRAWL_TIMEOUT_MINUTES", "10"))
|
||||||
|
|
||||||
|
# Use the configured max_depth or the request's search_depth, whichever is smaller
|
||||||
|
effective_depth = min(max_depth, request.search_depth if isinstance(request.search_depth, int) else 2)
|
||||||
|
|
||||||
|
logger.info(f"=== STEP 3+4+5: RECURSIVE CRAWLING ===")
|
||||||
|
logger.info(f"Starting recursive crawl with depth {effective_depth}")
|
||||||
|
logger.info(f"Max links per domain: {max_links_per_domain}")
|
||||||
|
logger.info(f"Crawl timeout: {crawl_timeout_minutes} minutes")
|
||||||
|
            # Perform recursive crawling
            all_content = await self._crawlRecursively(
                urls=selected_websites,
                max_depth=effective_depth,
                extract_depth=request.extract_depth,
                max_per_domain=max_links_per_domain,
                global_processed_urls=global_processed_urls
            )

            logger.info(f"Recursive crawl completed: {len(all_content)} total pages crawled")

            # Step 6: AI analysis of all collected content
            logger.info(f"=== STEP 6: AI ANALYSIS ===")
            logger.info(f"Analyzing {len(all_content)} websites with AI")

            # Create a basic analysis result
            analysis_result = f"Web research completed for: {request.user_prompt}\n\n"
            analysis_result += f"Analyzed {len(all_content)} websites:\n"
            for url, content in all_content.items():
                analysis_result += f"- {url}: {len(content)} characters\n"

            # Create result documents
            import time
            result_documents = []

            # Main research result
            main_document = {
                "documentName": f"web_research_{int(time.time())}.json",
                "documentData": {
                    "user_prompt": request.user_prompt,
                    "websites_analyzed": len(all_content),
                    "additional_links_found": 0,  # Would be calculated from crawl results
                    "analysis_result": analysis_result,
                    "sources": [{"title": f"Website {i+1}", "url": url} for i, url in enumerate(all_content.keys())],
                    "additional_links": [],
                    "debug_info": {
                        "total_urls_processed": len(global_processed_urls),
                        "crawl_depth": effective_depth,
                        "extract_depth": request.extract_depth
                    }
                },
                "mimeType": "application/json"
            }
            result_documents.append(main_document)

            # Individual website content documents
            for i, (url, content) in enumerate(all_content.items()):
                content_document = {
                    "documentName": f"website_content_{i+1}.md",
                    "documentData": content,
                    "mimeType": "text/markdown"
                }
                result_documents.append(content_document)

            logger.info(f"WEB RESEARCH COMPLETED SUCCESSFULLY")
            logger.info(f"Generated {len(result_documents)} result documents")

            return WebResearchResult(
                success=True,
                documents=result_documents
            )

        except Exception as e:
            logger.error(f"Error in comprehensive web research: {str(e)}")
            return WebResearchResult(
                success=False,
                error=str(e),
                documents=[]
            )
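The documents built above all share the `documentName` / `documentData` / `mimeType` shape. A small, hypothetical consumer that persists such a result list to disk; it is not part of this commit and only relies on the fields visible above.

import json
from pathlib import Path
from typing import Any, Dict, List

def saveResearchDocuments(documents: List[Dict[str, Any]], out_dir: str = "research_output") -> List[str]:
    """Write each result document to disk, serializing dict payloads as JSON."""
    target = Path(out_dir)
    target.mkdir(parents=True, exist_ok=True)
    written = []
    for doc in documents:
        path = target / doc["documentName"]
        data = doc["documentData"]
        text = json.dumps(data, indent=2, ensure_ascii=False) if isinstance(data, dict) else str(data)
        path.write_text(text, encoding="utf-8")
        written.append(str(path))
    return written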
    async def _crawlRecursively(self, urls: List[str], max_depth: int, extract_depth: str = "advanced", max_per_domain: int = 10, global_processed_urls: Optional[set] = None) -> Dict[str, str]:
        """
        Recursively crawl URLs up to specified depth.
        This is a simplified version of the recursive crawling logic.
        """
        logger.info(f"Starting recursive crawl: {len(urls)} starting URLs, max_depth={max_depth}")

        # URL index to track all processed URLs (local + global)
        processed_urls = set()
        if global_processed_urls is not None:
            processed_urls = global_processed_urls
            logger.info(f"Using global URL index with {len(processed_urls)} already processed URLs")
        else:
            logger.info("Using local URL index for this crawl session")

        all_content = {}
        current_level_urls = urls.copy()

        try:
            for depth in range(1, max_depth + 1):
                logger.info(f"=== DEPTH LEVEL {depth}/{max_depth} ===")
                logger.info(f"Processing {len(current_level_urls)} URLs at depth {depth}")

                # URLs found at this level (for next iteration)
                next_level_urls = []

                for url in current_level_urls:
                    # Normalize URL for duplicate checking
                    normalized_url = self._normalizeUrl(url)
                    if normalized_url in processed_urls:
                        logger.debug(f"URL {url} (normalized: {normalized_url}) already processed, skipping")
                        continue

                    try:
                        logger.info(f"Processing URL at depth {depth}: {url}")

                        # Extract content from URL
                        crawl_results = await self._crawl([url], extract_depth=extract_depth, format="markdown")

                        if crawl_results and crawl_results[0].content:
                            content = crawl_results[0].content
                            all_content[url] = content
                            processed_urls.add(normalized_url)
                            logger.info(f"✓ Successfully processed {url}: {len(content)} chars")

                            # For simplicity, we'll skip finding sub-links in this implementation
                            # In a full implementation, you would extract links and add them to
                            # next_level_urls (a sketch of that step follows after this method)

                        else:
                            logger.warning(f"✗ No content extracted from {url}")
                            processed_urls.add(normalized_url)

                    except Exception as e:
                        logger.warning(f"✗ Failed to process URL {url} at depth {depth}: {e}")
                        processed_urls.add(normalized_url)

                # Prepare for next iteration
                current_level_urls = next_level_urls
                logger.info(f"Depth {depth} completed. Found {len(next_level_urls)} URLs for next level")

                # Stop if no more URLs to process
                if not current_level_urls:
                    logger.info(f"No more URLs found at depth {depth}, stopping recursion")
                    break

            logger.info(f"Recursive crawl completed: {len(all_content)} total pages crawled")
            return all_content

        except Exception as e:
            logger.error(f"Crawling failed with error: {e}, returning partial results: {len(all_content)} pages crawled so far")
            return all_content
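`next_level_urls` stays empty in this simplified implementation. A hedged sketch of the missing link-discovery step, condensed from the `_extractLinksFromContent` and `filterUrlsOnlyPages` helpers this commit removes from AiObjects; the regexes and the same-domain rule follow that removed code.

import re
from typing import List
from urllib.parse import urljoin, urlparse

_NON_HTML_SUFFIXES = ('.jpg', '.jpeg', '.png', '.gif', '.svg', '.pdf', '.zip', '.css', '.js')

def findNextLevelUrls(content: str, base_url: str, max_per_domain: int = 4) -> List[str]:
    """Pull same-domain page links out of crawled HTML/markdown for the next crawl depth."""
    candidates = []
    candidates += re.findall(r'<a[^>]+href=["\']([^"\']+)["\']', content, re.IGNORECASE)
    candidates += [u for _, u in re.findall(r'\[([^\]]+)\]\(([^)]+)\)', content)]
    candidates += re.findall(r'https?://[^\s\)]+', content)

    base_domain = urlparse(base_url).netloc
    links, seen = [], set()
    for raw in candidates:
        raw = raw.strip().strip('"\'')
        if not raw or raw.startswith('#') or raw.lower().startswith('javascript:'):
            continue
        url = urljoin(base_url, raw).rstrip('.,;!?')
        if url.lower().endswith(_NON_HTML_SUFFIXES):
            continue
        if urlparse(url).netloc != base_domain or url in seen:
            continue
        seen.add(url)
        links.append(url)
        if len(links) >= max_per_domain:
            break
    return links

# Example: relative links are resolved against the page URL, off-domain links are dropped.
page = '<a href="/docs/intro">Intro</a> [Blog](https://example.com/blog) [Other](https://other.org/x)'
print(findNextLevelUrls(page, "https://example.com/start"))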
    def _normalizeUrl(self, url: str) -> str:
        """Normalize URL to handle variations that should be considered duplicates."""
        if not url:
            return url

        # Remove trailing slashes and fragments
        url = url.rstrip('/')
        if '#' in url:
            url = url.split('#')[0]

        # Handle common URL variations
        url = url.replace('http://', 'https://')  # Normalize protocol

        return url
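One caveat: `_normalizeUrl` rewrites `http://` to `https://`, so an http-only page and its (possibly nonexistent) https variant collapse into one key. A slightly stricter alternative using `urllib.parse`, offered only as an optional refinement and not part of this commit.

from urllib.parse import urlsplit, urlunsplit

def normalizeUrlStrict(url: str) -> str:
    """Canonicalize a URL for duplicate detection without changing its scheme."""
    if not url:
        return url
    parts = urlsplit(url)
    host = (parts.hostname or "").lower()
    port = parts.port
    netloc = host
    # Keep only non-default ports.
    if port and not (parts.scheme == "http" and port == 80) and not (parts.scheme == "https" and port == 443):
        netloc = f"{netloc}:{port}"
    path = parts.path.rstrip('/')
    return urlunsplit((parts.scheme.lower(), netloc, path, parts.query, ''))

# Example: both forms map to the same key.
print(normalizeUrlStrict("HTTPS://Example.com:443/docs/"))   # https://example.com/docs
print(normalizeUrlStrict("https://example.com/docs#intro"))  # https://example.com/docs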
@@ -649,427 +649,6 @@ class AiObjects:
|
||||||
errorCount=1
|
errorCount=1
|
||||||
)
|
)
|
||||||
|
|
||||||
# Web functionality methods - Now use standardized AiModelCall/AiModelResponse pattern
|
|
||||||
async def searchWebsites(self, query: str, maxResults: int = 5, **kwargs) -> str:
|
|
||||||
"""Search for websites using Tavily with standardized pattern."""
|
|
||||||
from modules.datamodels.datamodelAi import AiModelCall
|
|
||||||
|
|
||||||
modelCall = AiModelCall(
|
|
||||||
messages=[{"role": "user", "content": query}],
|
|
||||||
options={"max_results": maxResults, **kwargs}
|
|
||||||
)
|
|
||||||
|
|
||||||
# Get Tavily connector from registry
|
|
||||||
tavilyConnector = modelRegistry.getConnectorForModel("tavily_search")
|
|
||||||
if not tavilyConnector:
|
|
||||||
raise ValueError("Tavily connector not available")
|
|
||||||
|
|
||||||
result = await tavilyConnector.search(modelCall)
|
|
||||||
return result.content if result.success else ""
|
|
||||||
|
|
||||||
async def crawlWebsites(self, urls: List[str], extractDepth: str = "advanced", format: str = "markdown") -> str:
|
|
||||||
"""Crawl websites using Tavily with standardized pattern."""
|
|
||||||
from modules.datamodels.datamodelAi import AiModelCall
|
|
||||||
|
|
||||||
modelCall = AiModelCall(
|
|
||||||
messages=[{"role": "user", "content": "crawl websites"}],
|
|
||||||
options={"urls": urls, "extract_depth": extractDepth, "format": format}
|
|
||||||
)
|
|
||||||
|
|
||||||
# Get Tavily connector from registry
|
|
||||||
tavilyConnector = modelRegistry.getConnectorForModel("tavily_crawl")
|
|
||||||
if not tavilyConnector:
|
|
||||||
raise ValueError("Tavily connector not available")
|
|
||||||
|
|
||||||
result = await tavilyConnector.crawl(modelCall)
|
|
||||||
return result.content if result.success else ""
|
|
||||||
|
|
||||||
async def extractContent(self, urls: List[str], extractDepth: str = "advanced", format: str = "markdown") -> Dict[str, str]:
|
|
||||||
"""Extract content from URLs and return as dictionary."""
|
|
||||||
import json
|
|
||||||
crawlResults = await self.crawlWebsites(urls, extractDepth, format)
|
|
||||||
|
|
||||||
# Parse JSON response and extract content
|
|
||||||
try:
|
|
||||||
data = json.loads(crawlResults)
|
|
||||||
return {result["url"]: result["content"] for result in data.get("results", [])}
|
|
||||||
except (json.JSONDecodeError, KeyError):
|
|
||||||
return {}
|
|
||||||
|
|
||||||
# Core Web Tools - Clean interface for web operations
|
|
||||||
async def readPage(self, url: str, extractDepth: str = "advanced") -> Optional[str]:
|
|
||||||
"""Read a single web page and return its content (HTML/Markdown)."""
|
|
||||||
logger.debug(f"Reading page: {url}")
|
|
||||||
try:
|
|
||||||
# URL encode the URL to handle spaces and special characters
|
|
||||||
from urllib.parse import quote, urlparse, urlunparse
|
|
||||||
parsed = urlparse(url)
|
|
||||||
encodedUrl = urlunparse((
|
|
||||||
parsed.scheme,
|
|
||||||
parsed.netloc,
|
|
||||||
parsed.path,
|
|
||||||
parsed.params,
|
|
||||||
parsed.query,
|
|
||||||
parsed.fragment
|
|
||||||
))
|
|
||||||
|
|
||||||
# Manually encode query parameters to handle spaces
|
|
||||||
if parsed.query:
|
|
||||||
encodedQuery = quote(parsed.query, safe='=&')
|
|
||||||
encodedUrl = urlunparse((
|
|
||||||
parsed.scheme,
|
|
||||||
parsed.netloc,
|
|
||||||
parsed.path,
|
|
||||||
parsed.params,
|
|
||||||
encodedQuery,
|
|
||||||
parsed.fragment
|
|
||||||
))
|
|
||||||
|
|
||||||
logger.debug(f"URL encoded: {url} -> {encodedUrl}")
|
|
||||||
|
|
||||||
content = await self.extractContent([encodedUrl], extractDepth, "markdown")
|
|
||||||
result = content.get(encodedUrl)
|
|
||||||
if result:
|
|
||||||
logger.debug(f"Successfully read page {encodedUrl}: {len(result)} chars")
|
|
||||||
else:
|
|
||||||
logger.warning(f"No content returned for page {encodedUrl}")
|
|
||||||
return result
|
|
||||||
except Exception as e:
|
|
||||||
logger.warning(f"Failed to read page {url}: {e}")
|
|
||||||
return None
|
|
||||||
|
|
||||||
async def getUrlsFromPage(self, url: str, extractDepth: str = "advanced") -> List[str]:
|
|
||||||
"""Get all URLs from a web page, with redundancies removed."""
|
|
||||||
try:
|
|
||||||
content = await self.readPage(url, extractDepth)
|
|
||||||
if not content:
|
|
||||||
return []
|
|
||||||
|
|
||||||
links = self._extractLinksFromContent(content, url)
|
|
||||||
# Remove duplicates while preserving order
|
|
||||||
seen = set()
|
|
||||||
uniqueLinks = []
|
|
||||||
for link in links:
|
|
||||||
if link not in seen:
|
|
||||||
seen.add(link)
|
|
||||||
uniqueLinks.append(link)
|
|
||||||
|
|
||||||
logger.debug(f"Extracted {len(uniqueLinks)} unique URLs from {url}")
|
|
||||||
return uniqueLinks
|
|
||||||
|
|
||||||
except Exception as e:
|
|
||||||
logger.warning(f"Failed to get URLs from page {url}: {e}")
|
|
||||||
return []
|
|
||||||
|
|
||||||
def filterUrlsOnlyPages(self, urls: List[str], maxPerDomain: int = 10) -> List[str]:
|
|
||||||
"""Filter URLs to get only links for pages to follow (no images, etc.)."""
|
|
||||||
from urllib.parse import urlparse
|
|
||||||
|
|
||||||
def _isHtmlCandidate(url: str) -> bool:
|
|
||||||
lower = url.lower()
|
|
||||||
blocked = ('.jpg', '.jpeg', '.png', '.gif', '.svg', '.webp', '.ico', '.bmp',
|
|
||||||
'.mp4', '.mp3', '.avi', '.mov', '.mkv',
|
|
||||||
'.pdf', '.zip', '.rar', '.7z', '.tar', '.gz',
|
|
||||||
'.css', '.js', '.woff', '.woff2', '.ttf', '.eot')
|
|
||||||
return not lower.endswith(blocked)
|
|
||||||
|
|
||||||
# Group by domain
|
|
||||||
domainLinks = {}
|
|
||||||
for link in urls:
|
|
||||||
domain = urlparse(link).netloc
|
|
||||||
if domain not in domainLinks:
|
|
||||||
domainLinks[domain] = []
|
|
||||||
domainLinks[domain].append(link)
|
|
||||||
|
|
||||||
# Filter and cap per domain
|
|
||||||
filteredLinks = []
|
|
||||||
for domain, domainLinkList in domainLinks.items():
|
|
||||||
seen = set()
|
|
||||||
domainFiltered = []
|
|
||||||
|
|
||||||
for link in domainLinkList:
|
|
||||||
if link in seen:
|
|
||||||
continue
|
|
||||||
if not _isHtmlCandidate(link):
|
|
||||||
continue
|
|
||||||
seen.add(link)
|
|
||||||
domainFiltered.append(link)
|
|
||||||
if len(domainFiltered) >= maxPerDomain:
|
|
||||||
break
|
|
||||||
|
|
||||||
filteredLinks.extend(domainFiltered)
|
|
||||||
logger.debug(f"Domain {domain}: {len(domainLinkList)} -> {len(domainFiltered)} links")
|
|
||||||
|
|
||||||
return filteredLinks
|
|
||||||
|
|
||||||
def _extractLinksFromContent(self, content: str, baseUrl: str) -> List[str]:
|
|
||||||
"""Extract links from HTML/Markdown content."""
|
|
||||||
try:
|
|
||||||
import re
|
|
||||||
from urllib.parse import urljoin, urlparse, quote, urlunparse
|
|
||||||
|
|
||||||
def _cleanUrl(url: str) -> str:
|
|
||||||
"""Clean and encode URL to remove spaces and invalid characters."""
|
|
||||||
# Remove quotes and extra spaces
|
|
||||||
url = url.strip().strip('"\'')
|
|
||||||
|
|
||||||
# If it's a relative URL, make it absolute first
|
|
||||||
if not url.startswith(('http://', 'https://')):
|
|
||||||
url = urljoin(baseUrl, url)
|
|
||||||
|
|
||||||
# Parse and re-encode the URL properly
|
|
||||||
parsed = urlparse(url)
|
|
||||||
if parsed.query:
|
|
||||||
# Encode query parameters properly
|
|
||||||
encodedQuery = quote(parsed.query, safe='=&')
|
|
||||||
url = urlunparse((
|
|
||||||
parsed.scheme,
|
|
||||||
parsed.netloc,
|
|
||||||
parsed.path,
|
|
||||||
parsed.params,
|
|
||||||
encodedQuery,
|
|
||||||
parsed.fragment
|
|
||||||
))
|
|
||||||
|
|
||||||
return url
|
|
||||||
|
|
||||||
links = []
|
|
||||||
|
|
||||||
# Extract HTML links: <a href="url"> format
|
|
||||||
htmlLinkPattern = r'<a[^>]+href=["\']([^"\']+)["\'][^>]*>'
|
|
||||||
htmlLinks = re.findall(htmlLinkPattern, content, re.IGNORECASE)
|
|
||||||
|
|
||||||
for url in htmlLinks:
|
|
||||||
if url and not url.startswith('#') and not url.startswith('javascript:'):
|
|
||||||
try:
|
|
||||||
cleanedUrl = _cleanUrl(url)
|
|
||||||
links.append(cleanedUrl)
|
|
||||||
logger.debug(f"Extracted HTML link: {url} -> {cleanedUrl}")
|
|
||||||
except Exception as e:
|
|
||||||
logger.debug(f"Failed to clean HTML link {url}: {e}")
|
|
||||||
|
|
||||||
# Extract markdown links: [text](url) format
|
|
||||||
markdownLinkPattern = r'\[([^\]]+)\]\(([^)]+)\)'
|
|
||||||
markdownLinks = re.findall(markdownLinkPattern, content)
|
|
||||||
|
|
||||||
for text, url in markdownLinks:
|
|
||||||
if url and not url.startswith('#'):
|
|
||||||
try:
|
|
||||||
cleanedUrl = _cleanUrl(url)
|
|
||||||
# Only keep URLs from the same domain
|
|
||||||
if urlparse(cleanedUrl).netloc == urlparse(baseUrl).netloc:
|
|
||||||
links.append(cleanedUrl)
|
|
||||||
logger.debug(f"Extracted markdown link: {url} -> {cleanedUrl}")
|
|
||||||
except Exception as e:
|
|
||||||
logger.debug(f"Failed to clean markdown link {url}: {e}")
|
|
||||||
|
|
||||||
# Extract plain URLs in the text
|
|
||||||
urlPattern = r'https?://[^\s\)]+'
|
|
||||||
plainUrls = re.findall(urlPattern, content)
|
|
||||||
|
|
||||||
for url in plainUrls:
|
|
||||||
try:
|
|
||||||
cleanUrl = url.rstrip('.,;!?')
|
|
||||||
cleanedUrl = _cleanUrl(cleanUrl)
|
|
||||||
if urlparse(cleanedUrl).netloc == urlparse(baseUrl).netloc:
|
|
||||||
if cleanedUrl not in links: # Avoid duplicates
|
|
||||||
links.append(cleanedUrl)
|
|
||||||
logger.debug(f"Extracted plain URL: {url} -> {cleanedUrl}")
|
|
||||||
except Exception as e:
|
|
||||||
logger.debug(f"Failed to clean plain URL {url}: {e}")
|
|
||||||
|
|
||||||
logger.debug(f"Total links extracted and cleaned: {len(links)}")
|
|
||||||
return links
|
|
||||||
|
|
||||||
except Exception as e:
|
|
||||||
logger.warning(f"Failed to extract links from content: {e}")
|
|
||||||
return []
|
|
||||||
|
|
||||||
def _normalizeUrl(self, url: str) -> str:
|
|
||||||
"""Normalize URL to handle variations that should be considered duplicates."""
|
|
||||||
if not url:
|
|
||||||
return url
|
|
||||||
|
|
||||||
# Remove trailing slashes and fragments
|
|
||||||
url = url.rstrip('/')
|
|
||||||
if '#' in url:
|
|
||||||
url = url.split('#')[0]
|
|
||||||
|
|
||||||
# Handle common URL variations
|
|
||||||
url = url.replace('http://', 'https://') # Normalize protocol
|
|
||||||
|
|
||||||
return url
|
|
||||||
|
|
||||||
async def crawlRecursively(self, urls: List[str], max_depth: int, extract_depth: str = "advanced", max_per_domain: int = 10, global_processed_urls: Optional[set] = None) -> Dict[str, str]:
|
|
||||||
"""
|
|
||||||
Recursively crawl URLs up to specified depth.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
urls: List of starting URLs to crawl
|
|
||||||
max_depth: Maximum depth to crawl (1=main pages only, 2=main+sub-pages, etc.)
|
|
||||||
extract_depth: Tavily extract depth setting
|
|
||||||
max_per_domain: Maximum URLs per domain per level
|
|
||||||
global_processed_urls: Optional global set to track processed URLs across sessions
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
Dictionary mapping URL -> content for all crawled pages
|
|
||||||
"""
|
|
||||||
logger.info(f"Starting recursive crawl: {len(urls)} starting URLs, max_depth={max_depth}")
|
|
||||||
|
|
||||||
# URL index to track all processed URLs (local + global)
|
|
||||||
processed_urls = set()
|
|
||||||
if global_processed_urls is not None:
|
|
||||||
# Use global index if provided, otherwise create local one
|
|
||||||
processed_urls = global_processed_urls
|
|
||||||
logger.info(f"Using global URL index with {len(processed_urls)} already processed URLs")
|
|
||||||
else:
|
|
||||||
logger.info("Using local URL index for this crawl session")
|
|
||||||
|
|
||||||
all_content = {}
|
|
||||||
|
|
||||||
# Current level URLs to process
|
|
||||||
current_level_urls = urls.copy()
|
|
||||||
|
|
||||||
try:
|
|
||||||
for depth in range(1, max_depth + 1):
|
|
||||||
logger.info(f"=== DEPTH LEVEL {depth}/{max_depth} ===")
|
|
||||||
logger.info(f"Processing {len(current_level_urls)} URLs at depth {depth}")
|
|
||||||
|
|
||||||
# URLs found at this level (for next iteration)
|
|
||||||
next_level_urls = []
|
|
||||||
|
|
||||||
for url in current_level_urls:
|
|
||||||
# Normalize URL for duplicate checking
|
|
||||||
normalized_url = self._normalizeUrl(url)
|
|
||||||
if normalized_url in processed_urls:
|
|
||||||
logger.debug(f"URL {url} (normalized: {normalized_url}) already processed, skipping")
|
|
||||||
continue
|
|
||||||
|
|
||||||
try:
|
|
||||||
logger.info(f"Processing URL at depth {depth}: {url}")
|
|
||||||
logger.debug(f"Total processed URLs so far: {len(processed_urls)}")
|
|
||||||
|
|
||||||
# Read page content
|
|
||||||
content = await self.readPage(url, extract_depth)
|
|
||||||
if content:
|
|
||||||
all_content[url] = content
|
|
||||||
processed_urls.add(normalized_url)
|
|
||||||
logger.info(f"✓ Successfully processed {url}: {len(content)} chars")
|
|
||||||
|
|
||||||
# Get URLs from this page for next level
|
|
||||||
page_urls = await self.getUrlsFromPage(url, extract_depth)
|
|
||||||
logger.info(f"Found {len(page_urls)} URLs on {url}")
|
|
||||||
|
|
||||||
# Filter URLs and add to next level
|
|
||||||
filtered_urls = self.filterUrlsOnlyPages(page_urls, max_per_domain)
|
|
||||||
logger.info(f"Filtered to {len(filtered_urls)} valid URLs")
|
|
||||||
|
|
||||||
# Add new URLs to next level (avoiding already processed ones)
|
|
||||||
new_urls_count = 0
|
|
||||||
for new_url in filtered_urls:
|
|
||||||
normalized_new_url = self._normalizeUrl(new_url)
|
|
||||||
if normalized_new_url not in processed_urls:
|
|
||||||
next_level_urls.append(new_url)
|
|
||||||
new_urls_count += 1
|
|
||||||
else:
|
|
||||||
logger.debug(f"URL {new_url} (normalized: {normalized_new_url}) already processed, skipping")
|
|
||||||
|
|
||||||
logger.info(f"Added {new_urls_count} new URLs to next level from {url}")
|
|
||||||
else:
|
|
||||||
logger.warning(f"✗ No content extracted from {url}")
|
|
||||||
processed_urls.add(normalized_url) # Mark as processed to avoid retry
|
|
||||||
|
|
||||||
except Exception as e:
|
|
||||||
logger.warning(f"✗ Failed to process URL {url} at depth {depth}: {e}")
|
|
||||||
processed_urls.add(normalized_url) # Mark as processed to avoid retry
|
|
||||||
|
|
||||||
# Prepare for next iteration
|
|
||||||
current_level_urls = next_level_urls
|
|
||||||
logger.info(f"Depth {depth} completed. Found {len(next_level_urls)} URLs for next level")
|
|
||||||
|
|
||||||
# Stop if no more URLs to process
|
|
||||||
if not current_level_urls:
|
|
||||||
logger.info(f"No more URLs found at depth {depth}, stopping recursion")
|
|
||||||
break
|
|
||||||
|
|
||||||
logger.info(f"Recursive crawl completed: {len(all_content)} total pages crawled")
|
|
||||||
logger.info(f"Total URLs processed (including skipped): {len(processed_urls)}")
|
|
||||||
logger.info(f"Unique URLs found: {len(all_content)}")
|
|
||||||
return all_content
|
|
||||||
|
|
||||||
except asyncio.TimeoutError:
|
|
||||||
logger.warning(f"Crawling timed out, returning partial results: {len(all_content)} pages crawled so far")
|
|
||||||
return all_content
|
|
||||||
except Exception as e:
|
|
||||||
logger.error(f"Crawling failed with error: {e}, returning partial results: {len(all_content)} pages crawled so far")
|
|
||||||
return all_content
|
|
||||||
|
|
||||||
async def webQuery(self, query: str, context: str = "", options: AiCallOptions = None) -> AiCallResponse:
|
|
||||||
"""Use Perplexity AI to provide the best answers for web-related queries."""
|
|
||||||
|
|
||||||
if options is None:
|
|
||||||
options = AiCallOptions(operationType=OperationTypeEnum.WEB_RESEARCH)
|
|
||||||
|
|
||||||
# Calculate input bytes
|
|
||||||
inputBytes = len((query + context).encode("utf-8"))
|
|
||||||
|
|
||||||
# Create a comprehensive prompt for web queries
|
|
||||||
webPrompt = f"""You are an expert web researcher and information analyst. Please provide a comprehensive and accurate answer to the following web-related query.
|
|
||||||
|
|
||||||
Query: {query}
|
|
||||||
|
|
||||||
{f"Additional Context: {context}" if context else ""}
|
|
||||||
|
|
||||||
Please provide:
|
|
||||||
1. A clear, well-structured answer to the query
|
|
||||||
2. Key points and important details
|
|
||||||
3. Relevant insights and analysis
|
|
||||||
4. Any important considerations or caveats
|
|
||||||
5. Suggestions for further research if applicable
|
|
||||||
|
|
||||||
Format your response in a clear, professional manner that would be helpful for someone researching this topic."""
|
|
||||||
|
|
||||||
try:
|
|
||||||
# Start timing
|
|
||||||
startTime = time.time()
|
|
||||||
|
|
||||||
# Use Perplexity for web research with search capabilities
|
|
||||||
perplexity_connector = modelRegistry.getConnectorForModel("perplexity_callAiWithWebSearch")
|
|
||||||
if not perplexity_connector:
|
|
||||||
raise ValueError("Perplexity connector not available")
|
|
||||||
response = await perplexity_connector.callAiWithWebSearch(webPrompt)
|
|
||||||
|
|
||||||
# Calculate timing and output bytes
|
|
||||||
endTime = time.time()
|
|
||||||
processingTime = endTime - startTime
|
|
||||||
outputBytes = len(response.encode("utf-8"))
|
|
||||||
|
|
||||||
# Calculate price using Perplexity model pricing
|
|
||||||
perplexity_model = modelRegistry.getModel("perplexity_callAiWithWebSearch")
|
|
||||||
priceUsd = perplexity_model.calculatePriceUsd(inputBytes, outputBytes)
|
|
||||||
|
|
||||||
logger.info(f"✅ Web query successful with Perplexity")
|
|
||||||
return AiCallResponse(
|
|
||||||
content=response,
|
|
||||||
modelName="perplexity_callAiWithWebSearch",
|
|
||||||
priceUsd=priceUsd,
|
|
||||||
processingTime=processingTime,
|
|
||||||
bytesSent=inputBytes,
|
|
||||||
bytesReceived=outputBytes,
|
|
||||||
errorCount=0
|
|
||||||
)
|
|
||||||
except Exception as e:
|
|
||||||
logger.error(f"Perplexity web query failed: {str(e)}")
|
|
||||||
return AiCallResponse(
|
|
||||||
content=f"Web query failed: {str(e)}",
|
|
||||||
modelName="perplexity_callAiWithWebSearch",
|
|
||||||
priceUsd=0.0,
|
|
||||||
processingTime=0.0,
|
|
||||||
bytesSent=inputBytes,
|
|
||||||
bytesReceived=0,
|
|
||||||
errorCount=1
|
|
||||||
)
|
|
||||||
|
|
||||||
# Utility methods
|
# Utility methods
|
||||||
async def listAvailableModels(self, connectorType: str = None) -> List[Dict[str, Any]]:
|
async def listAvailableModels(self, connectorType: str = None) -> List[Dict[str, Any]]:
|
||||||
"""List available models, optionally filtered by connector type."""
|
"""List available models, optionally filtered by connector type."""
|
||||||
|
|
@@ -1085,163 +664,8 @@ Format your response in a clear, professional manner that would be helpful for s
|
||||||
raise ValueError(f"Model {modelName} not found")
|
raise ValueError(f"Model {modelName} not found")
|
||||||
return model.model_dump()
|
return model.model_dump()
|
||||||
|
|
||||||
|
|
||||||
async def getModelsByTag(self, tag: str) -> List[str]:
|
async def getModelsByTag(self, tag: str) -> List[str]:
|
||||||
"""Get model names that have a specific tag."""
|
"""Get model names that have a specific tag."""
|
||||||
models = modelRegistry.getModelsByTag(tag)
|
models = modelRegistry.getModelsByTag(tag)
|
||||||
return [model.name for model in models]
|
return [model.name for model in models]
|
||||||
|
|
||||||
async def selectRelevantWebsites(self, websites: List[str], userQuestion: str) -> Tuple[List[str], str]:
|
|
||||||
"""Select most relevant websites using AI analysis. Returns (selected_websites, ai_response)."""
|
|
||||||
if len(websites) <= 1:
|
|
||||||
return websites, "Only one website available, no selection needed"
|
|
||||||
|
|
||||||
try:
|
|
||||||
# Create website summaries for AI analysis
|
|
||||||
websiteSummaries = []
|
|
||||||
for i, url in enumerate(websites, 1):
|
|
||||||
from urllib.parse import urlparse
|
|
||||||
domain = urlparse(url).netloc
|
|
||||||
summary = f"{i}. {url} (Domain: {domain})"
|
|
||||||
websiteSummaries.append(summary)
|
|
||||||
|
|
||||||
selectionPrompt = f"""
|
|
||||||
Based on this user request: "{userQuestion}"
|
|
||||||
|
|
||||||
I have {len(websites)} websites found. Please select the most relevant website(s) for this request.
|
|
||||||
|
|
||||||
Available websites:
|
|
||||||
{chr(10).join(websiteSummaries)}
|
|
||||||
|
|
||||||
Please respond with the website number(s) (1, 2, 3, etc.) that are most relevant.
|
|
||||||
Format: 1,3,5 (or just 1 for single selection)
|
|
||||||
"""
|
|
||||||
|
|
||||||
# Use Perplexity to select the best websites
|
|
||||||
response = await self.webQuery(selectionPrompt)
|
|
||||||
|
|
||||||
# Parse the selection
|
|
||||||
import re
|
|
||||||
numbers = re.findall(r'\d+', response)
|
|
||||||
if numbers:
|
|
||||||
selectedWebsites = []
|
|
||||||
for num in numbers:
|
|
||||||
index = int(num) - 1
|
|
||||||
if 0 <= index < len(websites):
|
|
||||||
selectedWebsites.append(websites[index])
|
|
||||||
|
|
||||||
if selectedWebsites:
|
|
||||||
logger.info(f"AI selected {len(selectedWebsites)} websites")
|
|
||||||
return selectedWebsites, response
|
|
||||||
|
|
||||||
# Fallback to first website
|
|
||||||
logger.warning("AI selection failed, using first website")
|
|
||||||
return websites[:1], f"AI selection failed, fallback to first website. AI response: {response}"
|
|
||||||
|
|
||||||
except Exception as e:
|
|
||||||
logger.error(f"Error in website selection: {str(e)}")
|
|
||||||
return websites[:1], f"Error in website selection: {str(e)}"
|
|
||||||
|
|
||||||
async def analyzeContentWithChunking(self, allContent: Dict[str, str], userQuestion: str) -> str:
|
|
||||||
"""Analyze content using AI with chunking for large content."""
|
|
||||||
logger.info(f"Analyzing {len(allContent)} websites with AI")
|
|
||||||
|
|
||||||
# Process content in chunks to avoid token limits
|
|
||||||
chunkSize = 50000 # 50k chars per chunk
|
|
||||||
allChunks = []
|
|
||||||
|
|
||||||
for url, content in allContent.items():
|
|
||||||
filteredContent = self._filterContent(content)
|
|
||||||
if len(filteredContent) <= chunkSize:
|
|
||||||
allChunks.append((url, filteredContent))
|
|
||||||
logger.info(f"Content from {url}: {len(filteredContent)} chars (single chunk)")
|
|
||||||
else:
|
|
||||||
# Split large content into chunks
|
|
||||||
chunkCount = (len(filteredContent) + chunkSize - 1) // chunkSize
|
|
||||||
logger.info(f"Content from {url}: {len(filteredContent)} chars (split into {chunkCount} chunks)")
|
|
||||||
for i in range(0, len(filteredContent), chunkSize):
|
|
||||||
chunk = filteredContent[i:i+chunkSize]
|
|
||||||
chunkNum = i//chunkSize + 1
|
|
||||||
allChunks.append((f"{url} (part {chunkNum})", chunk))
|
|
||||||
|
|
||||||
logger.info(f"Processing {len(allChunks)} content chunks")
|
|
||||||
|
|
||||||
# Analyze each chunk
|
|
||||||
chunkAnalyses = []
|
|
||||||
for i, (url, chunk) in enumerate(allChunks, 1):
|
|
||||||
logger.info(f"Analyzing chunk {i}/{len(allChunks)}: {url}")
|
|
||||||
|
|
||||||
try:
|
|
||||||
analysisPrompt = f"""
|
|
||||||
Analyze this web content and extract relevant information for: {userQuestion}
|
|
||||||
|
|
||||||
Source: {url}
|
|
||||||
Content: {chunk}
|
|
||||||
|
|
||||||
Please extract key information relevant to the query.
|
|
||||||
"""
|
|
||||||
|
|
||||||
analysis = await self.webQuery(analysisPrompt)
|
|
||||||
chunkAnalyses.append(analysis)
|
|
||||||
logger.info(f"Chunk {i}/{len(allChunks)} analyzed successfully")
|
|
||||||
|
|
||||||
except Exception as e:
|
|
||||||
logger.error(f"Chunk {i}/{len(allChunks)} error: {e}")
|
|
||||||
|
|
||||||
# Combine all chunk analyses
|
|
||||||
if chunkAnalyses:
|
|
||||||
logger.info(f"Combining {len(chunkAnalyses)} chunk analyses")
|
|
||||||
combinedAnalysis = "\n\n".join(chunkAnalyses)
|
|
||||||
|
|
||||||
# Final synthesis
|
|
||||||
try:
|
|
||||||
logger.info("Performing final synthesis of all analyses")
|
|
||||||
synthesisPrompt = f"""
|
|
||||||
Based on these partial analyses, provide a comprehensive answer to: {userQuestion}
|
|
||||||
|
|
||||||
Partial analyses:
|
|
||||||
{combinedAnalysis}
|
|
||||||
|
|
||||||
Please provide a clear, well-structured answer to the query.
|
|
||||||
"""
|
|
||||||
|
|
||||||
finalAnalysis = await self.webQuery(synthesisPrompt)
|
|
||||||
logger.info("Final synthesis completed successfully")
|
|
||||||
return finalAnalysis
|
|
||||||
|
|
||||||
except Exception as e:
|
|
||||||
logger.error(f"Synthesis error: {e}")
|
|
||||||
return combinedAnalysis
|
|
||||||
else:
|
|
||||||
logger.error("No content could be analyzed")
|
|
||||||
return "No content could be analyzed"
|
|
||||||
|
|
||||||
def _filterContent(self, content: str) -> str:
|
|
||||||
"""Filter out navigation, ads, and other nonsense content."""
|
|
||||||
lines = content.split('\n')
|
|
||||||
filteredLines = []
|
|
||||||
|
|
||||||
for line in lines:
|
|
||||||
line = line.strip()
|
|
||||||
# Skip empty lines
|
|
||||||
if not line:
|
|
||||||
continue
|
|
||||||
# Skip navigation elements
|
|
||||||
if any(skip in line.lower() for skip in [
|
|
||||||
'toggle navigation', 'log in', 'sign up', 'cookies', 'privacy policy',
|
|
||||||
'terms of service', 'subscribe', 'newsletter', 'follow us', 'share this',
|
|
||||||
'advertisement', 'sponsored', 'banner', 'popup', 'modal'
|
|
||||||
]):
|
|
||||||
continue
|
|
||||||
# Skip image references without context
|
|
||||||
if line.startswith(' and line.endswith(')') and '---' in line:
|
|
||||||
continue
|
|
||||||
# Keep meaningful content
|
|
||||||
if len(line) > 10: # Skip very short lines
|
|
||||||
filteredLines.append(line)
|
|
||||||
|
|
||||||
return '\n'.join(filteredLines)
|
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@@ -7,7 +7,6 @@ from modules.aicore.aicorePluginTavily import WebResearchRequest, WebResearchRes
 from modules.interfaces.interfaceAiObjects import AiObjects
 from modules.services.serviceAi.subCoreAi import SubCoreAi
 from modules.services.serviceAi.subDocumentProcessing import SubDocumentProcessing
-from modules.services.serviceAi.subWebResearch import SubWebResearch
 from modules.services.serviceAi.subDocumentGeneration import SubDocumentGeneration

@@ -19,7 +18,6 @@ class AiService:
     Manager delegates to specialized sub-modules:
     - SubCoreAi: Core AI operations (readImage, generateImage, callAi, planning, text calls)
     - SubDocumentProcessing: Document chunking, processing, and merging logic
-    - SubWebResearch: Web research and crawling functionality
     - SubDocumentGeneration: Single-file and multi-file document generation

     The main service acts as a coordinator:

@@ -40,7 +38,6 @@ class AiService:
         self._extractionService = None  # Lazy initialization
         self._coreAi = None  # Lazy initialization
         self._documentProcessor = None  # Lazy initialization
-        self._webResearch = None  # Lazy initialization
         self._documentGenerator = None  # Lazy initialization

     @property

@@ -69,13 +66,6 @@ class AiService:
             self._documentProcessor = SubDocumentProcessing(self.services, self.aiObjects)
         return self._documentProcessor

-    @property
-    def webResearchService(self):
-        """Lazy initialization of web research service."""
-        if self._webResearch is None:
-            logger.info("Lazy initializing SubWebResearch...")
-            self._webResearch = SubWebResearch(self.services, self.aiObjects)
-        return self._webResearch
-
     @property
     def documentGenerator(self):

@@ -127,11 +117,6 @@ class AiService:
         await self._ensureAiObjectsInitialized()
         return await self.coreAi.generateImage(prompt, size, quality, style, options)

-    # Web Research
-    async def webResearch(self, request: WebResearchRequest) -> WebResearchResult:
-        """Perform web research using interface functions."""
-        await self._ensureAiObjectsInitialized()
-        return await self.webResearchService.webResearch(request)
-
     # Core AI Methods - Delegating to SubCoreAi
     async def callAiPlanning(
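The removed `webResearchService` property used the same lazy-initialization pattern that the remaining sub-modules keep. A minimal, generic sketch of that pattern; names are illustrative, not the real classes.

class LazyCoordinatorSketch:
    """Coordinator that creates its sub-modules only on first use, as AiService does."""
    def __init__(self, services):
        self.services = services
        self._documentGenerator = None  # lazy

    @property
    def documentGenerator(self):
        if self._documentGenerator is None:
            # Construction is deferred until a caller actually needs this sub-module.
            self._documentGenerator = object()  # stand-in for SubDocumentGeneration(...)
        return self._documentGenerator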
@@ -1,388 +0,0 @@
|
||||||
import logging
|
|
||||||
from typing import Optional
|
|
||||||
from modules.aicore.aicorePluginTavily import WebResearchRequest, WebResearchResult
|
|
||||||
from modules.shared.configuration import APP_CONFIG
|
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
|
||||||
|
|
||||||
|
|
||||||
class SubWebResearch:
|
|
||||||
"""Web research operations including search, crawling, and analysis."""
|
|
||||||
|
|
||||||
def __init__(self, services, aiObjects):
|
|
||||||
"""Initialize web research service.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
services: Service center instance for accessing other services
|
|
||||||
aiObjects: Initialized AiObjects instance
|
|
||||||
"""
|
|
||||||
self.services = services
|
|
||||||
self.aiObjects = aiObjects
|
|
||||||
|
|
||||||
async def webResearch(self, request: WebResearchRequest) -> WebResearchResult:
|
|
||||||
"""Perform web research using interface functions."""
|
|
||||||
try:
|
|
||||||
logger.info(f"WEB RESEARCH STARTED")
|
|
||||||
logger.info(f"User Query: {request.user_prompt}")
|
|
||||||
logger.info(f"Max Results: {request.max_results}, Max Pages: {request.options.max_pages}")
|
|
||||||
|
|
||||||
# Global URL index to track all processed URLs across the entire research session
|
|
||||||
global_processed_urls = set()
|
|
||||||
|
|
||||||
# Step 1: Find relevant websites - either provided URLs or AI-determined main URLs
|
|
||||||
logger.info(f"=== STEP 1: INITIAL MAIN URLS LIST ===")
|
|
||||||
|
|
||||||
if request.urls:
|
|
||||||
# Use provided URLs as initial main URLs
|
|
||||||
websites = request.urls
|
|
||||||
logger.info(f"Using provided URLs ({len(websites)}):")
|
|
||||||
for i, url in enumerate(websites, 1):
|
|
||||||
logger.info(f" {i}. {url}")
|
|
||||||
else:
|
|
||||||
# Use AI to determine main URLs based on user's intention
|
|
||||||
logger.info(f"AI analyzing user intent: '{request.user_prompt}'")
|
|
||||||
|
|
||||||
# Use AI to generate optimized Tavily search query and search parameters
|
|
||||||
query_optimizer_prompt = f"""You are a search query optimizer.
|
|
||||||
|
|
||||||
USER QUERY: {request.user_prompt}
|
|
||||||
|
|
||||||
Your task: Create a search query and parameters for the USER QUERY given.
|
|
||||||
|
|
||||||
RULES:
|
|
||||||
1. The search query MUST be related to the user query above
|
|
||||||
2. Extract key terms from the user query
|
|
||||||
3. Determine appropriate country/language based on the query context
|
|
||||||
4. Keep search query short (2-6 words)
|
|
||||||
|
|
||||||
Return ONLY this JSON format:
|
|
||||||
{{
|
|
||||||
"user_prompt": "search query based on user query above",
|
|
||||||
"country": "Full English country name (ISO-3166; map codes via pycountry/i18n-iso-countries)",
|
|
||||||
"language": "language_code_or_null",
|
|
||||||
"topic": "general|news|academic_or_null",
|
|
||||||
"time_range": "d|w|m|y_or_null",
|
|
||||||
"selection_strategy": "single|multiple|specific_page",
|
|
||||||
"selection_criteria": "what URLs to prioritize",
|
|
||||||
"expected_url_patterns": ["pattern1", "pattern2"],
|
|
||||||
"estimated_result_count": number
|
|
||||||
}}"""
|
|
||||||
|
|
||||||
# Get AI response for query optimization
|
|
||||||
from modules.datamodels.datamodelAi import AiCallRequest, AiCallOptions
|
|
||||||
ai_request = AiCallRequest(
|
|
||||||
prompt=query_optimizer_prompt,
|
|
||||||
options=AiCallOptions()
|
|
||||||
)
|
|
||||||
|
|
||||||
# Write web research query optimization prompt to debug file
|
|
||||||
self.services.utils.writeDebugFile(query_optimizer_prompt, "web_research_query_optimizer_prompt")
|
|
||||||
|
|
||||||
ai_response_obj = await self.aiObjects.call(ai_request)
|
|
||||||
ai_response = ai_response_obj.content
|
|
||||||
|
|
||||||
# Write web research query optimization response to debug file
|
|
||||||
self.services.utils.writeDebugFile(ai_response, "web_research_query_optimizer_response")
|
|
||||||
logger.debug(f"AI query optimizer response: {ai_response}")
|
|
||||||
|
|
||||||
# Parse AI response to extract search query
|
|
||||||
import json
|
|
||||||
try:
|
|
||||||
# Clean the response by removing markdown code blocks
|
|
||||||
cleaned_response = ai_response.strip()
|
|
||||||
if cleaned_response.startswith('```json'):
|
|
||||||
cleaned_response = cleaned_response[7:] # Remove ```json
|
|
||||||
if cleaned_response.endswith('```'):
|
|
||||||
cleaned_response = cleaned_response[:-3] # Remove ```
|
|
||||||
cleaned_response = cleaned_response.strip()
|
|
||||||
|
|
||||||
query_data = json.loads(cleaned_response)
|
|
||||||
search_query = query_data.get("user_prompt", request.user_prompt)
|
|
||||||
ai_country = query_data.get("country")
|
|
||||||
ai_language = query_data.get("language")
|
|
||||||
ai_topic = query_data.get("topic")
|
|
||||||
ai_time_range = query_data.get("time_range")
|
|
||||||
selection_strategy = query_data.get("selection_strategy", "multiple")
|
|
||||||
selection_criteria = query_data.get("selection_criteria", "relevant URLs")
|
|
||||||
expected_patterns = query_data.get("expected_url_patterns", [])
|
|
||||||
estimated_count = query_data.get("estimated_result_count", request.max_results)
|
|
||||||
|
|
||||||
logger.info(f"AI optimized search query: '{search_query}'")
|
|
||||||
logger.info(f"Selection strategy: {selection_strategy}")
|
|
||||||
logger.info(f"Selection criteria: {selection_criteria}")
|
|
||||||
logger.info(f"Expected URL patterns: {expected_patterns}")
|
|
||||||
logger.info(f"Estimated result count: {estimated_count}")
|
|
||||||
|
|
||||||
except json.JSONDecodeError:
|
|
||||||
logger.warning("Failed to parse AI response as JSON, using original query")
|
|
||||||
search_query = request.user_prompt
|
|
||||||
ai_country = None
|
|
||||||
ai_language = None
|
|
||||||
ai_topic = None
|
|
||||||
ai_time_range = None
|
|
||||||
selection_strategy = "multiple"
|
|
||||||
|
|
||||||
# Perform the web search with AI-determined parameters
|
|
||||||
search_kwargs = {
|
|
||||||
"query": search_query,
|
|
||||||
"max_results": request.max_results,
|
|
||||||
"search_depth": request.options.search_depth,
|
|
||||||
"auto_parameters": False # Use explicit parameters
|
|
||||||
}
|
|
||||||
|
|
||||||
# Add parameters only if they have valid values
|
|
||||||
def _normalizeCountry(c: Optional[str]) -> Optional[str]:
|
|
||||||
if not c:
|
|
||||||
return None
|
|
||||||
s = str(c).strip()
|
|
||||||
if not s or s.lower() in ['null', 'none', 'undefined']:
|
|
||||||
return None
|
|
||||||
# Map common codes to full English names when easy to do without extra deps
|
|
||||||
mapping = {
|
|
||||||
'ch': 'Switzerland', 'che': 'Switzerland',
|
|
||||||
'de': 'Germany', 'ger': 'Germany', 'deu': 'Germany',
|
|
||||||
'at': 'Austria', 'aut': 'Austria',
|
|
||||||
'us': 'United States', 'usa': 'United States', 'uni ted states': 'United States',
|
|
||||||
'uk': 'United Kingdom', 'gb': 'United Kingdom', 'gbr': 'United Kingdom'
|
|
||||||
}
|
|
||||||
key = s.lower()
|
|
||||||
if key in mapping:
|
|
||||||
return mapping[key]
|
|
||||||
# If looks like full name, capitalize first letter only (Tavily accepts English names)
|
|
||||||
return s
|
|
||||||
|
|
||||||
norm_ai_country = _normalizeCountry(ai_country)
|
|
||||||
norm_req_country = _normalizeCountry(request.options.country)
|
|
||||||
if norm_ai_country:
|
|
||||||
search_kwargs["country"] = norm_ai_country
|
|
||||||
elif norm_req_country:
|
|
||||||
search_kwargs["country"] = norm_req_country
|
|
||||||
|
|
||||||
if ai_language and ai_language not in ['null', '', 'none', 'undefined']:
|
|
||||||
search_kwargs["language"] = ai_language
|
|
||||||
elif request.options.language and request.options.language not in ['null', '', 'none', 'undefined']:
|
|
||||||
search_kwargs["language"] = request.options.language
|
|
||||||
|
|
||||||
if ai_topic and ai_topic in ['general', 'news', 'academic']:
|
|
||||||
search_kwargs["topic"] = ai_topic
|
|
||||||
elif request.options.topic and request.options.topic in ['general', 'news', 'academic']:
|
|
||||||
search_kwargs["topic"] = request.options.topic
|
|
||||||
|
|
||||||
if ai_time_range and ai_time_range in ['d', 'w', 'm', 'y']:
|
|
||||||
search_kwargs["time_range"] = ai_time_range
|
|
||||||
elif request.options.time_range and request.options.time_range in ['d', 'w', 'm', 'y']:
|
|
||||||
search_kwargs["time_range"] = request.options.time_range
|
|
||||||
|
|
||||||
# Constrain by expected domains if provided by AI
|
|
||||||
try:
|
|
||||||
include_domains = []
|
|
||||||
for p in expected_patterns or []:
|
|
||||||
if not isinstance(p, str):
|
|
||||||
continue
|
|
||||||
# Extract bare domain from pattern or URL
|
|
||||||
import re
|
|
||||||
m = re.search(r"(?:https?://)?([^/\s]+)", p.strip())
|
|
||||||
if m:
|
|
||||||
domain = m.group(1).lower()
|
|
||||||
# strip leading www.
|
|
||||||
if domain.startswith('www.'):
|
|
||||||
domain = domain[4:]
|
|
||||||
include_domains.append(domain)
|
|
||||||
# Deduplicate
|
|
||||||
if include_domains:
|
|
||||||
seen = set()
|
|
||||||
uniq = []
|
|
||||||
for d in include_domains:
|
|
||||||
if d not in seen:
|
|
||||||
seen.add(d)
|
|
||||||
uniq.append(d)
|
|
||||||
search_kwargs["include_domains"] = uniq
|
|
||||||
except Exception:
|
|
||||||
pass
|
|
||||||
|
|
||||||
# Log the parameters being used
|
|
||||||
logger.info(f"Search parameters: country={search_kwargs.get('country', 'not_set')}, language={search_kwargs.get('language', 'not_set')}, topic={search_kwargs.get('topic', 'not_set')}, time_range={search_kwargs.get('time_range', 'not_set')}, include_domains={search_kwargs.get('include_domains', [])}")
|
|
||||||
|
|
||||||
search_results = await self.aiObjects.search_websites(**search_kwargs)
|
|
||||||
|
|
||||||
logger.debug(f"Web search returned {len(search_results)} results:")
|
|
||||||
for i, result in enumerate(search_results, 1):
|
|
||||||
logger.debug(f" {i}. {result.url} - {result.title}")
|
|
||||||
|
|
||||||
# Deduplicate while preserving order
|
|
||||||
seen = set()
|
|
||||||
search_urls = []
|
|
||||||
for r in search_results:
|
|
||||||
u = str(r.url)
|
|
||||||
if u not in seen:
|
|
||||||
seen.add(u)
|
|
||||||
search_urls.append(u)
|
|
||||||
|
|
||||||
logger.info(f"After initial deduplication: {len(search_urls)} unique URLs from {len(search_results)} search results")
|
|
||||||
|
|
||||||
if not search_urls:
|
|
||||||
logger.error("No relevant websites found")
|
|
||||||
return WebResearchResult(success=False, error="No relevant websites found")
|
|
||||||
|
|
||||||
# Now use AI to determine the main URLs based on user's intention
|
|
||||||
logger.info(f"AI selecting main URLs from {len(search_urls)} search results based on user intent")
|
|
||||||
|
|
||||||
# Create a prompt for AI to identify main URLs based on user's intention
|
|
||||||
ai_prompt = f"""
|
|
||||||
Select the most relevant URLs from these search results:
|
|
||||||
|
|
||||||
{chr(10).join([f"{i+1}. {url}" for i, url in enumerate(search_urls)])}
|
|
||||||
|
|
||||||
Return only the URLs that are most relevant for the user's query.
|
|
||||||
One URL per line.
|
|
||||||
"""
|
|
||||||
# Create AI call request
|
|
||||||
ai_request = AiCallRequest(
|
|
||||||
prompt=ai_prompt,
|
|
||||||
options=AiCallOptions()
|
|
||||||
)
|
|
||||||
|
|
||||||
# Write web research URL selection prompt to debug file
|
|
||||||
self.services.utils.writeDebugFile(ai_prompt, "web_research_url_selection_prompt")
|
|
||||||
|
|
||||||
ai_response_obj = await self.aiObjects.call(ai_request)
|
|
||||||
ai_response = ai_response_obj.content
|
|
||||||
|
|
||||||
# Write web research URL selection response to debug file
|
|
||||||
self.services.utils.writeDebugFile(ai_response, "web_research_url_selection_response")
|
|
||||||
logger.debug(f"AI response for main URL selection: {ai_response}")
|
|
||||||
|
|
||||||
# Parse AI response to extract URLs
|
|
||||||
websites = []
|
|
||||||
for line in ai_response.strip().split('\n'):
|
|
||||||
line = line.strip()
|
|
||||||
if line and ('http://' in line or 'https://' in line):
|
|
||||||
# Extract URL from the line
|
|
||||||
for word in line.split():
|
|
||||||
if word.startswith('http://') or word.startswith('https://'):
|
|
||||||
websites.append(word.rstrip('.,;'))
|
|
||||||
break
|
|
||||||
|
|
||||||
if not websites:
|
|
||||||
logger.warning("AI did not identify any main URLs, using first few search results")
|
|
||||||
websites = search_urls[:3] # Fallback to first 3 search results
|
|
||||||
|
|
||||||
# Deduplicate while preserving order
|
|
||||||
seen = set()
|
|
||||||
unique_websites = []
|
|
||||||
for url in websites:
|
|
||||||
if url not in seen:
|
|
||||||
seen.add(url)
|
|
||||||
unique_websites.append(url)
|
|
||||||
|
|
||||||
websites = unique_websites
|
|
||||||
logger.info(f"After AI selection deduplication: {len(websites)} unique URLs from {len(websites)} AI-selected URLs")
|
|
||||||
|
|
||||||
logger.info(f"AI selected {len(websites)} main URLs (after deduplication):")
|
|
||||||
for i, url in enumerate(websites, 1):
|
|
||||||
logger.info(f" {i}. {url}")
|
|
||||||
|
|
||||||
# Step 2: Smart website selection using AI interface
|
|
||||||
logger.info(f"=== STEP 2: FILTERED URL LIST BY USER PROMPT'S INTENTION ===")
|
|
||||||
logger.info(f"AI analyzing {len(websites)} URLs for relevance to: '{request.user_prompt}'")
|
|
||||||
|
|
||||||
selectedWebsites, aiResponse = await self.aiObjects.selectRelevantWebsites(websites, request.user_prompt)
|
|
||||||
|
|
||||||
logger.debug(f"AI Response: {aiResponse}")
|
|
||||||
logger.debug(f"AI selected {len(selectedWebsites)} most relevant URLs:")
|
|
||||||
for i, url in enumerate(selectedWebsites, 1):
|
|
||||||
logger.debug(f" {i}. {url}")
|
|
||||||
|
|
||||||
# Show which were filtered out
|
|
||||||
filtered_out = [url for url in websites if url not in selectedWebsites]
|
|
||||||
if filtered_out:
|
|
||||||
logger.debug(f"Filtered out {len(filtered_out)} less relevant URLs:")
|
|
||||||
for i, url in enumerate(filtered_out, 1):
|
|
||||||
logger.debug(f" {i}. {url}")
|
|
||||||
|
|
||||||
# Step 3+4+5: Recursive crawling with configurable depth
|
|
||||||
# Get configuration parameters
|
|
||||||
max_depth = int(APP_CONFIG.get("Web_Research_MAX_DEPTH", "2"))
|
|
||||||
max_links_per_domain = int(APP_CONFIG.get("Web_Research_MAX_LINKS_PER_DOMAIN", "4"))
|
|
||||||
crawl_timeout_minutes = int(APP_CONFIG.get("Web_Research_CRAWL_TIMEOUT_MINUTES", "10"))
|
|
||||||
crawl_timeout_seconds = crawl_timeout_minutes * 60
|
|
||||||
|
|
||||||
# Use the configured max_depth or the request's pages_search_depth, whichever is smaller
|
|
||||||
effective_depth = min(max_depth, request.options.pages_search_depth)
|
|
||||||
|
|
||||||
logger.info(f"=== STEP 3+4+5: RECURSIVE CRAWLING (DEPTH {effective_depth}) ===")
|
|
||||||
logger.info(f"Starting recursive crawl of {len(selectedWebsites)} main websites...")
|
|
||||||
logger.info(f"Search depth: {effective_depth} levels (max configured: {max_depth})")
|
|
||||||
logger.info(f"Max links per domain: {max_links_per_domain}")
|
|
||||||
logger.info(f"Crawl timeout: {crawl_timeout_minutes} minutes")
|
|
||||||
|
|
||||||
# Use recursive crawling with URL index to avoid duplicates
|
|
||||||
import asyncio
|
|
||||||
try:
|
|
||||||
allContent = await asyncio.wait_for(
|
|
||||||
self.aiObjects.crawlRecursively(
|
|
||||||
urls=selectedWebsites,
|
|
||||||
max_depth=effective_depth,
|
|
||||||
extract_depth=request.options.extract_depth,
|
|
||||||
max_per_domain=max_links_per_domain,
|
|
||||||
global_processed_urls=global_processed_urls
|
|
||||||
),
|
|
||||||
timeout=crawl_timeout_seconds
|
|
||||||
)
|
|
||||||
logger.info(f"Crawling completed within timeout: {len(allContent)} pages crawled")
|
|
||||||
except asyncio.TimeoutError:
|
|
||||||
logger.warning(f"Crawling timed out after {crawl_timeout_minutes} minutes, using partial results")
|
|
||||||
# crawlRecursively now handles timeouts gracefully and returns partial results
|
|
||||||
# Try to get the partial results that were collected
|
|
||||||
allContent = {}
|
|
||||||
|
|
||||||
if not allContent:
|
|
||||||
logger.error("Could not extract content from any websites")
|
|
||||||
return WebResearchResult(success=False, error="Could not extract content from any websites")
|
|
||||||
|
|
||||||
logger.info(f"=== WEB RESEARCH COMPLETED ===")
|
|
||||||
logger.info(f"Successfully crawled {len(allContent)} URLs total")
|
|
||||||
logger.info(f"Crawl depth: {effective_depth} levels")
|
|
||||||
|
|
||||||
# Create simple result with raw content
|
|
||||||
sources = [{"title": url, "url": url} for url in selectedWebsites]
|
|
||||||
|
|
||||||
# Get all additional links (all URLs except main ones)
|
|
||||||
additional_links = [url for url in allContent.keys() if url not in selectedWebsites]
|
|
||||||
|
|
||||||
# Combine all content into a single result
|
|
||||||
combinedContent = ""
|
|
||||||
for url, content in allContent.items():
|
|
||||||
combinedContent += f"\n\n=== {url} ===\n{content}\n"
|
|
||||||
|
|
||||||
# Create simplified document structure
|
|
||||||
document = {
|
|
||||||
"documentName": f"webResearch_{request.user_prompt[:50]}.json",
|
|
||||||
"documentData": {
|
|
||||||
"user_prompt": request.user_prompt,
|
|
||||||
"analysis_result": combinedContent,
|
|
||||||
"sources": sources,
|
|
||||||
"additional_links": additional_links,
|
|
||||||
"metadata": {
|
|
||||||
"websites_analyzed": len(allContent),
|
|
||||||
"additional_links_found": len(additional_links),
|
|
||||||
"crawl_depth": effective_depth,
|
|
||||||
"max_configured_depth": max_depth,
|
|
||||||
"max_links_per_domain": max_links_per_domain,
|
|
||||||
"crawl_timeout_minutes": crawl_timeout_minutes,
|
|
||||||
"total_urls_crawled": len(allContent),
|
|
||||||
"main_urls": len(selectedWebsites),
|
|
||||||
"additional_urls": len(additional_links)
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"mimeType": "application/json"
|
|
||||||
}
|
|
||||||
|
|
||||||
return WebResearchResult(
|
|
||||||
success=True,
|
|
||||||
documents=[document]
|
|
||||||
)
|
|
||||||
|
|
||||||
except Exception as e:
|
|
||||||
logger.error(f"Error in web research: {str(e)}")
|
|
||||||
return WebResearchResult(success=False, error=str(e))
|
|
||||||
|
|
@@ -10,7 +10,7 @@ from datetime import datetime, UTC
 from modules.workflows.methods.methodBase import MethodBase, action
 from modules.datamodels.datamodelChat import ActionResult
-from modules.datamodels.datamodelAi import AiCallOptions
+from modules.datamodels.datamodelAi import AiCallOptions, OperationTypeEnum
 from modules.datamodels.datamodelChat import ChatDocument
 from modules.aicore.aicorePluginTavily import WebResearchRequest
@@ -28,6 +28,7 @@ class MethodAi(MethodBase):
         """Format current timestamp as YYYYMMDD-hhmmss for filenames."""
         return datetime.now(UTC).strftime("%Y%m%d-%H%M%S")

     @action
     async def process(self, parameters: Dict[str, Any]) -> ActionResult:
         """
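For reference, the timestamp format produced above; the enclosing method name is not visible in this hunk, so the call is shown stand-alone.

from datetime import datetime, UTC

stamp = datetime.now(UTC).strftime("%Y%m%d-%H%M%S")
print(f"web_search_{stamp}.json")  # e.g. web_search_20250101-093000.json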
@ -161,93 +162,512 @@ class MethodAi(MethodBase):
|
||||||
error=str(e)
|
error=str(e)
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
+    @action
+    async def webSearch(self, parameters: Dict[str, Any]) -> ActionResult:
+        """
+        GENERAL:
+        - Purpose: Search the web and return a list of relevant URLs only.
+        - Input requirements: searchPrompt (required); optional maxResults, timeRange, country, language.
+        - Output format: JSON with search results and URLs.
+
+        Parameters:
+        - searchPrompt (str, required): Natural language search prompt describing what to search for.
+        - maxResults (int, optional): Maximum number of search results. Default: 5.
+        - timeRange (str, optional): d | w | m | y for time filtering.
+        - country (str, optional): Country name for localized results.
+        - language (str, optional): Language code (e.g., de, en, fr).
+        """
+        try:
+            searchPrompt = parameters.get("searchPrompt")
+            if not searchPrompt:
+                return ActionResult.isFailure(error="Search prompt is required")
+
+            # Extract optional parameters
+            maxResults = parameters.get("maxResults", 5)
+            timeRange = parameters.get("timeRange")
+            country = parameters.get("country")
+            language = parameters.get("language")
+
+            # Build AI call options for web search
+            options = AiCallOptions(
+                operationType=OperationTypeEnum.WEB_SEARCH,
+                resultFormat="json"
+            )
+
+            # Create unified prompt JSON that both Tavily and Perplexity can understand
+            promptData = {
+                "searchPrompt": searchPrompt,
+                "maxResults": maxResults,
+                "timeRange": timeRange,
+                "country": country,
+                "language": language,
+                "instructions": "Search the web and return a JSON response with a 'results' array containing objects with 'title', 'url', and optionally 'content' fields. Focus on finding relevant URLs for the search prompt."
+            }
+
+            import json
+            prompt = json.dumps(promptData, indent=2)
+
+            # Call AI service through unified path
+            result = await self.services.ai.callAiDocuments(
+                prompt=prompt,
+                documents=None,
+                options=options,
+                outputFormat="json"
+            )
+
+            # Process result to ensure consistent format
+            processedResult = self._processWebSearchResult(result)
+
+            # Create meaningful filename
+            meaningfulName = self._generateMeaningfulFileName(
+                base_name="web_search",
+                extension="json",
+                action_name="search"
+            )
+
+            from modules.datamodels.datamodelChat import ActionDocument
+            actionDocument = ActionDocument(
+                documentName=meaningfulName,
+                documentData=processedResult,
+                mimeType="application/json"
+            )
+
+            return ActionResult.isSuccess(documents=[actionDocument])
+
+        except Exception as e:
+            logger.error(f"Error in web search: {str(e)}")
+            return ActionResult.isFailure(error=str(e))
+
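Taken together, webSearch is a thin wrapper: it validates searchPrompt, folds the remaining parameters into the unified prompt JSON, and hands that string to callAiDocuments. A minimal sketch of the payload one call would produce, using the field names from the code above; the parameter values themselves are illustrative and not taken from the repository:

    import json

    # Illustrative action parameters; names follow the docstring above.
    parameters = {
        "searchPrompt": "open source vector databases",
        "maxResults": 3,
        "timeRange": "m",
        "language": "en",
    }

    # The same wrapping the action performs before calling self.services.ai.callAiDocuments(...).
    promptData = {
        "searchPrompt": parameters["searchPrompt"],
        "maxResults": parameters.get("maxResults", 5),
        "timeRange": parameters.get("timeRange"),
        "country": parameters.get("country"),
        "language": parameters.get("language"),
        "instructions": "Search the web and return a JSON response with a 'results' array ...",
    }
    print(json.dumps(promptData, indent=2))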
+    def _processWebSearchResult(self, result: str) -> str:
+        """
+        Process web search result to ensure consistent JSON format with URL list.
+        Both Tavily and Perplexity now return proper JSON format.
+        """
+        try:
+            import json
+            data = json.loads(result)
+
+            # If it's already a proper search result format, return as-is
+            if isinstance(data, dict) and "results" in data:
+                return result
+
+            # If it's a different JSON format, try to extract URLs
+            if isinstance(data, dict):
+                # Look for URL patterns in the JSON
+                urls = self._extractUrlsFromJson(data)
+                if urls:
+                    processedData = {
+                        "query": data.get("query", "web search"),
+                        "results": [{"title": f"Result {i+1}", "url": url} for i, url in enumerate(urls)],
+                        "total_count": len(urls)
+                    }
+                    return json.dumps(processedData, indent=2)
+
+            # No URLs found, return original result in a structured format
+            processedData = {
+                "query": "web search",
+                "results": [],
+                "total_count": 0,
+                "raw_response": result
+            }
+            return json.dumps(processedData, indent=2)
+
+        except Exception as e:
+            logger.warning(f"Error processing web search result: {str(e)}")
+            # Return original result wrapped in error format
+            errorData = {
+                "query": "web search",
+                "results": [],
+                "total_count": 0,
+                "error": f"Failed to process result: {str(e)}",
+                "raw_response": result
+            }
+            return json.dumps(errorData, indent=2)
+
+    def _extractUrlsFromJson(self, data: Dict[str, Any]) -> List[str]:
+        """Extract URLs from JSON data structure."""
+        urls = []
+
+        def _extractFromValue(value):
+            if isinstance(value, str):
+                # Check if it's a URL
+                if value.startswith(('http://', 'https://')):
+                    urls.append(value)
+            elif isinstance(value, dict):
+                for v in value.values():
+                    _extractFromValue(v)
+            elif isinstance(value, list):
+                for item in value:
+                    _extractFromValue(item)
+
+        _extractFromValue(data)
+        return list(set(urls))  # Remove duplicates
+
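_extractUrlsFromJson walks arbitrarily nested dicts and lists and collects every string that starts with http:// or https://; note that the set() used for de-duplication does not preserve the original ordering. A standalone sketch of the same traversal on sample data, mirroring the helper above:

    from typing import Any, List

    def extract_urls(data: Any) -> List[str]:
        # Recursive walk over dicts, lists, and strings, as in _extractUrlsFromJson above.
        urls: List[str] = []

        def walk(value: Any) -> None:
            if isinstance(value, str):
                if value.startswith(("http://", "https://")):
                    urls.append(value)
            elif isinstance(value, dict):
                for v in value.values():
                    walk(v)
            elif isinstance(value, list):
                for item in value:
                    walk(item)

        walk(data)
        return list(set(urls))  # order is not preserved

    sample = {"answer": "see sources", "sources": [{"url": "https://example.org/a"}, "https://example.org/b"]}
    print(extract_urls(sample))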
+    @action
+    async def webCrawl(self, parameters: Dict[str, Any]) -> ActionResult:
+        """
+        GENERAL:
+        - Purpose: Extract content from specific URLs.
+        - Input requirements: urls (required); optional extractDepth, format.
+        - Output format: JSON with extracted content from URLs.
+
+        Parameters:
+        - urls (list, required): List of URLs to crawl and extract content from.
+        - extractDepth (str, optional): basic | advanced. Default: advanced.
+        - format (str, optional): markdown | html | text. Default: markdown.
+        """
+        try:
+            urls = parameters.get("urls")
+            if not urls or not isinstance(urls, list):
+                return ActionResult.isFailure(error="URLs list is required")
+
+            # Extract optional parameters
+            extractDepth = parameters.get("extractDepth", "advanced")
+            formatType = parameters.get("format", "markdown")
+
+            # Build AI call options for web crawling
+            options = AiCallOptions(
+                operationType=OperationTypeEnum.WEB_CRAWL,
+                resultFormat="json"
+            )
+
+            # Create unified prompt JSON for web crawling
+            promptData = {
+                "urls": urls,
+                "extractDepth": extractDepth,
+                "format": formatType,
+                "instructions": "Extract content from the provided URLs and return a JSON response with 'results' array containing objects with 'url', 'title', 'content', and 'extractedAt' fields."
+            }
+
+            import json
+            prompt = json.dumps(promptData, indent=2)
+
+            # Call AI service through unified path
+            result = await self.services.ai.callAiDocuments(
+                prompt=prompt,
+                documents=None,
+                options=options,
+                outputFormat="json"
+            )
+
+            # Create meaningful filename
+            meaningfulName = self._generateMeaningfulFileName(
+                base_name="web_crawl",
+                extension="json",
+                action_name="crawl"
+            )
+
+            from modules.datamodels.datamodelChat import ActionDocument
+            actionDocument = ActionDocument(
+                documentName=meaningfulName,
+                documentData=result,
+                mimeType="application/json"
+            )
+
+            return ActionResult.isSuccess(documents=[actionDocument])
+
+        except Exception as e:
+            logger.error(f"Error in web crawl: {str(e)}")
+            return ActionResult.isFailure(error=str(e))
+
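webCrawl stores whatever string callAiDocuments returns as an application/json document, so the only schema is the one requested in the 'instructions' field of the prompt. For orientation, a sketch of the shape those instructions ask for; the example values are invented:

    import json

    # Result shape requested by the webCrawl instructions above (values are illustrative).
    expected = {
        "results": [
            {
                "url": "https://example.org/docs",
                "title": "Example documentation page",
                "content": "# Example documentation page\n...",
                "extractedAt": "2025-01-01T00:00:00Z",
            }
        ]
    }
    print(json.dumps(expected, indent=2))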
     @action
     async def webResearch(self, parameters: Dict[str, Any]) -> ActionResult:
         """
         GENERAL:
-        - Purpose: Web research and information gathering with basic analysis and sources.
-        - Input requirements: user_prompt (required); optional urls, max_results, max_pages, search_depth, extract_depth, pages_search_depth, country, time_range, topic, language.
-        - Output format: JSON with results and sources.
+        - Purpose: Comprehensive web research combining search and content extraction.
+        - Input requirements: researchPrompt (required); optional maxResults, urls, timeRange, country, language.
+        - Output format: JSON with research results, sources, and analysis.
 
         Parameters:
-        - user_prompt (str, required): Research question or topic.
-        - urls (list, optional): Specific URLs to crawl.
-        - max_results (int, optional): Max search results. Default: 5.
-        - max_pages (int, optional): Max pages to crawl per site. Default: 5.
-        - extract_depth (str, optional): basic | advanced. Default: advanced.
-        - search_depth (int, optional): Crawl depth level - how many times to follow sublinks of a page. Default: 2.
-        - country (str, optional): Full English country name (ISO-3166; map codes via pycountry/i18n-iso-countries).
-        - time_range (str, optional): d | w | m | y.
-        - topic (str, optional): general | news | academic.
+        - researchPrompt (str, required): Natural language research prompt describing what to research.
+        - maxResults (int, optional): Maximum search results. Default: 5.
+        - urls (list, optional): Specific URLs to include in research.
+        - timeRange (str, optional): d | w | m | y for time filtering.
+        - country (str, optional): Country name for localized results.
         - language (str, optional): Language code (e.g., de, en, fr).
         """
         try:
-            user_prompt = parameters.get("user_prompt")
+            researchPrompt = parameters.get("researchPrompt")
+            if not researchPrompt:
+                return ActionResult.isFailure(error="Research prompt is required")
+
+            # Extract optional parameters
+            maxResults = parameters.get("maxResults", 5)
             urls = parameters.get("urls")
-            max_results = parameters.get("max_results", 5)
-            max_pages = parameters.get("max_pages", 5)
-            extract_depth = parameters.get("extract_depth", "advanced")
-            search_depth = parameters.get("pages_search_depth", 2)
+            timeRange = parameters.get("timeRange")
             country = parameters.get("country")
-            time_range = parameters.get("time_range")
-            topic = parameters.get("topic")
             language = parameters.get("language")
 
-            if not user_prompt:
-                return ActionResult.isFailure(
-                    error="Search query is required"
-                )
-
-            # Build WebResearchRequest (simplified dataclass)
-            request = WebResearchRequest(
-                user_prompt=user_prompt,
-                urls=urls,
-                max_results=max_results,
-                max_pages=max_pages,
-                search_depth=search_depth,
-                extract_depth=extract_depth,
-                country=country,
-                time_range=time_range,
-                topic=topic,
-                language=language
-            )
-
-            # Call web research service
-            logger.info(f"Performing comprehensive web research for: {user_prompt}")
-            logger.info(f"Max results: {max_results}, Max pages: {max_pages}")
-            if urls:
-                logger.info(f"Using provided URLs: {len(urls)}")
-
-            result = await self.services.ai.webResearch(request)
-
-            if not result.success:
-                return ActionResult.isFailure(error=result.error)
-
-            # Convert WebResearchResult to ActionResult format
-            documents = []
-            for doc in result.documents:
-                documents.append({
-                    "documentName": doc.documentName,
-                    "documentData": {
-                        "user_prompt": doc.documentData.user_prompt,
-                        "websites_analyzed": doc.documentData.websites_analyzed,
-                        "additional_links_found": doc.documentData.additional_links_found,
-                        "analysis_result": doc.documentData.analysis_result,
-                        "sources": [{"title": s.title, "url": str(s.url)} for s in doc.documentData.sources],
-                        "additional_links": doc.documentData.additional_links,
-                        "debug_info": doc.documentData.debug_info
-                    },
-                    "mimeType": doc.mimeType
-                })
-
-            # Return result in the standard ActionResult format
-            return ActionResult.isSuccess(
-                documents=documents
-            )
+            # Build AI call options for web research
+            options = AiCallOptions(
+                operationType=OperationTypeEnum.WEB_RESEARCH,
+                resultFormat="json"
+            )
+
+            # Create unified prompt JSON for web research
+            promptData = {
+                "researchPrompt": researchPrompt,
+                "maxResults": maxResults,
+                "urls": urls,
+                "timeRange": timeRange,
+                "country": country,
+                "language": language,
+                "instructions": "Conduct comprehensive web research and return a JSON response with 'results' array containing objects with 'title', 'url', 'content', and 'analysis' fields. Provide detailed analysis and insights."
+            }
+
+            import json
+            prompt = json.dumps(promptData, indent=2)
+
+            # Call AI service through unified path
+            result = await self.services.ai.callAiDocuments(
+                prompt=prompt,
+                documents=None,
+                options=options,
+                outputFormat="json"
+            )
+
+            # Create meaningful filename
+            meaningfulName = self._generateMeaningfulFileName(
+                base_name="web_research",
+                extension="json",
+                action_name="research"
+            )
+
+            from modules.datamodels.datamodelChat import ActionDocument
+            actionDocument = ActionDocument(
+                documentName=meaningfulName,
+                documentData=result,
+                mimeType="application/json"
+            )
+
+            return ActionResult.isSuccess(documents=[actionDocument])
 
         except Exception as e:
             logger.error(f"Error in web research: {str(e)}")
-            return ActionResult.isFailure(
-                error=str(e)
-            )
+            return ActionResult.isFailure(error=str(e))
 
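The substance of the webResearch change is the call site: the removed code built a WebResearchRequest and went through the dedicated research service, while the added code serializes the same intent as a unified prompt JSON and routes it through callAiDocuments like the other web operations. A small runnable sketch contrasting the two payloads; field names come from the code above, the values are illustrative:

    import json

    researchPrompt = "compare vector databases for RAG workloads"  # illustrative topic

    # Fields of the removed WebResearchRequest path (a representative subset).
    oldRequest = {
        "user_prompt": researchPrompt,
        "max_results": 5,
        "max_pages": 5,
        "search_depth": 2,
        "extract_depth": "advanced",
    }

    # Unified prompt JSON built by the added code.
    newPrompt = {
        "researchPrompt": researchPrompt,
        "maxResults": 5,
        "urls": None,
        "timeRange": None,
        "country": None,
        "language": None,
        "instructions": "Conduct comprehensive web research and return a JSON response ...",
    }

    print(json.dumps(oldRequest, indent=2))
    print(json.dumps(newPrompt, indent=2))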
+    @action
+    async def webQuestions(self, parameters: Dict[str, Any]) -> ActionResult:
+        """
+        GENERAL:
+        - Purpose: Answer questions using web research and AI analysis.
+        - Input requirements: question (required); optional context, maxResults, timeRange, country, language.
+        - Output format: JSON with question answer and supporting sources.
+
+        Parameters:
+        - question (str, required): Question to be answered using web research.
+        - context (str, optional): Additional context for the question.
+        - maxResults (int, optional): Maximum search results. Default: 5.
+        - timeRange (str, optional): d | w | m | y for time filtering.
+        - country (str, optional): Country name for localized results.
+        - language (str, optional): Language code (e.g., de, en, fr).
+        """
+        try:
+            question = parameters.get("question")
+            if not question:
+                return ActionResult.isFailure(error="Question is required")
+
+            # Extract optional parameters
+            context = parameters.get("context", "")
+            maxResults = parameters.get("maxResults", 5)
+            timeRange = parameters.get("timeRange")
+            country = parameters.get("country")
+            language = parameters.get("language")
+
+            # Build AI call options for web questions
+            options = AiCallOptions(
+                operationType=OperationTypeEnum.WEB_QUESTIONS,
+                resultFormat="json"
+            )
+
+            # Create unified prompt JSON for web questions
+            promptData = {
+                "question": question,
+                "context": context,
+                "maxResults": maxResults,
+                "timeRange": timeRange,
+                "country": country,
+                "language": language,
+                "instructions": "Answer the question using web research and return a JSON response with 'answer', 'sources' array containing objects with 'title', 'url', 'content', and 'relevance' fields."
+            }
+
+            import json
+            prompt = json.dumps(promptData, indent=2)
+
+            # Call AI service through unified path
+            result = await self.services.ai.callAiDocuments(
+                prompt=prompt,
+                documents=None,
+                options=options,
+                outputFormat="json"
+            )
+
+            # Create meaningful filename
+            meaningfulName = self._generateMeaningfulFileName(
+                base_name="web_questions",
+                extension="json",
+                action_name="questions"
+            )
+
+            from modules.datamodels.datamodelChat import ActionDocument
+            actionDocument = ActionDocument(
+                documentName=meaningfulName,
+                documentData=result,
+                mimeType="application/json"
+            )
+
+            return ActionResult.isSuccess(documents=[actionDocument])
+
+        except Exception as e:
+            logger.error(f"Error in web questions: {str(e)}")
+            return ActionResult.isFailure(error=str(e))
+
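webQuestions stores whatever JSON string the connector returns directly in the ActionDocument, so a consumer still has to parse it against the shape named in the instructions field. A defensive sketch of that parsing; the sample payload is invented for illustration:

    import json

    raw = '{"answer": "42", "sources": [{"title": "Example", "url": "https://example.org", "relevance": 0.9}]}'

    try:
        data = json.loads(raw)
        answer = data.get("answer", "")
        sources = data.get("sources", [])
    except json.JSONDecodeError:
        # Connectors are only asked, not forced, to return JSON, so fall back gracefully.
        answer, sources = raw, []

    print(answer, [s.get("url") for s in sources])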
+    @action
+    async def webNews(self, parameters: Dict[str, Any]) -> ActionResult:
+        """
+        GENERAL:
+        - Purpose: Search and analyze news articles on specific topics.
+        - Input requirements: newsPrompt (required); optional maxResults, timeRange, country, language.
+        - Output format: JSON with news articles, summaries, and analysis.
+
+        Parameters:
+        - newsPrompt (str, required): Natural language prompt describing what news to search for.
+        - maxResults (int, optional): Maximum news articles. Default: 5.
+        - timeRange (str, optional): d | w | m | y for time filtering. Default: w.
+        - country (str, optional): Country name for localized news.
+        - language (str, optional): Language code (e.g., de, en, fr).
+        """
+        try:
+            newsPrompt = parameters.get("newsPrompt")
+            if not newsPrompt:
+                return ActionResult.isFailure(error="News prompt is required")
+
+            # Extract optional parameters
+            maxResults = parameters.get("maxResults", 5)
+            timeRange = parameters.get("timeRange", "w") # Default to week
+            country = parameters.get("country")
+            language = parameters.get("language")
+
+            # Build AI call options for web news
+            options = AiCallOptions(
+                operationType=OperationTypeEnum.WEB_NEWS,
+                resultFormat="json"
+            )
+
+            # Create unified prompt JSON for web news
+            promptData = {
+                "newsPrompt": newsPrompt,
+                "maxResults": maxResults,
+                "timeRange": timeRange,
+                "country": country,
+                "language": language,
+                "instructions": "Find and analyze recent news articles and return a JSON response with 'articles' array containing objects with 'title', 'url', 'content', 'date', 'source', and 'summary' fields."
+            }
+
+            import json
+            prompt = json.dumps(promptData, indent=2)
+
+            # Call AI service through unified path
+            result = await self.services.ai.callAiDocuments(
+                prompt=prompt,
+                documents=None,
+                options=options,
+                outputFormat="json"
+            )
+
+            # Create meaningful filename
+            meaningfulName = self._generateMeaningfulFileName(
+                base_name="web_news",
+                extension="json",
+                action_name="news"
+            )
+
+            from modules.datamodels.datamodelChat import ActionDocument
+            actionDocument = ActionDocument(
+                documentName=meaningfulName,
+                documentData=result,
+                mimeType="application/json"
+            )
+
+            return ActionResult.isSuccess(documents=[actionDocument])
+
+        except Exception as e:
+            logger.error(f"Error in web news: {str(e)}")
+            return ActionResult.isFailure(error=str(e))
+
+    @action
+    async def generateImage(self, parameters: Dict[str, Any]) -> ActionResult:
+        """
+        GENERAL:
+        - Purpose: Generate images using AI based on text prompts.
+        - Input requirements: prompt (required); optional size, quality, style.
+        - Output format: Base64 encoded image data.
+
+        Parameters:
+        - prompt (str, required): Text description of the image to generate.
+        - size (str, optional): Image size. Options: 1024x1024, 1792x1024, 1024x1792. Default: 1024x1024.
+        - quality (str, optional): Image quality. Options: standard, hd. Default: standard.
+        - style (str, optional): Image style. Options: vivid, natural. Default: vivid.
+        """
+        try:
+            prompt = parameters.get("prompt")
+            if not prompt:
+                return ActionResult.isFailure(error="Image prompt is required")
+
+            # Extract optional parameters
+            size = parameters.get("size", "1024x1024")
+            quality = parameters.get("quality", "standard")
+            style = parameters.get("style", "vivid")
+
+            # Build AI call options for image generation
+            options = AiCallOptions(
+                operationType=OperationTypeEnum.IMAGE_GENERATE,
+                resultFormat="base64"
+            )
+
+            # Create unified prompt JSON for image generation
+            promptData = {
+                "prompt": prompt,
+                "size": size,
+                "quality": quality,
+                "style": style,
+                "instructions": "Generate an image based on the prompt and return the base64 encoded image data."
+            }
+
+            import json
+            promptJson = json.dumps(promptData, indent=2)
+
+            # Call AI service through unified path
+            result = await self.services.ai.callAiDocuments(
+                prompt=promptJson,
+                documents=None,
+                options=options,
+                outputFormat="base64"
+            )
+
+            # Create meaningful filename
+            meaningfulName = self._generateMeaningfulFileName(
+                base_name="generated_image",
+                extension="png",
+                action_name="generate"
+            )
+
+            from modules.datamodels.datamodelChat import ActionDocument
+            actionDocument = ActionDocument(
+                documentName=meaningfulName,
+                documentData=result,
+                mimeType="image/png"
+            )
+
+            return ActionResult.isSuccess(documents=[actionDocument])
+
+        except Exception as e:
+            logger.error(f"Error in image generation: {str(e)}")
+            return ActionResult.isFailure(error=str(e))
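generateImage expects callAiDocuments to hand back base64 image data and stores it as a PNG ActionDocument. Assuming the returned value is a raw base64 string, writing it to disk would look roughly like the sketch below; the function and variable names are illustrative, not part of the repository API:

    import base64

    def save_png(base64_data: str, path: str) -> None:
        # Decode the base64 payload produced by the image connector and write it as a PNG file.
        with open(path, "wb") as handle:
            handle.write(base64.b64decode(base64_data))

    # save_png(result, "generated_image.png")  # 'result' as returned by callAiDocuments above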