End-to-end implementation of all AI models

This commit is contained in:
ValueOn AG 2025-10-25 01:46:33 +02:00
parent daddf417be
commit 8d25ed6fc3
7 changed files with 1715 additions and 1095 deletions

View file

@@ -317,10 +317,17 @@ class AiOpenai(BaseConnectorAi):
messages = modelCall.messages
model = modelCall.model
options = modelCall.options
prompt = messages[0]["content"] if messages else ""
size = options.get("size", "1024x1024")
quality = options.get("quality", "standard")
style = options.get("style", "vivid")
# Parse unified prompt JSON format
promptContent = messages[0]["content"] if messages else ""
import json
try:
promptData = json.loads(promptContent)
except (json.JSONDecodeError, TypeError):
# Non-JSON content: fall back to the plain-text prompt via the .get() defaults below
promptData = {}
# Extract parameters from unified prompt JSON
prompt = promptData.get("prompt", promptContent)
size = promptData.get("size", "1024x1024")
quality = promptData.get("quality", "standard")
style = promptData.get("style", "vivid")
logger.debug(f"Starting image generation with prompt: '{prompt[:100]}...'")

View file

@@ -80,7 +80,7 @@ class AiPerplexity(BaseConnectorAi):
speedRating=6, # Slower due to AI analysis
qualityRating=10, # Best AI analysis quality
# capabilities removed (not used in business logic)
functionCall=self.callAiWithWebSearch,
functionCall=self.callWebOperation,
priority=PriorityEnum.QUALITY,
processingMode=ProcessingModeEnum.DETAILED,
operationTypes=createOperationTypeRatings(
@@ -106,7 +106,7 @@ class AiPerplexity(BaseConnectorAi):
speedRating=9, # Fast for basic AI tasks
qualityRating=7, # Good but not premium quality
# capabilities removed (not used in business logic)
functionCall=self.researchTopic,
functionCall=self.callWebOperation,
priority=PriorityEnum.COST,
processingMode=ProcessingModeEnum.BASIC,
operationTypes=createOperationTypeRatings(
@@ -132,7 +132,7 @@ class AiPerplexity(BaseConnectorAi):
speedRating=9, # Fast for Q&A tasks
qualityRating=7, # Good but not premium quality
# capabilities removed (not used in business logic)
functionCall=self.answerQuestion,
functionCall=self.callWebOperation,
priority=PriorityEnum.COST,
processingMode=ProcessingModeEnum.BASIC,
operationTypes=createOperationTypeRatings(
@@ -158,7 +158,7 @@ class AiPerplexity(BaseConnectorAi):
speedRating=9, # Fast for news tasks
qualityRating=7, # Good but not premium quality
# capabilities removed (not used in business logic)
functionCall=self.getCurrentNews,
functionCall=self.callWebOperation,
priority=PriorityEnum.COST,
processingMode=ProcessingModeEnum.BASIC,
operationTypes=createOperationTypeRatings(
@@ -254,9 +254,48 @@ class AiPerplexity(BaseConnectorAi):
temperature = options.get("temperature", model.temperature)
maxTokens = model.maxTokens
# Parse unified prompt JSON format
promptContent = messages[0]["content"] if messages else ""
import json
try:
promptData = json.loads(promptContent)
except (json.JSONDecodeError, TypeError):
# Non-JSON content: fall back to the plain-text prompt via the .get() defaults below
promptData = {}
# Create a more specific prompt for Perplexity based on the unified format
searchPrompt = promptData.get("searchPrompt", promptContent)
maxResults = promptData.get("maxResults", 5)
timeRange = promptData.get("timeRange")
country = promptData.get("country")
language = promptData.get("language")
# Create enhanced prompt for Perplexity
enhancedPrompt = f"""Search the web for: {searchPrompt}
Please provide a comprehensive response with relevant URLs and information.
Focus on finding {maxResults} most relevant results.
{f"Limit results to the last {timeRange}" if timeRange else ""}
{f"Focus on {country}" if country else ""}
{f"Provide results in {language}" if language else ""}
Please format your response as a JSON object with the following structure:
{{
"query": "{searchPrompt}",
"results": [
{{
"title": "Result title",
"url": "https://example.com",
"content": "Brief description or excerpt"
}}
],
"total_count": number_of_results
}}
Include actual URLs in your response."""
# Update the messages with the enhanced prompt
enhancedMessages = [{"role": "user", "content": enhancedPrompt}]
payload = {
"model": model.name,
"messages": messages,
"messages": enhancedMessages,
"temperature": temperature,
"max_tokens": maxTokens
}
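# Sketch of the unified search payload parsed above; the keys match the
# promptData.get(...) calls, the example values are assumptions.
import json
searchContent = json.dumps({
"searchPrompt": "EU AI Act implementation deadlines",
"maxResults": 5,
"timeRange": "m",
"country": "Germany",
"language": "en"
})
messages = [{"role": "user", "content": searchContent}]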
@@ -472,6 +511,423 @@ class AiPerplexity(BaseConnectorAi):
logger.error(f"Error getting current news: {str(e)}")
raise HTTPException(status_code=500, detail=f"Error getting current news: {str(e)}")
async def crawl(self, modelCall: AiModelCall) -> AiModelResponse:
"""
Crawl URLs using Perplexity's web search capabilities for content extraction.
Args:
modelCall: AiModelCall with messages and options
Returns:
AiModelResponse with content and metadata
"""
try:
# Extract parameters from modelCall
messages = modelCall.messages
model = modelCall.model
options = modelCall.options
temperature = options.get("temperature", model.temperature)
maxTokens = model.maxTokens
# Parse unified prompt JSON format
promptContent = messages[0]["content"] if messages else ""
import json
try:
promptData = json.loads(promptContent)
except (json.JSONDecodeError, TypeError):
# Non-JSON prompt carries no URL list; the empty-URLs check below reports it
promptData = {}
# Extract parameters from unified prompt JSON
urls = promptData.get("urls", [])
extractDepth = promptData.get("extractDepth", "advanced")
formatType = promptData.get("format", "markdown")
if not urls:
return AiModelResponse(
content="No URLs provided for crawling",
success=False,
error="No URLs found in prompt data"
)
# Create enhanced prompt for Perplexity to crawl URLs
urlsList = ", ".join(urls)
enhancedPrompt = f"""Please extract and analyze content from these URLs: {urlsList}
Extraction requirements:
- Extract depth: {extractDepth}
- Output format: {formatType}
- Focus on main content, not navigation or ads
- Preserve important structure and formatting
Please format your response as a JSON object with the following structure:
{{
"urls": {json.dumps(urls)},
"results": [
{{
"url": "https://example.com",
"title": "Page title",
"content": "Extracted content in {formatType} format",
"extractedAt": "2024-01-01T00:00:00Z"
}}
],
"total_count": number_of_urls_processed
}}
Extract content from each URL and provide detailed analysis."""
# Update the messages with the enhanced prompt
enhancedMessages = [{"role": "user", "content": enhancedPrompt}]
payload = {
"model": model.name,
"messages": enhancedMessages,
"temperature": temperature,
"max_tokens": maxTokens
}
response = await self.httpClient.post(
model.apiUrl,
json=payload
)
if response.status_code != 200:
error_detail = f"Perplexity Crawl API error: {response.status_code} - {response.text}"
logger.error(error_detail)
if response.status_code == 429:
error_message = "Rate limit exceeded for crawl. Please wait before making another request."
elif response.status_code == 401:
error_message = "Invalid API key for crawl. Please check your Perplexity API configuration."
elif response.status_code == 400:
error_message = f"Invalid request to Perplexity Crawl API: {response.text}"
else:
error_message = f"Perplexity Crawl API error ({response.status_code}): {response.text}"
raise HTTPException(status_code=500, detail=error_message)
responseJson = response.json()
content = responseJson["choices"][0]["message"]["content"]
return AiModelResponse(
content=content,
success=True,
modelId=model.name,
metadata={"response_id": responseJson.get("id", "")}
)
except Exception as e:
logger.error(f"Error calling Perplexity Crawl API: {str(e)}")
raise HTTPException(status_code=500, detail=f"Error calling Perplexity Crawl API: {str(e)}")
async def callWebOperation(self, modelCall: AiModelCall) -> AiModelResponse:
"""
Universal web operation handler that dispatches to the correct method
based on the operationType from AiCallOptions.
"""
try:
options = modelCall.options
operationType = options.get("operationType")
if operationType == "WEB_SEARCH":
return await self.callAiWithWebSearch(modelCall)
elif operationType == "WEB_CRAWL":
return await self.crawl(modelCall)
elif operationType == "WEB_RESEARCH":
return await self.research(modelCall)
elif operationType == "WEB_QUESTIONS":
return await self.questions(modelCall)
elif operationType == "WEB_NEWS":
return await self.news(modelCall)
else:
# Fallback to research for unknown operation types
return await self.research(modelCall)
except Exception as e:
return AiModelResponse(
content="",
success=False,
error=str(e)
)
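# Usage sketch (wiring assumed): dispatch is keyed solely on
# options["operationType"], with research() as the fallback:
#   "WEB_SEARCH"    -> callAiWithWebSearch()
#   "WEB_CRAWL"     -> crawl()
#   "WEB_RESEARCH"  -> research()
#   "WEB_QUESTIONS" -> questions()
#   "WEB_NEWS"      -> news()
#   anything else   -> research()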
async def research(self, modelCall: AiModelCall) -> AiModelResponse:
"""
Research topics using Perplexity's web search capabilities.
Args:
modelCall: AiModelCall with messages and options
Returns:
AiModelResponse with research content and metadata
"""
try:
# Extract parameters from modelCall
messages = modelCall.messages
model = modelCall.model
options = modelCall.options
temperature = options.get("temperature", model.temperature)
maxTokens = model.maxTokens
# Parse unified prompt JSON format
promptContent = messages[0]["content"] if messages else ""
import json
try:
promptData = json.loads(promptContent)
except (json.JSONDecodeError, TypeError):
# Non-JSON content: fall back to the plain-text prompt via the .get() defaults below
promptData = {}
# Extract parameters from unified prompt JSON
researchPrompt = promptData.get("researchPrompt", promptContent)
maxResults = promptData.get("maxResults", 8)
timeRange = promptData.get("timeRange")
country = promptData.get("country")
language = promptData.get("language")
# Create enhanced prompt for research
enhancedPrompt = f"""Conduct comprehensive research on: {researchPrompt}
Research requirements:
- Provide detailed analysis and insights
- Include multiple perspectives and sources
- Focus on finding {maxResults} most relevant sources
{f"Limit results to the last {timeRange}" if timeRange else ""}
{f"Focus on {country}" if country else ""}
{f"Provide results in {language}" if language else ""}
Please format your response as a JSON object with the following structure:
{{
"query": "{researchPrompt}",
"research_results": [
{{
"title": "Source title",
"url": "https://example.com",
"summary": "Brief summary",
"content": "Detailed content",
"extractedAt": "2024-01-01T00:00:00Z"
}}
],
"total_count": number_of_sources,
"operation_type": "research"
}}
Provide comprehensive research with detailed analysis."""
# Update the messages with the enhanced prompt
enhancedMessages = [{"role": "user", "content": enhancedPrompt}]
payload = {
"model": model.name,
"messages": enhancedMessages,
"temperature": temperature,
"max_tokens": maxTokens
}
response = await self.httpClient.post(
model.apiUrl,
json=payload
)
if response.status_code != 200:
error_detail = f"Perplexity Research API error: {response.status_code} - {response.text}"
logger.error(error_detail)
raise HTTPException(status_code=500, detail=error_detail)
responseJson = response.json()
content = responseJson["choices"][0]["message"]["content"]
return AiModelResponse(
content=content,
success=True,
modelId=model.name,
metadata={"response_id": responseJson.get("id", "")}
)
except Exception as e:
logger.error(f"Error calling Perplexity Research API: {str(e)}")
raise HTTPException(status_code=500, detail=f"Error calling Perplexity Research API: {str(e)}")
async def questions(self, modelCall: AiModelCall) -> AiModelResponse:
"""
Answer questions using Perplexity's web search capabilities.
Args:
modelCall: AiModelCall with messages and options
Returns:
AiModelResponse with answer and supporting sources
"""
try:
# Extract parameters from modelCall
messages = modelCall.messages
model = modelCall.model
options = modelCall.options
temperature = options.get("temperature", model.temperature)
maxTokens = model.maxTokens
# Parse unified prompt JSON format
promptContent = messages[0]["content"] if messages else ""
import json
try:
promptData = json.loads(promptContent)
except (json.JSONDecodeError, TypeError):
# Non-JSON content: fall back to the plain-text prompt via the .get() defaults below
promptData = {}
# Extract parameters from unified prompt JSON
question = promptData.get("question", promptContent)
context = promptData.get("context", "")
maxResults = promptData.get("maxResults", 6)
timeRange = promptData.get("timeRange")
country = promptData.get("country")
language = promptData.get("language")
# Create enhanced prompt for questions
contextText = f"\nAdditional context: {context}" if context else ""
enhancedPrompt = f"""Answer this question using web research: {question}{contextText}
Answer requirements:
- Provide a comprehensive answer with supporting evidence
- Include {maxResults} most relevant sources
- Cite sources with URLs
{f"Focus on recent information (last {timeRange})" if timeRange else ""}
{f"Focus on {country}" if country else ""}
{f"Provide answer in {language}" if language else ""}
Please format your response as a JSON object with the following structure:
{{
"question": "{question}",
"answer": "Comprehensive answer to the question",
"answer_sources": [
{{
"title": "Source title",
"url": "https://example.com",
"summary": "Brief summary",
"content": "Relevant content excerpt",
"relevance": "Why this source is relevant"
}}
],
"total_count": number_of_sources,
"operation_type": "questions"
}}
Provide a detailed answer with well-cited sources."""
# Update the messages with the enhanced prompt
enhancedMessages = [{"role": "user", "content": enhancedPrompt}]
payload = {
"model": model.name,
"messages": enhancedMessages,
"temperature": temperature,
"max_tokens": maxTokens
}
response = await self.httpClient.post(
model.apiUrl,
json=payload
)
if response.status_code != 200:
error_detail = f"Perplexity Questions API error: {response.status_code} - {response.text}"
logger.error(error_detail)
raise HTTPException(status_code=500, detail=error_detail)
responseJson = response.json()
content = responseJson["choices"][0]["message"]["content"]
return AiModelResponse(
content=content,
success=True,
modelId=model.name,
metadata={"response_id": responseJson.get("id", "")}
)
except Exception as e:
logger.error(f"Error calling Perplexity Questions API: {str(e)}")
raise HTTPException(status_code=500, detail=f"Error calling Perplexity Questions API: {str(e)}")
async def news(self, modelCall: AiModelCall) -> AiModelResponse:
"""
Search and analyze news using Perplexity's web search capabilities.
Args:
modelCall: AiModelCall with messages and options
Returns:
AiModelResponse with news articles and analysis
"""
try:
# Extract parameters from modelCall
messages = modelCall.messages
model = modelCall.model
options = modelCall.options
temperature = options.get("temperature", model.temperature)
maxTokens = model.maxTokens
# Parse unified prompt JSON format
promptContent = messages[0]["content"] if messages else ""
import json
try:
promptData = json.loads(promptContent)
except (json.JSONDecodeError, TypeError):
# Non-JSON content: fall back to the plain-text prompt via the .get() defaults below
promptData = {}
# Extract parameters from unified prompt JSON
newsPrompt = promptData.get("newsPrompt", promptContent)
maxResults = promptData.get("maxResults", 10)
timeRange = promptData.get("timeRange", "w") # Default to week for news
country = promptData.get("country")
language = promptData.get("language")
# Create enhanced prompt for news
enhancedPrompt = f"""Find and analyze recent news about: {newsPrompt}
News requirements:
- Find {maxResults} most recent and relevant news articles
- Focus on the last {timeRange} (recent news)
- Include diverse sources and perspectives
{f"Focus on news from {country}" if country else ""}
{f"Provide news in {language}" if language else ""}
Please format your response as a JSON object with the following structure:
{{
"news_query": "{newsPrompt}",
"articles": [
{{
"title": "Article title",
"url": "https://example.com",
"content": "Article content",
"date": "2024-01-01",
"source": "News source name",
"summary": "Brief summary of the article"
}}
],
"total_count": number_of_articles,
"operation_type": "news"
}}
Provide comprehensive news coverage with analysis."""
# Update the messages with the enhanced prompt
enhancedMessages = [{"role": "user", "content": enhancedPrompt}]
payload = {
"model": model.name,
"messages": enhancedMessages,
"temperature": temperature,
"max_tokens": maxTokens
}
response = await self.httpClient.post(
model.apiUrl,
json=payload
)
if response.status_code != 200:
error_detail = f"Perplexity News API error: {response.status_code} - {response.text}"
logger.error(error_detail)
raise HTTPException(status_code=500, detail=error_detail)
responseJson = response.json()
content = responseJson["choices"][0]["message"]["content"]
return AiModelResponse(
content=content,
success=True,
modelId=model.name,
metadata={"response_id": responseJson.get("id", "")}
)
except Exception as e:
logger.error(f"Error calling Perplexity News API: {str(e)}")
raise HTTPException(status_code=500, detail=f"Error calling Perplexity News API: {str(e)}")
async def _testConnection(self) -> bool:
"""
Tests the connection to the Perplexity API.

View file

@@ -5,7 +5,7 @@ import logging
import asyncio
import re
from dataclasses import dataclass
from typing import Optional, List
from typing import Optional, List, Dict
from tavily import AsyncTavilyClient
from modules.shared.configuration import APP_CONFIG
from modules.aicore.aicoreBase import BaseConnectorAi
@@ -88,6 +88,251 @@ class ConnectorWeb(BaseConnectorAi):
return unique_urls
def _intelligentUrlFiltering(self, searchResults: List[WebSearchResult], query: str, maxResults: int) -> List[WebSearchResult]:
"""
Intelligent URL filtering with de-duplication and relevance scoring.
Args:
searchResults: Raw search results from Tavily
query: Original search query for relevance scoring
maxResults: Maximum number of results to return
Returns:
Filtered and deduplicated list of search results
"""
if not searchResults:
return []
# Step 1: Basic de-duplication by URL
seenUrls = set()
uniqueResults = []
for result in searchResults:
# Normalize URL for better deduplication
normalizedUrl = self._normalizeUrl(result.url)
if normalizedUrl not in seenUrls:
seenUrls.add(normalizedUrl)
uniqueResults.append(result)
logger.info(f"After basic deduplication: {len(uniqueResults)} unique URLs from {len(searchResults)} original")
# Step 2: Relevance scoring and filtering
scoredResults = []
queryWords = set(query.lower().split())
for result in uniqueResults:
score = self._calculateRelevanceScore(result, queryWords)
scoredResults.append((score, result))
# Step 3: Sort by relevance score (higher is better)
scoredResults.sort(key=lambda x: x[0], reverse=True)
# Step 4: Take top results
filteredResults = [result for score, result in scoredResults[:maxResults]]
logger.info(f"After intelligent filtering: {len(filteredResults)} results selected from {len(uniqueResults)} unique")
return filteredResults
def _normalizeUrl(self, url: str) -> str:
"""
Normalize URL for better deduplication.
Removes common variations that represent the same content.
"""
if not url:
return url
# Remove trailing slashes
url = url.rstrip('/')
# Remove common query parameters that don't affect content
import urllib.parse
parsed = urllib.parse.urlparse(url)
# Remove common tracking parameters
queryParams = urllib.parse.parse_qs(parsed.query)
filteredParams = {}
for key, values in queryParams.items():
# Keep important parameters, remove tracking ones
if key.lower() not in ['utm_source', 'utm_medium', 'utm_campaign', 'utm_term', 'utm_content',
'fbclid', 'gclid', 'ref', 'source', 'campaign']:
filteredParams[key] = values
# Rebuild query string
filteredQuery = urllib.parse.urlencode(filteredParams, doseq=True)
# Reconstruct URL
normalized = urllib.parse.urlunparse((
parsed.scheme,
parsed.netloc,
parsed.path,
parsed.params,
filteredQuery,
parsed.fragment
))
return normalized
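# Worked example (inputs assumed): tracking parameters are dropped while
# meaningful ones survive, and trailing slashes are stripped, e.g.
#   _normalizeUrl("https://example.com/a/?utm_source=nl&page=2")
#     -> "https://example.com/a/?page=2"
#   _normalizeUrl("https://example.com/a/") -> "https://example.com/a"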
def _calculateRelevanceScore(self, result: WebSearchResult, queryWords: set) -> float:
"""
Calculate relevance score for a search result.
Higher score means more relevant to the query.
"""
score = 0.0
# Title relevance (most important)
titleWords = set(result.title.lower().split())
titleMatches = len(queryWords.intersection(titleWords))
score += titleMatches * 3.0 # Weight title matches heavily
# URL relevance
urlWords = set(result.url.lower().split('/'))
urlMatches = len(queryWords.intersection(urlWords))
score += urlMatches * 1.5
# Content relevance (if available)
if hasattr(result, 'raw_content') and result.raw_content:
contentWords = set(result.raw_content.lower().split())
contentMatches = len(queryWords.intersection(contentWords))
score += contentMatches * 0.1 # Lower weight for content matches
# Domain authority bonus (simple heuristic)
from urllib.parse import urlparse
domain = urlparse(result.url).netloc or result.url
if any(auth_domain in domain.lower() for auth_domain in
['wikipedia.org', 'github.com', 'stackoverflow.com', 'reddit.com', 'medium.com']):
score += 1.0
# Penalty for very long URLs (often less relevant)
if len(result.url) > 100:
score -= 0.5
return score
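# Worked example (inputs assumed): for queryWords = {"rust", "async"} and a
# result titled "Async programming in Rust" at
# https://example.com/rust/async-intro:
#   2 title matches * 3.0 = 6.0, plus 1 URL segment match ("rust") * 1.5 = 1.5,
#   no raw_content, no authority domain, URL under 100 chars -> score 7.5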
async def _optimizeSearchQuery(self, query: str, timeRange: str = None, country: str = None, language: str = None) -> tuple[str, dict]:
"""
Use AI to optimize search query and parameters (from old SubWebResearch).
Args:
query: Original search query
timeRange: Time range filter
country: Country filter
language: Language filter
Returns:
Tuple of (optimized_query, optimized_parameters)
"""
try:
# Create AI prompt for query optimization (from old code)
queryOptimizerPrompt = f"""You are a search query optimizer.
USER QUERY: {query}
Your task: Create a search query and parameters for the USER QUERY given.
RULES:
1. The search query MUST be related to the user query above
2. Extract key terms from the user query
3. Determine appropriate country/language based on the query context
4. Keep search query short (2-6 words)
Return ONLY this JSON format:
{{
"user_prompt": "search query based on user query above",
"country": "Full English country name (ISO-3166; map codes via pycountry/i18n-iso-countries)",
"language": "language_code_or_null",
"topic": "general|news|academic_or_null",
"time_range": "d|w|m|y_or_null",
"selection_strategy": "single|multiple|specific_page",
"selection_criteria": "what URLs to prioritize",
"expected_url_patterns": ["pattern1", "pattern2"],
"estimated_result_count": number
}}"""
# Use AI to optimize the query
from modules.datamodels.datamodelAi import AiCallRequest, AiCallOptions
aiRequest = AiCallRequest(
prompt=queryOptimizerPrompt,
options=AiCallOptions()
)
# Get AI response (this would need to be called through the AI interface)
# For now, return the original query with basic optimization
logger.info(f"AI query optimization requested for: '{query}'")
# Basic optimization fallback
optimizedQuery = query
optimizedParams = {
"time_range": timeRange,
"country": country,
"language": language,
"topic": "general"
}
return optimizedQuery, optimizedParams
except Exception as e:
logger.warning(f"Query optimization failed: {str(e)}, using original query")
return query, {"time_range": timeRange, "country": country, "language": language}
async def _aiBasedUrlSelection(self, searchResults: List[WebSearchResult], originalQuery: str, maxResults: int) -> List[WebSearchResult]:
"""
Use AI to select the most relevant URLs from search results (from old SubWebResearch).
Args:
searchResults: Raw search results from Tavily
originalQuery: Original user query for context
maxResults: Maximum number of results to return
Returns:
AI-selected and filtered list of search results
"""
try:
if not searchResults:
return []
# Step 1: Basic de-duplication
seenUrls = set()
uniqueResults = []
for result in searchResults:
normalizedUrl = self._normalizeUrl(result.url)
if normalizedUrl not in seenUrls:
seenUrls.add(normalizedUrl)
uniqueResults.append(result)
logger.info(f"After basic deduplication: {len(uniqueResults)} unique URLs from {len(searchResults)} original")
if len(uniqueResults) <= maxResults:
return uniqueResults
# Step 2: AI-based URL selection (from old code)
logger.info(f"AI selecting most relevant {maxResults} URLs from {len(uniqueResults)} unique results")
# Create AI prompt for URL selection (from old code)
urlList = "\n".join([f"{i+1}. {result.url}" for i, result in enumerate(uniqueResults)])
aiPrompt = f"""Select the most relevant URLs from these search results:
{urlList}
Return only the URLs that are most relevant for the user's query: "{originalQuery}"
One URL per line.
"""
# For now, use intelligent filtering as fallback
# In a full implementation, this would call the AI interface
logger.info("Using intelligent filtering as AI selection fallback")
# Use the existing intelligent filtering
filteredResults = self._intelligentUrlFiltering(uniqueResults, originalQuery, maxResults)
logger.info(f"AI-based selection completed: {len(filteredResults)} results selected")
return filteredResults
except Exception as e:
logger.warning(f"AI-based URL selection failed: {str(e)}, using intelligent filtering")
return self._intelligentUrlFiltering(searchResults, originalQuery, maxResults)
def getModels(self) -> List[AiModel]:
"""Get all available Tavily models."""
return [
@@ -104,7 +349,7 @@ class ConnectorWeb(BaseConnectorAi):
speedRating=9, # Very fast for URL discovery
qualityRating=9, # Excellent URL discovery quality
# capabilities removed (not used in business logic)
functionCall=self.search,
functionCall=self.callWebOperation,
priority=PriorityEnum.BALANCED,
processingMode=ProcessingModeEnum.BASIC,
operationTypes=createOperationTypeRatings(
@@ -130,12 +375,12 @@ class ConnectorWeb(BaseConnectorAi):
speedRating=7, # Good for content extraction
qualityRating=9, # Excellent content extraction quality
# capabilities removed (not used in business logic)
functionCall=self.crawl,
functionCall=self.callWebOperation,
priority=PriorityEnum.BALANCED,
processingMode=ProcessingModeEnum.BASIC,
operationTypes=createOperationTypeRatings(
(OperationTypeEnum.WEB_RESEARCH, 3),
(OperationTypeEnum.WEB_CRAWL, 10),
(OperationTypeEnum.WEB_RESEARCH, 3),
(OperationTypeEnum.WEB_NEWS, 3),
(OperationTypeEnum.WEB_QUESTIONS, 2)
),
@@ -155,7 +400,7 @@ class ConnectorWeb(BaseConnectorAi):
speedRating=7, # Good for combined search+extract
qualityRating=8, # Good quality for structured data
# capabilities removed (not used in business logic)
functionCall=self.scrape,
functionCall=self.callWebOperation,
priority=PriorityEnum.BALANCED,
processingMode=ProcessingModeEnum.BASIC,
operationTypes=createOperationTypeRatings(
@@ -190,28 +435,73 @@ class ConnectorWeb(BaseConnectorAi):
# Standardized method using AiModelCall/AiModelResponse pattern
async def callWebOperation(self, modelCall) -> "AiModelResponse":
"""
Universal web operation handler that dispatches to the correct method
based on the operationType from AiCallOptions.
"""
try:
options = modelCall.options
operationType = options.get("operationType")
if operationType == "WEB_SEARCH":
return await self.search(modelCall)
elif operationType == "WEB_CRAWL":
return await self.crawl(modelCall)
elif operationType in ["WEB_RESEARCH", "WEB_QUESTIONS", "WEB_NEWS"]:
return await self.research(modelCall)
else:
# Fallback to search for unknown operation types
return await self.search(modelCall)
except Exception as e:
return AiModelResponse(
content="",
success=False,
error=str(e)
)
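# Usage sketch (wiring assumed): unlike the Perplexity dispatcher, this
# connector funnels the three research-style operations into one method:
#   "WEB_SEARCH"                                  -> search()
#   "WEB_CRAWL"                                   -> crawl()
#   "WEB_RESEARCH" | "WEB_QUESTIONS" | "WEB_NEWS" -> research()
#   anything else                                 -> search()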
async def search(self, modelCall) -> "AiModelResponse":
"""Search using standardized AiModelCall/AiModelResponse pattern"""
try:
# Extract parameters from modelCall
query = modelCall.messages[0]["content"] if modelCall.messages else ""
promptContent = modelCall.messages[0]["content"] if modelCall.messages else ""
options = modelCall.options
raw_results = await self._search(
query=query,
max_results=options.get("max_results", 5),
search_depth=options.get("search_depth"),
time_range=options.get("time_range"),
topic=options.get("topic"),
include_domains=options.get("include_domains"),
exclude_domains=options.get("exclude_domains"),
language=options.get("language"),
include_answer=options.get("include_answer"),
include_raw_content=options.get("include_raw_content"),
# Parse unified prompt JSON format
import json
try:
promptData = json.loads(promptContent)
except (json.JSONDecodeError, TypeError):
# Non-JSON content: fall back to the plain-text prompt via the .get() defaults below
promptData = {}
# Extract parameters from unified prompt JSON
query = promptData.get("searchPrompt", prompt_content)
maxResults = promptData.get("maxResults", 5)
timeRange = promptData.get("timeRange")
country = promptData.get("country")
language = promptData.get("language")
# Use basic search depth for web search operations
searchDepth = "basic"
# Step 1: AI Query Optimization (from old SubWebResearch)
optimizedQuery, optimizedParams = await self._optimizeSearchQuery(query, timeRange, country, language)
# Step 2: Get more results than requested to allow for intelligent filtering
searchResults = await self._search(
query=optimizedQuery,
max_results=min(maxResults * 3, 30), # Get more results for better AI selection
search_depth=searchDepth,
time_range=optimizedParams.get("time_range", timeRange),
country=optimizedParams.get("country", country),
language=optimizedParams.get("language", language),
include_answer=options.get("include_answer", True),
include_raw_content=options.get("include_raw_content", True),
)
# Step 3: AI-based URL selection and intelligent filtering
filteredResults = await self._aiBasedUrlSelection(searchResults, query, maxResults)
# Convert to JSON string
results_json = {
resultsJson = {
"query": query,
"results": [
{
@@ -219,20 +509,22 @@ class ConnectorWeb(BaseConnectorAi):
"url": result.url,
"content": getattr(result, 'raw_content', None)
}
for result in raw_results
for result in filteredResults
],
"total_count": len(raw_results)
"total_count": len(filteredResults),
"original_count": len(searchResults),
"filtered_count": len(searchResults) - len(filteredResults)
}
import json
content = json.dumps(results_json, indent=2)
content = json.dumps(resultsJson, indent=2)
return AiModelResponse(
content=content,
success=True,
metadata={
"total_count": len(raw_results),
"search_depth": options.get("search_depth", "basic")
"total_count": len(filteredResults),
"search_depth": searchDepth
}
)
@@ -247,49 +539,214 @@ class ConnectorWeb(BaseConnectorAi):
"""Crawl using standardized AiModelCall/AiModelResponse pattern"""
try:
# Extract parameters from modelCall
promptContent = modelCall.messages[0]["content"] if modelCall.messages else ""
options = modelCall.options
urls = options.get("urls", [])
# If no URLs provided, try to extract URLs from the prompt
if not urls and modelCall.messages:
prompt = modelCall.messages[0]["content"] if modelCall.messages else ""
urls = self._extractUrlsFromPrompt(prompt)
# Parse unified prompt JSON format
import json
try:
promptData = json.loads(promptContent)
except (json.JSONDecodeError, TypeError):
# Non-JSON prompt carries no URL list; the empty-URLs check below reports it
promptData = {}
# Extract parameters from unified prompt JSON
urls = promptData.get("urls", [])
extractDepth = promptData.get("extractDepth", "advanced")
formatType = promptData.get("format", "markdown")
if not urls:
return AiModelResponse(
content="No URLs provided for crawling",
success=False,
error="No URLs found in options or prompt"
error="No URLs found in prompt data"
)
raw_results = await self._crawl(
rawResults = await self._crawl(
urls,
extract_depth=options.get("extract_depth"),
format=options.get("format"),
extract_depth=extractDepth,
format=formatType,
)
# Convert to JSON string
results_json = {
resultsJson = {
"urls": urls,
"results": [
{
"url": result.url,
"content": result.content
"title": getattr(result, 'title', ''),
"content": result.content,
"extractedAt": getattr(result, 'extracted_at', '')
}
for result in raw_results
for result in rawResults
],
"total_count": len(raw_results)
"total_count": len(rawResults)
}
import json
content = json.dumps(results_json, indent=2)
content = json.dumps(resultsJson, indent=2)
return AiModelResponse(
content=content,
success=True,
metadata={
"total_count": len(raw_results),
"extract_depth": options.get("extract_depth", "basic")
"total_count": len(rawResults),
"urls_processed": len(urls)
}
)
except Exception as e:
return AiModelResponse(
content="",
success=False,
error=str(e)
)
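# Sketch of the unified crawl payload parsed above; the keys mirror the
# promptData.get(...) reads, the URLs and settings are assumed examples.
import json
crawlContent = json.dumps({
"urls": ["https://example.com/docs", "https://example.com/changelog"],
"extractDepth": "advanced",
"format": "markdown"
})
messages = [{"role": "user", "content": crawlContent}]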
async def research(self, modelCall) -> "AiModelResponse":
"""
Handle WEB_RESEARCH, WEB_QUESTIONS, and WEB_NEWS operations via a combined search + crawl pipeline.
A single method serves all three operation types, each with its own default settings.
"""
try:
# Extract parameters from modelCall
promptContent = modelCall.messages[0]["content"] if modelCall.messages else ""
options = modelCall.options
operationType = options.get("operationType")
# Parse unified prompt JSON format
import json
try:
promptData = json.loads(promptContent)
except (json.JSONDecodeError, TypeError):
# Non-JSON content: fall back to the plain-text prompt via the .get() defaults below
promptData = {}
# Extract parameters based on operation type
if operationType == "WEB_RESEARCH":
query = promptData.get("researchPrompt", promptContent)
maxResults = promptData.get("maxResults", 8)
searchDepth = "basic"
timeRange = promptData.get("timeRange")
country = promptData.get("country")
language = promptData.get("language")
topic = "general"
elif operationType == "WEB_QUESTIONS":
query = promptData.get("question", promptContent)
maxResults = promptData.get("maxResults", 6)
searchDepth = "basic"
timeRange = promptData.get("timeRange")
country = promptData.get("country")
language = promptData.get("language")
topic = "general"
elif operationType == "WEB_NEWS":
query = promptData.get("newsPrompt", promptContent)
maxResults = promptData.get("maxResults", 10)
searchDepth = "basic"
timeRange = promptData.get("timeRange", "w") # Default to week for news
country = promptData.get("country")
language = promptData.get("language")
topic = "news"
else:
# Fallback to research settings
query = promptData.get("researchPrompt", promptContent)
maxResults = promptData.get("maxResults", 5)
searchDepth = "basic"
timeRange = promptData.get("timeRange")
country = promptData.get("country")
language = promptData.get("language")
topic = "general"
logger.info(f"Tavily {operationType} operation: query='{query}', maxResults={maxResults}, topic={topic}")
# Step 1: Search for relevant URLs
searchResults = await self._search(
query=query,
max_results=maxResults * 2, # Get more for better selection
search_depth=searchDepth,
time_range=timeRange,
country=country,
language=language,
topic=topic,
include_answer=True,
include_raw_content=True
)
if not searchResults:
return AiModelResponse(
content="No search results found",
success=False,
error="No relevant URLs found for the query"
)
# Step 2: AI-based URL selection
selectedResults = await self._aiBasedUrlSelection(searchResults, query, maxResults)
if not selectedResults:
return AiModelResponse(
content="No relevant URLs selected",
success=False,
error="AI could not select relevant URLs"
)
# Step 3: Crawl selected URLs for content
urlsToCrawl = [result.url for result in selectedResults]
crawlResults = await self._crawl(
urls=urlsToCrawl,
extract_depth="advanced",
format="markdown"
)
# Step 4: Combine search and crawl results
combinedResults = []
for searchResult in selectedResults:
# Find corresponding crawl result
crawlResult = next((cr for cr in crawlResults if cr.url == searchResult.url), None)
combinedResult = {
"title": searchResult.title,
"url": searchResult.url,
"summary": getattr(searchResult, 'raw_content', ''),
"content": crawlResult.content if crawlResult else '',
"extractedAt": getattr(crawlResult, 'extracted_at', '') if crawlResult else ''
}
combinedResults.append(combinedResult)
# Step 5: Format response based on operation type
if operationType == "WEB_RESEARCH":
responseData = {
"query": query,
"research_results": combinedResults,
"total_count": len(combinedResults),
"operation_type": "research"
}
elif operationType == "WEB_QUESTIONS":
responseData = {
"question": query,
"answer_sources": combinedResults,
"total_count": len(combinedResults),
"operation_type": "questions"
}
elif operationType == "WEB_NEWS":
responseData = {
"news_query": query,
"articles": combinedResults,
"total_count": len(combinedResults),
"operation_type": "news"
}
else:
responseData = {
"query": query,
"results": combinedResults,
"total_count": len(combinedResults),
"operation_type": operationType
}
import json
content = json.dumps(responseData, indent=2)
return AiModelResponse(
content=content,
success=True,
metadata={
"total_count": len(combinedResults),
"urls_searched": len(searchResults),
"urls_crawled": len(crawlResults),
"operation_type": operationType
}
)
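# End-to-end sketch (values assumed): a WEB_NEWS call flows through
# search -> AI URL selection -> crawl -> merge, and returns JSON shaped as
#   {"news_query": "...", "articles": [{"title": ..., "url": ..., "summary": ...,
#     "content": ..., "extractedAt": ...}], "total_count": N, "operation_type": "news"}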
@@ -576,3 +1033,262 @@ class ConnectorWeb(BaseConnectorAi):
await asyncio.sleep(retryDelay)
else:
raise Exception(f"Crawl failed after {maxRetries + 1} attempts: {str(e)}")
async def comprehensiveWebResearch(self, request: WebResearchRequest) -> WebResearchResult:
"""
Perform comprehensive web research using Tavily's search and extract capabilities.
This method orchestrates the full web research workflow.
"""
try:
logger.info(f"COMPREHENSIVE WEB RESEARCH STARTED")
logger.info(f"User Query: {request.user_prompt}")
logger.info(f"Max Results: {request.max_results}, Max Pages: {request.max_pages}")
# Global URL index to track all processed URLs across the entire research session
global_processed_urls = set()
# Step 1: Find relevant websites - either provided URLs or AI-determined main URLs
logger.info(f"=== STEP 1: INITIAL MAIN URLS LIST ===")
if request.urls:
# Use provided URLs as initial main URLs
websites = request.urls
logger.info(f"Using provided URLs ({len(websites)}):")
for i, url in enumerate(websites, 1):
logger.info(f" {i}. {url}")
else:
# Use AI to determine main URLs based on user's intention
logger.info(f"AI analyzing user intent: '{request.user_prompt}'")
# Use basic search parameters
search_query = request.user_prompt
search_depth = request.search_depth or "basic"
time_range = request.time_range
topic = request.topic
country = request.country
language = request.language
max_results = request.max_results
logger.info(f"Using search parameters: query='{search_query}', depth={search_depth}, time_range={time_range}, topic={topic}")
# Perform web search
search_results = await self._search(
query=search_query,
max_results=max_results,
search_depth=search_depth,
time_range=time_range,
topic=topic,
country=country,
language=language,
include_answer=True,
include_raw_content=True
)
# Extract URLs from search results
websites = [result.url for result in search_results]
logger.info(f"Found {len(websites)} URLs from search")
# AI-based URL selection and deduplication
if len(websites) > request.max_pages:
logger.info(f"AI selecting most relevant {request.max_pages} URLs from {len(websites)} found")
# For now, just take the first max_pages URLs
selected_indices = list(range(min(request.max_pages, len(websites))))
selected_websites = [websites[i] for i in selected_indices]
# Remove duplicates while preserving order
seen = set()
unique_websites = []
for url in selected_websites:
if url not in seen:
seen.add(url)
unique_websites.append(url)
websites = unique_websites
logger.info(f"After AI selection deduplication: {len(websites)} unique URLs")
logger.info(f"AI selected {len(websites)} main URLs (after deduplication):")
for i, url in enumerate(websites, 1):
logger.info(f" {i}. {url}")
# Step 2: Smart website selection using AI interface
logger.info(f"=== STEP 2: FILTERED URL LIST BY USER PROMPT'S INTENTION ===")
logger.info(f"AI analyzing {len(websites)} URLs for relevance to: '{request.user_prompt}'")
# For now, just use all websites
selected_websites = websites
logger.debug(f"AI selected {len(selected_websites)} most relevant URLs:")
for i, url in enumerate(selected_websites, 1):
logger.debug(f" {i}. {url}")
# Step 3+4+5: Recursive crawling with configurable depth
# Get configuration parameters
max_depth = int(APP_CONFIG.get("Web_Research_MAX_DEPTH", "2"))
max_links_per_domain = int(APP_CONFIG.get("Web_Research_MAX_LINKS_PER_DOMAIN", "4"))
crawl_timeout_minutes = int(APP_CONFIG.get("Web_Research_CRAWL_TIMEOUT_MINUTES", "10"))
# Use the configured max_depth or the request's search_depth, whichever is smaller
effective_depth = min(max_depth, request.search_depth if isinstance(request.search_depth, int) else 2)
logger.info(f"=== STEP 3+4+5: RECURSIVE CRAWLING ===")
logger.info(f"Starting recursive crawl with depth {effective_depth}")
logger.info(f"Max links per domain: {max_links_per_domain}")
logger.info(f"Crawl timeout: {crawl_timeout_minutes} minutes")
# Perform recursive crawling
all_content = await self._crawlRecursively(
urls=selected_websites,
max_depth=effective_depth,
extract_depth=request.extract_depth,
max_per_domain=max_links_per_domain,
global_processed_urls=global_processed_urls
)
logger.info(f"Recursive crawl completed: {len(all_content)} total pages crawled")
# Step 6: AI analysis of all collected content
logger.info(f"=== STEP 6: AI ANALYSIS ===")
logger.info(f"Analyzing {len(all_content)} websites with AI")
# Create a basic analysis result
analysis_result = f"Web research completed for: {request.user_prompt}\n\n"
analysis_result += f"Analyzed {len(all_content)} websites:\n"
for url, content in all_content.items():
analysis_result += f"- {url}: {len(content)} characters\n"
# Create result documents
import time
result_documents = []
# Main research result
main_document = {
"documentName": f"web_research_{int(time.time())}.json",
"documentData": {
"user_prompt": request.user_prompt,
"websites_analyzed": len(all_content),
"additional_links_found": 0, # Would be calculated from crawl results
"analysis_result": analysis_result,
"sources": [{"title": f"Website {i+1}", "url": url} for i, url in enumerate(all_content.keys())],
"additional_links": [],
"debug_info": {
"total_urls_processed": len(global_processed_urls),
"crawl_depth": effective_depth,
"extract_depth": request.extract_depth
}
},
"mimeType": "application/json"
}
result_documents.append(main_document)
# Individual website content documents
for i, (url, content) in enumerate(all_content.items()):
content_document = {
"documentName": f"website_content_{i+1}.md",
"documentData": content,
"mimeType": "text/markdown"
}
result_documents.append(content_document)
logger.info(f"WEB RESEARCH COMPLETED SUCCESSFULLY")
logger.info(f"Generated {len(result_documents)} result documents")
return WebResearchResult(
success=True,
documents=result_documents
)
except Exception as e:
logger.error(f"Error in comprehensive web research: {str(e)}")
return WebResearchResult(
success=False,
error=str(e),
documents=[]
)
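# Illustrative request (field names taken from the attribute accesses above;
# the WebResearchRequest constructor itself is an assumption):
request = WebResearchRequest(
user_prompt="Compare EU battery recycling regulations",
urls=[],
max_results=5,
max_pages=3,
search_depth="basic",
extract_depth="advanced",
time_range="m",
topic="general",
country=None,
language=None
)
result = await connector.comprehensiveWebResearch(request)  # inside an async context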
async def _crawlRecursively(self, urls: List[str], max_depth: int, extract_depth: str = "advanced", max_per_domain: int = 10, global_processed_urls: Optional[set] = None) -> Dict[str, str]:
"""
Recursively crawl URLs up to the specified depth.
This is a simplified version of the recursive crawling logic.
"""
logger.info(f"Starting recursive crawl: {len(urls)} starting URLs, max_depth={max_depth}")
# URL index to track all processed URLs (local + global)
processed_urls = set()
if global_processed_urls is not None:
processed_urls = global_processed_urls
logger.info(f"Using global URL index with {len(processed_urls)} already processed URLs")
else:
logger.info("Using local URL index for this crawl session")
all_content = {}
current_level_urls = urls.copy()
try:
for depth in range(1, max_depth + 1):
logger.info(f"=== DEPTH LEVEL {depth}/{max_depth} ===")
logger.info(f"Processing {len(current_level_urls)} URLs at depth {depth}")
# URLs found at this level (for next iteration)
next_level_urls = []
for url in current_level_urls:
# Normalize URL for duplicate checking
normalized_url = self._normalizeUrlSimple(url)
if normalized_url in processed_urls:
logger.debug(f"URL {url} (normalized: {normalized_url}) already processed, skipping")
continue
try:
logger.info(f"Processing URL at depth {depth}: {url}")
# Extract content from URL
crawl_results = await self._crawl([url], extract_depth=extract_depth, format="markdown")
if crawl_results and crawl_results[0].content:
content = crawl_results[0].content
all_content[url] = content
processed_urls.add(normalized_url)
logger.info(f"✓ Successfully processed {url}: {len(content)} chars")
# For simplicity, we'll skip finding sub-links in this implementation
# In a full implementation, you would extract links and add them to next_level_urls
else:
logger.warning(f"✗ No content extracted from {url}")
processed_urls.add(normalized_url)
except Exception as e:
logger.warning(f"✗ Failed to process URL {url} at depth {depth}: {e}")
processed_urls.add(normalized_url)
# Prepare for next iteration
current_level_urls = next_level_urls
logger.info(f"Depth {depth} completed. Found {len(next_level_urls)} URLs for next level")
# Stop if no more URLs to process
if not current_level_urls:
logger.info(f"No more URLs found at depth {depth}, stopping recursion")
break
logger.info(f"Recursive crawl completed: {len(all_content)} total pages crawled")
return all_content
except Exception as e:
logger.error(f"Crawling failed with error: {e}, returning partial results: {len(all_content)} pages crawled so far")
return all_content
def _normalizeUrlSimple(self, url: str) -> str:
"""Lightweight URL normalization for crawl deduplication; kept separate from the parameter-stripping _normalizeUrl above to avoid redefining it."""
if not url:
return url
# Remove trailing slashes and fragments
url = url.rstrip('/')
if '#' in url:
url = url.split('#')[0]
# Handle common URL variations
url = url.replace('http://', 'https://') # Normalize protocol
return url
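# Illustrative equivalences under this lightweight normalization:
#   "http://example.com/a/"       -> "https://example.com/a"
#   "https://example.com/a#intro" -> "https://example.com/a"
# so protocol, trailing-slash, and fragment variants share one index key.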

View file

@@ -649,427 +649,6 @@ class AiObjects:
errorCount=1
)
# Web functionality methods - Now use standardized AiModelCall/AiModelResponse pattern
async def searchWebsites(self, query: str, maxResults: int = 5, **kwargs) -> str:
"""Search for websites using Tavily with standardized pattern."""
from modules.datamodels.datamodelAi import AiModelCall
modelCall = AiModelCall(
messages=[{"role": "user", "content": query}],
options={"max_results": maxResults, **kwargs}
)
# Get Tavily connector from registry
tavilyConnector = modelRegistry.getConnectorForModel("tavily_search")
if not tavilyConnector:
raise ValueError("Tavily connector not available")
result = await tavilyConnector.search(modelCall)
return result.content if result.success else ""
async def crawlWebsites(self, urls: List[str], extractDepth: str = "advanced", format: str = "markdown") -> str:
"""Crawl websites using Tavily with standardized pattern."""
from modules.datamodels.datamodelAi import AiModelCall
modelCall = AiModelCall(
messages=[{"role": "user", "content": "crawl websites"}],
options={"urls": urls, "extract_depth": extractDepth, "format": format}
)
# Get Tavily connector from registry
tavilyConnector = modelRegistry.getConnectorForModel("tavily_crawl")
if not tavilyConnector:
raise ValueError("Tavily connector not available")
result = await tavilyConnector.crawl(modelCall)
return result.content if result.success else ""
async def extractContent(self, urls: List[str], extractDepth: str = "advanced", format: str = "markdown") -> Dict[str, str]:
"""Extract content from URLs and return as dictionary."""
import json
crawlResults = await self.crawlWebsites(urls, extractDepth, format)
# Parse JSON response and extract content
try:
data = json.loads(crawlResults)
return {result["url"]: result["content"] for result in data.get("results", [])}
except (json.JSONDecodeError, KeyError):
return {}
# Core Web Tools - Clean interface for web operations
async def readPage(self, url: str, extractDepth: str = "advanced") -> Optional[str]:
"""Read a single web page and return its content (HTML/Markdown)."""
logger.debug(f"Reading page: {url}")
try:
# URL encode the URL to handle spaces and special characters
from urllib.parse import quote, urlparse, urlunparse
parsed = urlparse(url)
encodedUrl = urlunparse((
parsed.scheme,
parsed.netloc,
parsed.path,
parsed.params,
parsed.query,
parsed.fragment
))
# Manually encode query parameters to handle spaces
if parsed.query:
encodedQuery = quote(parsed.query, safe='=&')
encodedUrl = urlunparse((
parsed.scheme,
parsed.netloc,
parsed.path,
parsed.params,
encodedQuery,
parsed.fragment
))
logger.debug(f"URL encoded: {url} -> {encodedUrl}")
content = await self.extractContent([encodedUrl], extractDepth, "markdown")
result = content.get(encodedUrl)
if result:
logger.debug(f"Successfully read page {encodedUrl}: {len(result)} chars")
else:
logger.warning(f"No content returned for page {encodedUrl}")
return result
except Exception as e:
logger.warning(f"Failed to read page {url}: {e}")
return None
async def getUrlsFromPage(self, url: str, extractDepth: str = "advanced") -> List[str]:
"""Get all URLs from a web page, with redundancies removed."""
try:
content = await self.readPage(url, extractDepth)
if not content:
return []
links = self._extractLinksFromContent(content, url)
# Remove duplicates while preserving order
seen = set()
uniqueLinks = []
for link in links:
if link not in seen:
seen.add(link)
uniqueLinks.append(link)
logger.debug(f"Extracted {len(uniqueLinks)} unique URLs from {url}")
return uniqueLinks
except Exception as e:
logger.warning(f"Failed to get URLs from page {url}: {e}")
return []
def filterUrlsOnlyPages(self, urls: List[str], maxPerDomain: int = 10) -> List[str]:
"""Filter URLs to get only links for pages to follow (no images, etc.)."""
from urllib.parse import urlparse
def _isHtmlCandidate(url: str) -> bool:
lower = url.lower()
blocked = ('.jpg', '.jpeg', '.png', '.gif', '.svg', '.webp', '.ico', '.bmp',
'.mp4', '.mp3', '.avi', '.mov', '.mkv',
'.pdf', '.zip', '.rar', '.7z', '.tar', '.gz',
'.css', '.js', '.woff', '.woff2', '.ttf', '.eot')
return not lower.endswith(blocked)
# Group by domain
domainLinks = {}
for link in urls:
domain = urlparse(link).netloc
if domain not in domainLinks:
domainLinks[domain] = []
domainLinks[domain].append(link)
# Filter and cap per domain
filteredLinks = []
for domain, domainLinkList in domainLinks.items():
seen = set()
domainFiltered = []
for link in domainLinkList:
if link in seen:
continue
if not _isHtmlCandidate(link):
continue
seen.add(link)
domainFiltered.append(link)
if len(domainFiltered) >= maxPerDomain:
break
filteredLinks.extend(domainFiltered)
logger.debug(f"Domain {domain}: {len(domainLinkList)} -> {len(domainFiltered)} links")
return filteredLinks
def _extractLinksFromContent(self, content: str, baseUrl: str) -> List[str]:
"""Extract links from HTML/Markdown content."""
try:
import re
from urllib.parse import urljoin, urlparse, quote, urlunparse
def _cleanUrl(url: str) -> str:
"""Clean and encode URL to remove spaces and invalid characters."""
# Remove quotes and extra spaces
url = url.strip().strip('"\'')
# If it's a relative URL, make it absolute first
if not url.startswith(('http://', 'https://')):
url = urljoin(baseUrl, url)
# Parse and re-encode the URL properly
parsed = urlparse(url)
if parsed.query:
# Encode query parameters properly
encodedQuery = quote(parsed.query, safe='=&')
url = urlunparse((
parsed.scheme,
parsed.netloc,
parsed.path,
parsed.params,
encodedQuery,
parsed.fragment
))
return url
links = []
# Extract HTML links: <a href="url"> format
htmlLinkPattern = r'<a[^>]+href=["\']([^"\']+)["\'][^>]*>'
htmlLinks = re.findall(htmlLinkPattern, content, re.IGNORECASE)
for url in htmlLinks:
if url and not url.startswith('#') and not url.startswith('javascript:'):
try:
cleanedUrl = _cleanUrl(url)
links.append(cleanedUrl)
logger.debug(f"Extracted HTML link: {url} -> {cleanedUrl}")
except Exception as e:
logger.debug(f"Failed to clean HTML link {url}: {e}")
# Extract markdown links: [text](url) format
markdownLinkPattern = r'\[([^\]]+)\]\(([^)]+)\)'
markdownLinks = re.findall(markdownLinkPattern, content)
for text, url in markdownLinks:
if url and not url.startswith('#'):
try:
cleanedUrl = _cleanUrl(url)
# Only keep URLs from the same domain
if urlparse(cleanedUrl).netloc == urlparse(baseUrl).netloc:
links.append(cleanedUrl)
logger.debug(f"Extracted markdown link: {url} -> {cleanedUrl}")
except Exception as e:
logger.debug(f"Failed to clean markdown link {url}: {e}")
# Extract plain URLs in the text
urlPattern = r'https?://[^\s\)]+'
plainUrls = re.findall(urlPattern, content)
for url in plainUrls:
try:
cleanUrl = url.rstrip('.,;!?')
cleanedUrl = _cleanUrl(cleanUrl)
if urlparse(cleanedUrl).netloc == urlparse(baseUrl).netloc:
if cleanedUrl not in links: # Avoid duplicates
links.append(cleanedUrl)
logger.debug(f"Extracted plain URL: {url} -> {cleanedUrl}")
except Exception as e:
logger.debug(f"Failed to clean plain URL {url}: {e}")
logger.debug(f"Total links extracted and cleaned: {len(links)}")
return links
except Exception as e:
logger.warning(f"Failed to extract links from content: {e}")
return []
def _normalizeUrl(self, url: str) -> str:
"""Normalize URL to handle variations that should be considered duplicates."""
if not url:
return url
# Remove trailing slashes and fragments
url = url.rstrip('/')
if '#' in url:
url = url.split('#')[0]
# Handle common URL variations
url = url.replace('http://', 'https://') # Normalize protocol
return url
async def crawlRecursively(self, urls: List[str], max_depth: int, extract_depth: str = "advanced", max_per_domain: int = 10, global_processed_urls: Optional[set] = None) -> Dict[str, str]:
"""
Recursively crawl URLs up to specified depth.
Args:
urls: List of starting URLs to crawl
max_depth: Maximum depth to crawl (1=main pages only, 2=main+sub-pages, etc.)
extract_depth: Tavily extract depth setting
max_per_domain: Maximum URLs per domain per level
global_processed_urls: Optional global set to track processed URLs across sessions
Returns:
Dictionary mapping URL -> content for all crawled pages
"""
logger.info(f"Starting recursive crawl: {len(urls)} starting URLs, max_depth={max_depth}")
# URL index to track all processed URLs (local + global)
processed_urls = set()
if global_processed_urls is not None:
# Use global index if provided, otherwise create local one
processed_urls = global_processed_urls
logger.info(f"Using global URL index with {len(processed_urls)} already processed URLs")
else:
logger.info("Using local URL index for this crawl session")
all_content = {}
# Current level URLs to process
current_level_urls = urls.copy()
try:
for depth in range(1, max_depth + 1):
logger.info(f"=== DEPTH LEVEL {depth}/{max_depth} ===")
logger.info(f"Processing {len(current_level_urls)} URLs at depth {depth}")
# URLs found at this level (for next iteration)
next_level_urls = []
for url in current_level_urls:
# Normalize URL for duplicate checking
normalized_url = self._normalizeUrl(url)
if normalized_url in processed_urls:
logger.debug(f"URL {url} (normalized: {normalized_url}) already processed, skipping")
continue
try:
logger.info(f"Processing URL at depth {depth}: {url}")
logger.debug(f"Total processed URLs so far: {len(processed_urls)}")
# Read page content
content = await self.readPage(url, extract_depth)
if content:
all_content[url] = content
processed_urls.add(normalized_url)
logger.info(f"✓ Successfully processed {url}: {len(content)} chars")
# Get URLs from this page for next level
page_urls = await self.getUrlsFromPage(url, extract_depth)
logger.info(f"Found {len(page_urls)} URLs on {url}")
# Filter URLs and add to next level
filtered_urls = self.filterUrlsOnlyPages(page_urls, max_per_domain)
logger.info(f"Filtered to {len(filtered_urls)} valid URLs")
# Add new URLs to next level (avoiding already processed ones)
new_urls_count = 0
for new_url in filtered_urls:
normalized_new_url = self._normalizeUrl(new_url)
if normalized_new_url not in processed_urls:
next_level_urls.append(new_url)
new_urls_count += 1
else:
logger.debug(f"URL {new_url} (normalized: {normalized_new_url}) already processed, skipping")
logger.info(f"Added {new_urls_count} new URLs to next level from {url}")
else:
logger.warning(f"✗ No content extracted from {url}")
processed_urls.add(normalized_url) # Mark as processed to avoid retry
except Exception as e:
logger.warning(f"✗ Failed to process URL {url} at depth {depth}: {e}")
processed_urls.add(normalized_url) # Mark as processed to avoid retry
# Prepare for next iteration
current_level_urls = next_level_urls
logger.info(f"Depth {depth} completed. Found {len(next_level_urls)} URLs for next level")
# Stop if no more URLs to process
if not current_level_urls:
logger.info(f"No more URLs found at depth {depth}, stopping recursion")
break
logger.info(f"Recursive crawl completed: {len(all_content)} total pages crawled")
logger.info(f"Total URLs processed (including skipped): {len(processed_urls)}")
logger.info(f"Unique URLs found: {len(all_content)}")
return all_content
except asyncio.TimeoutError:
logger.warning(f"Crawling timed out, returning partial results: {len(all_content)} pages crawled so far")
return all_content
except Exception as e:
logger.error(f"Crawling failed with error: {e}, returning partial results: {len(all_content)} pages crawled so far")
return all_content
async def webQuery(self, query: str, context: str = "", options: AiCallOptions = None) -> AiCallResponse:
"""Use Perplexity AI to provide the best answers for web-related queries."""
if options is None:
options = AiCallOptions(operationType=OperationTypeEnum.WEB_RESEARCH)
# Calculate input bytes
inputBytes = len((query + context).encode("utf-8"))
# Create a comprehensive prompt for web queries
webPrompt = f"""You are an expert web researcher and information analyst. Please provide a comprehensive and accurate answer to the following web-related query.
Query: {query}
{f"Additional Context: {context}" if context else ""}
Please provide:
1. A clear, well-structured answer to the query
2. Key points and important details
3. Relevant insights and analysis
4. Any important considerations or caveats
5. Suggestions for further research if applicable
Format your response in a clear, professional manner that would be helpful for someone researching this topic."""
try:
# Start timing
startTime = time.time()
# Use Perplexity for web research with search capabilities
perplexity_connector = modelRegistry.getConnectorForModel("perplexity_callAiWithWebSearch")
if not perplexity_connector:
raise ValueError("Perplexity connector not available")
response = await perplexity_connector.callAiWithWebSearch(webPrompt)
# Calculate timing and output bytes
endTime = time.time()
processingTime = endTime - startTime
outputBytes = len(response.encode("utf-8"))
# Calculate price using Perplexity model pricing
perplexity_model = modelRegistry.getModel("perplexity_callAiWithWebSearch")
priceUsd = perplexity_model.calculatePriceUsd(inputBytes, outputBytes)
logger.info(f"✅ Web query successful with Perplexity")
return AiCallResponse(
content=response,
modelName="perplexity_callAiWithWebSearch",
priceUsd=priceUsd,
processingTime=processingTime,
bytesSent=inputBytes,
bytesReceived=outputBytes,
errorCount=0
)
except Exception as e:
logger.error(f"Perplexity web query failed: {str(e)}")
return AiCallResponse(
content=f"Web query failed: {str(e)}",
modelName="perplexity_callAiWithWebSearch",
priceUsd=0.0,
processingTime=0.0,
bytesSent=inputBytes,
bytesReceived=0,
errorCount=1
)
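For context on the accounting above: inputBytes/outputBytes are UTF-8 byte counts, and calculatePriceUsd converts them into a dollar figure. A sketch of one plausible byte-based scheme; the per-megabyte rates below are invented placeholders, the real ones live on the registered model.

# Hypothetical per-megabyte rates; the registered model carries the real pricing.
INPUT_USD_PER_MB = 0.15
OUTPUT_USD_PER_MB = 0.60

def calculate_price_usd(input_bytes: int, output_bytes: int) -> float:
    mb = 1024 * 1024
    return (input_bytes / mb) * INPUT_USD_PER_MB + (output_bytes / mb) * OUTPUT_USD_PER_MB

query = "What changed in EU AI regulation this year?"
answer = "Example answer text from the model."
print(f"{calculate_price_usd(len(query.encode('utf-8')), len(answer.encode('utf-8'))):.8f}")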
# Utility methods
async def listAvailableModels(self, connectorType: str = None) -> List[Dict[str, Any]]:
"""List available models, optionally filtered by connector type."""
@ -1085,163 +664,8 @@ Format your response in a clear, professional manner that would be helpful for s
raise ValueError(f"Model {modelName} not found")
return model.model_dump()
async def getModelsByTag(self, tag: str) -> List[str]:
"""Get model names that have a specific tag."""
models = modelRegistry.getModelsByTag(tag)
return [model.name for model in models]
async def selectRelevantWebsites(self, websites: List[str], userQuestion: str) -> Tuple[List[str], str]:
"""Select most relevant websites using AI analysis. Returns (selected_websites, ai_response)."""
if len(websites) <= 1:
return websites, "Only one website available, no selection needed"
try:
# Create website summaries for AI analysis
from urllib.parse import urlparse
websiteSummaries = []
for i, url in enumerate(websites, 1):
domain = urlparse(url).netloc
summary = f"{i}. {url} (Domain: {domain})"
websiteSummaries.append(summary)
selectionPrompt = f"""
Based on this user request: "{userQuestion}"
I found {len(websites)} candidate websites. Please select the most relevant website(s) for this request.
Available websites:
{chr(10).join(websiteSummaries)}
Please respond with the website number(s) (1, 2, 3, etc.) that are most relevant.
Format: 1,3,5 (or just 1 for single selection)
"""
# Use Perplexity to select the best websites
response = (await self.webQuery(selectionPrompt)).content  # webQuery returns AiCallResponse; keep only the text
# Parse the selection
import re
numbers = re.findall(r'\d+', response)
if numbers:
selectedWebsites = []
for num in numbers:
index = int(num) - 1
if 0 <= index < len(websites):
selectedWebsites.append(websites[index])
if selectedWebsites:
logger.info(f"AI selected {len(selectedWebsites)} websites")
return selectedWebsites, response
# Fallback to first website
logger.warning("AI selection failed, using first website")
return websites[:1], f"AI selection failed, fallback to first website. AI response: {response}"
except Exception as e:
logger.error(f"Error in website selection: {str(e)}")
return websites[:1], f"Error in website selection: {str(e)}"
async def analyzeContentWithChunking(self, allContent: Dict[str, str], userQuestion: str) -> str:
"""Analyze content using AI with chunking for large content."""
logger.info(f"Analyzing {len(allContent)} websites with AI")
# Process content in chunks to avoid token limits
chunkSize = 50000 # 50k chars per chunk
allChunks = []
for url, content in allContent.items():
filteredContent = self._filterContent(content)
if len(filteredContent) <= chunkSize:
allChunks.append((url, filteredContent))
logger.info(f"Content from {url}: {len(filteredContent)} chars (single chunk)")
else:
# Split large content into chunks
chunkCount = (len(filteredContent) + chunkSize - 1) // chunkSize
logger.info(f"Content from {url}: {len(filteredContent)} chars (split into {chunkCount} chunks)")
for i in range(0, len(filteredContent), chunkSize):
chunk = filteredContent[i:i+chunkSize]
chunkNum = i//chunkSize + 1
allChunks.append((f"{url} (part {chunkNum})", chunk))
logger.info(f"Processing {len(allChunks)} content chunks")
# Analyze each chunk
chunkAnalyses = []
for i, (url, chunk) in enumerate(allChunks, 1):
logger.info(f"Analyzing chunk {i}/{len(allChunks)}: {url}")
try:
analysisPrompt = f"""
Analyze this web content and extract relevant information for: {userQuestion}
Source: {url}
Content: {chunk}
Please extract key information relevant to the query.
"""
analysis = await self.webQuery(analysisPrompt)
chunkAnalyses.append(analysis.content)  # keep plain text; webQuery returns AiCallResponse
logger.info(f"Chunk {i}/{len(allChunks)} analyzed successfully")
except Exception as e:
logger.error(f"Chunk {i}/{len(allChunks)} error: {e}")
# Combine all chunk analyses
if chunkAnalyses:
logger.info(f"Combining {len(chunkAnalyses)} chunk analyses")
combinedAnalysis = "\n\n".join(chunkAnalyses)
# Final synthesis
try:
logger.info("Performing final synthesis of all analyses")
synthesisPrompt = f"""
Based on these partial analyses, provide a comprehensive answer to: {userQuestion}
Partial analyses:
{combinedAnalysis}
Please provide a clear, well-structured answer to the query.
"""
finalAnalysis = (await self.webQuery(synthesisPrompt)).content
logger.info("Final synthesis completed successfully")
return finalAnalysis
except Exception as e:
logger.error(f"Synthesis error: {e}")
return combinedAnalysis
else:
logger.error("No content could be analyzed")
return "No content could be analyzed"
def _filterContent(self, content: str) -> str:
"""Filter out navigation, ads, and other nonsense content."""
lines = content.split('\n')
filteredLines = []
for line in lines:
line = line.strip()
# Skip empty lines
if not line:
continue
# Skip navigation elements
if any(skip in line.lower() for skip in [
'toggle navigation', 'log in', 'sign up', 'cookies', 'privacy policy',
'terms of service', 'subscribe', 'newsletter', 'follow us', 'share this',
'advertisement', 'sponsored', 'banner', 'popup', 'modal'
]):
continue
# Skip image references without context
if line.startswith('![Image') and '](' in line:
continue
# Skip link-only separator lines (markdown links joined by '---')
if line.startswith('[') and line.endswith(')') and '---' in line:
continue
# Keep meaningful content
if len(line) > 10: # Skip very short lines
filteredLines.append(line)
return '\n'.join(filteredLines)
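What the heuristic keeps and drops, condensed into a standalone predicate (same three rules: minimum length, navigation keywords, bare image references):

NAV_WORDS = ("toggle navigation", "subscribe", "newsletter", "cookies", "advertisement")

def keep(line: str) -> bool:
    line = line.strip()
    if len(line) <= 10:
        return False  # too short to carry content
    if any(word in line.lower() for word in NAV_WORDS):
        return False  # navigation / marketing chrome
    if line.startswith("![Image") and "](" in line:
        return False  # bare image reference with no context
    return True

sample = [
    "Toggle navigation",
    "![Image 1](logo.png)",
    "Short",
    "The new connector shipped ahead of schedule this quarter.",
    "Subscribe to our newsletter",
]
print([line for line in sample if keep(line)])
# ['The new connector shipped ahead of schedule this quarter.']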

View file

@ -7,7 +7,6 @@ from modules.aicore.aicorePluginTavily import WebResearchRequest, WebResearchRes
from modules.interfaces.interfaceAiObjects import AiObjects
from modules.services.serviceAi.subCoreAi import SubCoreAi
from modules.services.serviceAi.subDocumentProcessing import SubDocumentProcessing
from modules.services.serviceAi.subWebResearch import SubWebResearch
from modules.services.serviceAi.subDocumentGeneration import SubDocumentGeneration
@ -19,7 +18,6 @@ class AiService:
Manager delegates to specialized sub-modules:
- SubCoreAi: Core AI operations (readImage, generateImage, callAi, planning, text calls)
- SubDocumentProcessing: Document chunking, processing, and merging logic
- SubWebResearch: Web research and crawling functionality
- SubDocumentGeneration: Single-file and multi-file document generation
The main service acts as a coordinator:
@ -40,7 +38,6 @@ class AiService:
self._extractionService = None # Lazy initialization
self._coreAi = None # Lazy initialization
self._documentProcessor = None # Lazy initialization
self._webResearch = None # Lazy initialization
self._documentGenerator = None # Lazy initialization
@property
@ -69,13 +66,6 @@ class AiService:
self._documentProcessor = SubDocumentProcessing(self.services, self.aiObjects)
return self._documentProcessor
@property
def webResearchService(self):
"""Lazy initialization of web research service."""
if self._webResearch is None:
logger.info("Lazy initializing SubWebResearch...")
self._webResearch = SubWebResearch(self.services, self.aiObjects)
return self._webResearch
@property
def documentGenerator(self):
@ -127,11 +117,6 @@ class AiService:
await self._ensureAiObjectsInitialized()
return await self.coreAi.generateImage(prompt, size, quality, style, options)
# Web Research
async def webResearch(self, request: WebResearchRequest) -> WebResearchResult:
"""Perform web research using interface functions."""
await self._ensureAiObjectsInitialized()
return await self.webResearchService.webResearch(request)
# Core AI Methods - Delegating to SubCoreAi
async def callAiPlanning(

View file

@ -1,388 +0,0 @@
import logging
from typing import Optional
from modules.aicore.aicorePluginTavily import WebResearchRequest, WebResearchResult
from modules.shared.configuration import APP_CONFIG
logger = logging.getLogger(__name__)
class SubWebResearch:
"""Web research operations including search, crawling, and analysis."""
def __init__(self, services, aiObjects):
"""Initialize web research service.
Args:
services: Service center instance for accessing other services
aiObjects: Initialized AiObjects instance
"""
self.services = services
self.aiObjects = aiObjects
async def webResearch(self, request: WebResearchRequest) -> WebResearchResult:
"""Perform web research using interface functions."""
try:
logger.info(f"WEB RESEARCH STARTED")
logger.info(f"User Query: {request.user_prompt}")
logger.info(f"Max Results: {request.max_results}, Max Pages: {request.options.max_pages}")
# Global URL index to track all processed URLs across the entire research session
global_processed_urls = set()
# Step 1: Find relevant websites - either provided URLs or AI-determined main URLs
logger.info(f"=== STEP 1: INITIAL MAIN URLS LIST ===")
if request.urls:
# Use provided URLs as initial main URLs
websites = request.urls
logger.info(f"Using provided URLs ({len(websites)}):")
for i, url in enumerate(websites, 1):
logger.info(f" {i}. {url}")
else:
# Use AI to determine main URLs based on user's intention
logger.info(f"AI analyzing user intent: '{request.user_prompt}'")
# Use AI to generate optimized Tavily search query and search parameters
query_optimizer_prompt = f"""You are a search query optimizer.
USER QUERY: {request.user_prompt}
Your task: Create a search query and parameters for the USER QUERY given.
RULES:
1. The search query MUST be related to the user query above
2. Extract key terms from the user query
3. Determine appropriate country/language based on the query context
4. Keep search query short (2-6 words)
Return ONLY this JSON format:
{{
"user_prompt": "search query based on user query above",
"country": "Full English country name (ISO-3166; map codes via pycountry/i18n-iso-countries)",
"language": "language_code_or_null",
"topic": "general|news|academic_or_null",
"time_range": "d|w|m|y_or_null",
"selection_strategy": "single|multiple|specific_page",
"selection_criteria": "what URLs to prioritize",
"expected_url_patterns": ["pattern1", "pattern2"],
"estimated_result_count": number
}}"""
# Get AI response for query optimization
from modules.datamodels.datamodelAi import AiCallRequest, AiCallOptions
ai_request = AiCallRequest(
prompt=query_optimizer_prompt,
options=AiCallOptions()
)
# Write web research query optimization prompt to debug file
self.services.utils.writeDebugFile(query_optimizer_prompt, "web_research_query_optimizer_prompt")
ai_response_obj = await self.aiObjects.call(ai_request)
ai_response = ai_response_obj.content
# Write web research query optimization response to debug file
self.services.utils.writeDebugFile(ai_response, "web_research_query_optimizer_response")
logger.debug(f"AI query optimizer response: {ai_response}")
# Parse AI response to extract search query
import json
try:
# Clean the response by removing markdown code blocks
cleaned_response = ai_response.strip()
if cleaned_response.startswith('```json'):
cleaned_response = cleaned_response[7:] # Remove ```json
if cleaned_response.endswith('```'):
cleaned_response = cleaned_response[:-3] # Remove ```
cleaned_response = cleaned_response.strip()
query_data = json.loads(cleaned_response)
search_query = query_data.get("user_prompt", request.user_prompt)
ai_country = query_data.get("country")
ai_language = query_data.get("language")
ai_topic = query_data.get("topic")
ai_time_range = query_data.get("time_range")
selection_strategy = query_data.get("selection_strategy", "multiple")
selection_criteria = query_data.get("selection_criteria", "relevant URLs")
expected_patterns = query_data.get("expected_url_patterns", [])
estimated_count = query_data.get("estimated_result_count", request.max_results)
logger.info(f"AI optimized search query: '{search_query}'")
logger.info(f"Selection strategy: {selection_strategy}")
logger.info(f"Selection criteria: {selection_criteria}")
logger.info(f"Expected URL patterns: {expected_patterns}")
logger.info(f"Estimated result count: {estimated_count}")
except json.JSONDecodeError:
logger.warning("Failed to parse AI response as JSON, using original query")
search_query = request.user_prompt
ai_country = None
ai_language = None
ai_topic = None
ai_time_range = None
selection_strategy = "multiple"
# Perform the web search with AI-determined parameters
search_kwargs = {
"query": search_query,
"max_results": request.max_results,
"search_depth": request.options.search_depth,
"auto_parameters": False # Use explicit parameters
}
# Add parameters only if they have valid values
def _normalizeCountry(c: Optional[str]) -> Optional[str]:
if not c:
return None
s = str(c).strip()
if not s or s.lower() in ['null', 'none', 'undefined']:
return None
# Map common codes to full English names when easy to do without extra deps
mapping = {
'ch': 'Switzerland', 'che': 'Switzerland',
'de': 'Germany', 'ger': 'Germany', 'deu': 'Germany',
'at': 'Austria', 'aut': 'Austria',
'us': 'United States', 'usa': 'United States', 'united states': 'United States',
'uk': 'United Kingdom', 'gb': 'United Kingdom', 'gbr': 'United Kingdom'
}
key = s.lower()
if key in mapping:
return mapping[key]
# If looks like full name, capitalize first letter only (Tavily accepts English names)
return s
norm_ai_country = _normalizeCountry(ai_country)
norm_req_country = _normalizeCountry(request.options.country)
if norm_ai_country:
search_kwargs["country"] = norm_ai_country
elif norm_req_country:
search_kwargs["country"] = norm_req_country
if ai_language and ai_language not in ['null', '', 'none', 'undefined']:
search_kwargs["language"] = ai_language
elif request.options.language and request.options.language not in ['null', '', 'none', 'undefined']:
search_kwargs["language"] = request.options.language
if ai_topic and ai_topic in ['general', 'news', 'academic']:
search_kwargs["topic"] = ai_topic
elif request.options.topic and request.options.topic in ['general', 'news', 'academic']:
search_kwargs["topic"] = request.options.topic
if ai_time_range and ai_time_range in ['d', 'w', 'm', 'y']:
search_kwargs["time_range"] = ai_time_range
elif request.options.time_range and request.options.time_range in ['d', 'w', 'm', 'y']:
search_kwargs["time_range"] = request.options.time_range
# Constrain by expected domains if provided by AI
try:
include_domains = []
for p in expected_patterns or []:
if not isinstance(p, str):
continue
# Extract bare domain from pattern or URL
import re
m = re.search(r"(?:https?://)?([^/\s]+)", p.strip())
if m:
domain = m.group(1).lower()
# strip leading www.
if domain.startswith('www.'):
domain = domain[4:]
include_domains.append(domain)
# Deduplicate
if include_domains:
seen = set()
uniq = []
for d in include_domains:
if d not in seen:
seen.add(d)
uniq.append(d)
search_kwargs["include_domains"] = uniq
except Exception:
pass
# Log the parameters being used
logger.info(f"Search parameters: country={search_kwargs.get('country', 'not_set')}, language={search_kwargs.get('language', 'not_set')}, topic={search_kwargs.get('topic', 'not_set')}, time_range={search_kwargs.get('time_range', 'not_set')}, include_domains={search_kwargs.get('include_domains', [])}")
search_results = await self.aiObjects.search_websites(**search_kwargs)
logger.debug(f"Web search returned {len(search_results)} results:")
for i, result in enumerate(search_results, 1):
logger.debug(f" {i}. {result.url} - {result.title}")
# Deduplicate while preserving order
seen = set()
search_urls = []
for r in search_results:
u = str(r.url)
if u not in seen:
seen.add(u)
search_urls.append(u)
logger.info(f"After initial deduplication: {len(search_urls)} unique URLs from {len(search_results)} search results")
if not search_urls:
logger.error("No relevant websites found")
return WebResearchResult(success=False, error="No relevant websites found")
# Now use AI to determine the main URLs based on user's intention
logger.info(f"AI selecting main URLs from {len(search_urls)} search results based on user intent")
# Create a prompt for AI to identify main URLs based on user's intention
ai_prompt = f"""
Select the most relevant URLs from these search results:
{chr(10).join([f"{i+1}. {url}" for i, url in enumerate(search_urls)])}
Return only the URLs that are most relevant for the user's query.
One URL per line.
"""
# Create AI call request
ai_request = AiCallRequest(
prompt=ai_prompt,
options=AiCallOptions()
)
# Write web research URL selection prompt to debug file
self.services.utils.writeDebugFile(ai_prompt, "web_research_url_selection_prompt")
ai_response_obj = await self.aiObjects.call(ai_request)
ai_response = ai_response_obj.content
# Write web research URL selection response to debug file
self.services.utils.writeDebugFile(ai_response, "web_research_url_selection_response")
logger.debug(f"AI response for main URL selection: {ai_response}")
# Parse AI response to extract URLs
websites = []
for line in ai_response.strip().split('\n'):
line = line.strip()
if line and ('http://' in line or 'https://' in line):
# Extract URL from the line
for word in line.split():
if word.startswith('http://') or word.startswith('https://'):
websites.append(word.rstrip('.,;'))
break
if not websites:
logger.warning("AI did not identify any main URLs, using first few search results")
websites = search_urls[:3] # Fallback to first 3 search results
# Deduplicate while preserving order
seen = set()
unique_websites = []
for url in websites:
if url not in seen:
seen.add(url)
unique_websites.append(url)
logger.info(f"After AI selection deduplication: {len(unique_websites)} unique URLs from {len(websites)} AI-selected URLs")
websites = unique_websites
logger.info(f"AI selected {len(websites)} main URLs (after deduplication):")
for i, url in enumerate(websites, 1):
logger.info(f" {i}. {url}")
# Step 2: Smart website selection using AI interface
logger.info(f"=== STEP 2: FILTERED URL LIST BY USER PROMPT'S INTENTION ===")
logger.info(f"AI analyzing {len(websites)} URLs for relevance to: '{request.user_prompt}'")
selectedWebsites, aiResponse = await self.aiObjects.selectRelevantWebsites(websites, request.user_prompt)
logger.debug(f"AI Response: {aiResponse}")
logger.debug(f"AI selected {len(selectedWebsites)} most relevant URLs:")
for i, url in enumerate(selectedWebsites, 1):
logger.debug(f" {i}. {url}")
# Show which were filtered out
filtered_out = [url for url in websites if url not in selectedWebsites]
if filtered_out:
logger.debug(f"Filtered out {len(filtered_out)} less relevant URLs:")
for i, url in enumerate(filtered_out, 1):
logger.debug(f" {i}. {url}")
# Step 3+4+5: Recursive crawling with configurable depth
# Get configuration parameters
max_depth = int(APP_CONFIG.get("Web_Research_MAX_DEPTH", "2"))
max_links_per_domain = int(APP_CONFIG.get("Web_Research_MAX_LINKS_PER_DOMAIN", "4"))
crawl_timeout_minutes = int(APP_CONFIG.get("Web_Research_CRAWL_TIMEOUT_MINUTES", "10"))
crawl_timeout_seconds = crawl_timeout_minutes * 60
# Use the configured max_depth or the request's pages_search_depth, whichever is smaller
effective_depth = min(max_depth, request.options.pages_search_depth)
logger.info(f"=== STEP 3+4+5: RECURSIVE CRAWLING (DEPTH {effective_depth}) ===")
logger.info(f"Starting recursive crawl of {len(selectedWebsites)} main websites...")
logger.info(f"Search depth: {effective_depth} levels (max configured: {max_depth})")
logger.info(f"Max links per domain: {max_links_per_domain}")
logger.info(f"Crawl timeout: {crawl_timeout_minutes} minutes")
# Use recursive crawling with URL index to avoid duplicates
import asyncio
try:
allContent = await asyncio.wait_for(
self.aiObjects.crawlRecursively(
urls=selectedWebsites,
max_depth=effective_depth,
extract_depth=request.options.extract_depth,
max_per_domain=max_links_per_domain,
global_processed_urls=global_processed_urls
),
timeout=crawl_timeout_seconds
)
logger.info(f"Crawling completed within timeout: {len(allContent)} pages crawled")
except asyncio.TimeoutError:
logger.warning(f"Crawling timed out after {crawl_timeout_minutes} minutes, using partial results")
# crawlRecursively handles timeouts internally and normally returns partial
# results before this outer timeout fires; if we do land here, no partial
# results are recoverable, so fall through to the empty-content error path
allContent = {}
if not allContent:
logger.error("Could not extract content from any websites")
return WebResearchResult(success=False, error="Could not extract content from any websites")
logger.info(f"=== WEB RESEARCH COMPLETED ===")
logger.info(f"Successfully crawled {len(allContent)} URLs total")
logger.info(f"Crawl depth: {effective_depth} levels")
# Create simple result with raw content
sources = [{"title": url, "url": url} for url in selectedWebsites]
# Get all additional links (all URLs except main ones)
additional_links = [url for url in allContent.keys() if url not in selectedWebsites]
# Combine all content into a single result
combinedContent = ""
for url, content in allContent.items():
combinedContent += f"\n\n=== {url} ===\n{content}\n"
# Create simplified document structure
document = {
"documentName": f"webResearch_{request.user_prompt[:50]}.json",
"documentData": {
"user_prompt": request.user_prompt,
"analysis_result": combinedContent,
"sources": sources,
"additional_links": additional_links,
"metadata": {
"websites_analyzed": len(allContent),
"additional_links_found": len(additional_links),
"crawl_depth": effective_depth,
"max_configured_depth": max_depth,
"max_links_per_domain": max_links_per_domain,
"crawl_timeout_minutes": crawl_timeout_minutes,
"total_urls_crawled": len(allContent),
"main_urls": len(selectedWebsites),
"additional_urls": len(additional_links)
}
},
"mimeType": "application/json"
}
return WebResearchResult(
success=True,
documents=[document]
)
except Exception as e:
logger.error(f"Error in web research: {str(e)}")
return WebResearchResult(success=False, error=str(e))
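The docstrings above point at pycountry for mapping ISO-3166 codes to full English names; the inline mapping avoids the extra dependency. A hedged sketch of the pycountry route (pycountry is a third-party package and an assumption here):

import pycountry  # third-party: pip install pycountry

def country_name(value):
    if not value or str(value).strip().lower() in {"null", "none", "undefined"}:
        return None
    s = str(value).strip()
    # Try two-letter, then three-letter codes, then a free-text lookup.
    for attempt in (lambda: pycountry.countries.get(alpha_2=s.upper()),
                    lambda: pycountry.countries.get(alpha_3=s.upper()),
                    lambda: pycountry.countries.lookup(s)):
        try:
            hit = attempt()
            if hit:
                return hit.name
        except LookupError:
            continue
    return s  # pass unrecognized values through unchanged

print(country_name("ch"))              # Switzerland
print(country_name("DEU"))             # Germany
print(country_name("United Kingdom"))  # United Kingdom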

View file

@ -10,7 +10,7 @@ from datetime import datetime, UTC
from modules.workflows.methods.methodBase import MethodBase, action
from modules.datamodels.datamodelChat import ActionResult
from modules.datamodels.datamodelAi import AiCallOptions
from modules.datamodels.datamodelAi import AiCallOptions, OperationTypeEnum
from modules.datamodels.datamodelChat import ChatDocument
from modules.aicore.aicorePluginTavily import WebResearchRequest
@ -28,6 +28,7 @@ class MethodAi(MethodBase):
"""Format current timestamp as YYYYMMDD-hhmmss for filenames."""
return datetime.now(UTC).strftime("%Y%m%d-%H%M%S")
@action
async def process(self, parameters: Dict[str, Any]) -> ActionResult:
"""
@ -161,93 +162,512 @@ class MethodAi(MethodBase):
error=str(e)
)
@action
async def webSearch(self, parameters: Dict[str, Any]) -> ActionResult:
"""
GENERAL:
- Purpose: Search the web and return a list of relevant URLs only.
- Input requirements: searchPrompt (required); optional maxResults, timeRange, country, language.
- Output format: JSON with search results and URLs.
Parameters:
- searchPrompt (str, required): Natural language search prompt describing what to search for.
- maxResults (int, optional): Maximum number of search results. Default: 5.
- timeRange (str, optional): d | w | m | y for time filtering.
- country (str, optional): Country name for localized results.
- language (str, optional): Language code (e.g., de, en, fr).
"""
try:
searchPrompt = parameters.get("searchPrompt")
if not searchPrompt:
return ActionResult.isFailure(error="Search prompt is required")
# Extract optional parameters
maxResults = parameters.get("maxResults", 5)
timeRange = parameters.get("timeRange")
country = parameters.get("country")
language = parameters.get("language")
# Build AI call options for web search
options = AiCallOptions(
operationType=OperationTypeEnum.WEB_SEARCH,
resultFormat="json"
)
# Create unified prompt JSON that both Tavily and Perplexity can understand
promptData = {
"searchPrompt": searchPrompt,
"maxResults": maxResults,
"timeRange": timeRange,
"country": country,
"language": language,
"instructions": "Search the web and return a JSON response with a 'results' array containing objects with 'title', 'url', and optionally 'content' fields. Focus on finding relevant URLs for the search prompt."
}
import json
prompt = json.dumps(promptData, indent=2)
# Call AI service through unified path
result = await self.services.ai.callAiDocuments(
prompt=prompt,
documents=None,
options=options,
outputFormat="json"
)
# Process result to ensure consistent format
processedResult = self._processWebSearchResult(result)
# Create meaningful filename
meaningfulName = self._generateMeaningfulFileName(
base_name="web_search",
extension="json",
action_name="search"
)
from modules.datamodels.datamodelChat import ActionDocument
actionDocument = ActionDocument(
documentName=meaningfulName,
documentData=processedResult,
mimeType="application/json"
)
return ActionResult.isSuccess(documents=[actionDocument])
except Exception as e:
logger.error(f"Error in web search: {str(e)}")
return ActionResult.isFailure(error=str(e))
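A hedged usage sketch for the action above. The method_ai instance and the shape of ActionResult.documents are assumptions about the surrounding wiring; the parameter names match the docstring.

import asyncio
import json

async def run_search(method_ai):  # method_ai: an initialized MethodAi instance (assumed)
    result = await method_ai.webSearch({
        "searchPrompt": "latest EU AI Act implementation deadlines",
        "maxResults": 3,
        "timeRange": "m",   # last month
        "language": "en",
    })
    if result.documents:  # assumes ActionDocument.documentData holds the JSON string
        payload = json.loads(result.documents[0].documentData)
        for hit in payload.get("results", []):
            print(hit.get("title"), "->", hit.get("url"))

# asyncio.run(run_search(method_ai))  # requires a configured service stack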
def _processWebSearchResult(self, result: str) -> str:
"""
Process web search result to ensure consistent JSON format with URL list.
Both Tavily and Perplexity now return proper JSON format.
"""
try:
import json
data = json.loads(result)
# If it's already a proper search result format, return as-is
if isinstance(data, dict) and "results" in data:
return result
# If it's a different JSON format, try to extract URLs
if isinstance(data, dict):
# Look for URL patterns in the JSON
urls = self._extractUrlsFromJson(data)
if urls:
processedData = {
"query": data.get("query", "web search"),
"results": [{"title": f"Result {i+1}", "url": url} for i, url in enumerate(urls)],
"total_count": len(urls)
}
return json.dumps(processedData, indent=2)
# No URLs found, return original result in a structured format
processedData = {
"query": "web search",
"results": [],
"total_count": 0,
"raw_response": result
}
return json.dumps(processedData, indent=2)
except Exception as e:
logger.warning(f"Error processing web search result: {str(e)}")
# Return original result wrapped in error format
errorData = {
"query": "web search",
"results": [],
"total_count": 0,
"error": f"Failed to process result: {str(e)}",
"raw_response": result
}
return json.dumps(errorData, indent=2)
def _extractUrlsFromJson(self, data: Dict[str, Any]) -> List[str]:
"""Extract URLs from JSON data structure."""
urls = []
def _extractFromValue(value):
if isinstance(value, str):
# Check if it's a URL
if value.startswith(('http://', 'https://')):
urls.append(value)
elif isinstance(value, dict):
for v in value.values():
_extractFromValue(v)
elif isinstance(value, list):
for item in value:
_extractFromValue(item)
_extractFromValue(data)
return list(set(urls)) # Remove duplicates
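The traversal above finds URL-shaped strings anywhere in a nested payload. The same walk as a standalone function, plus a note: the list(set(...)) dedup in the method discards order, so callers should not rely on result ordering.

def extract_urls(value):
    urls = []
    if isinstance(value, str) and value.startswith(("http://", "https://")):
        urls.append(value)
    elif isinstance(value, dict):
        for v in value.values():
            urls.extend(extract_urls(v))
    elif isinstance(value, list):
        for item in value:
            urls.extend(extract_urls(item))
    return urls

payload = {
    "answer": {"sources": [{"url": "https://a.example"},
                           {"link": "https://b.example/docs"}]},
    "note": "prose mentioning https is ignored",  # not a URL-shaped string
}
print(sorted(set(extract_urls(payload))))
# ['https://a.example', 'https://b.example/docs']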
@action
async def webCrawl(self, parameters: Dict[str, Any]) -> ActionResult:
"""
GENERAL:
- Purpose: Extract content from specific URLs.
- Input requirements: urls (required); optional extractDepth, format.
- Output format: JSON with extracted content from URLs.
Parameters:
- urls (list, required): List of URLs to crawl and extract content from.
- extractDepth (str, optional): basic | advanced. Default: advanced.
- format (str, optional): markdown | html | text. Default: markdown.
"""
try:
urls = parameters.get("urls")
if not urls or not isinstance(urls, list):
return ActionResult.isFailure(error="URLs list is required")
# Extract optional parameters
extractDepth = parameters.get("extractDepth", "advanced")
formatType = parameters.get("format", "markdown")
# Build AI call options for web crawling
options = AiCallOptions(
operationType=OperationTypeEnum.WEB_CRAWL,
resultFormat="json"
)
# Create unified prompt JSON for web crawling
promptData = {
"urls": urls,
"extractDepth": extractDepth,
"format": formatType,
"instructions": "Extract content from the provided URLs and return a JSON response with 'results' array containing objects with 'url', 'title', 'content', and 'extractedAt' fields."
}
import json
prompt = json.dumps(promptData, indent=2)
# Call AI service through unified path
result = await self.services.ai.callAiDocuments(
prompt=prompt,
documents=None,
options=options,
outputFormat="json"
)
# Create meaningful filename
meaningfulName = self._generateMeaningfulFileName(
base_name="web_crawl",
extension="json",
action_name="crawl"
)
from modules.datamodels.datamodelChat import ActionDocument
actionDocument = ActionDocument(
documentName=meaningfulName,
documentData=result,
mimeType="application/json"
)
return ActionResult.isSuccess(documents=[actionDocument])
except Exception as e:
logger.error(f"Error in web crawl: {str(e)}")
return ActionResult.isFailure(error=str(e))
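For clarity, the unified prompt the action above builds and the result shape its instructions request (field names taken from the instructions string; the example values are invented):

import json
from datetime import datetime, UTC

prompt = json.dumps({
    "urls": ["https://a.example/post"],
    "extractDepth": "advanced",
    "format": "markdown",
    "instructions": "Extract content from the provided URLs ...",
}, indent=2)

expected_shape = {
    "results": [{
        "url": "https://a.example/post",
        "title": "Example post",
        "content": "# Example post\n\nBody as markdown...",
        "extractedAt": datetime.now(UTC).isoformat(),
    }]
}
print(prompt)
print(json.dumps(expected_shape, indent=2))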
@action
async def webResearch(self, parameters: Dict[str, Any]) -> ActionResult:
"""
GENERAL:
- Purpose: Web research and information gathering with basic analysis and sources.
- Input requirements: user_prompt (required); optional urls, max_results, max_pages, search_depth, extract_depth, pages_search_depth, country, time_range, topic, language.
- Output format: JSON with results and sources.
- Purpose: Comprehensive web research combining search and content extraction.
- Input requirements: researchPrompt (required); optional maxResults, urls, timeRange, country, language.
- Output format: JSON with research results, sources, and analysis.
Parameters:
- user_prompt (str, required): Research question or topic.
- urls (list, optional): Specific URLs to crawl.
- max_results (int, optional): Max search results. Default: 5.
- max_pages (int, optional): Max pages to crawl per site. Default: 5.
- extract_depth (str, optional): basic | advanced. Default: advanced.
- search_depth (int, optional): Crawl depth level - how many times to follow sublinks of a page. Default: 2.
- country (str, optional): Full English country name (ISO-3166; map codes via pycountry/i18n-iso-countries).
- time_range (str, optional): d | w | m | y.
- topic (str, optional): general | news | academic.
- researchPrompt (str, required): Natural language research prompt describing what to research.
- maxResults (int, optional): Maximum search results. Default: 5.
- urls (list, optional): Specific URLs to include in research.
- timeRange (str, optional): d | w | m | y for time filtering.
- country (str, optional): Country name for localized results.
- language (str, optional): Language code (e.g., de, en, fr).
"""
try:
user_prompt = parameters.get("user_prompt")
researchPrompt = parameters.get("researchPrompt")
if not researchPrompt:
return ActionResult.isFailure(error="Research prompt is required")
# Extract optional parameters
maxResults = parameters.get("maxResults", 5)
urls = parameters.get("urls")
max_results = parameters.get("max_results", 5)
max_pages = parameters.get("max_pages", 5)
extract_depth = parameters.get("extract_depth", "advanced")
search_depth = parameters.get("pages_search_depth", 2)
timeRange = parameters.get("timeRange")
country = parameters.get("country")
time_range = parameters.get("time_range")
topic = parameters.get("topic")
language = parameters.get("language")
if not user_prompt:
return ActionResult.isFailure(
error="Search query is required"
# Build AI call options for web research
options = AiCallOptions(
operationType=OperationTypeEnum.WEB_RESEARCH,
resultFormat="json"
)
# Build WebResearchRequest (simplified dataclass)
request = WebResearchRequest(
user_prompt=user_prompt,
urls=urls,
max_results=max_results,
max_pages=max_pages,
search_depth=search_depth,
extract_depth=extract_depth,
country=country,
time_range=time_range,
topic=topic,
language=language
# Create unified prompt JSON for web research
promptData = {
"researchPrompt": researchPrompt,
"maxResults": maxResults,
"urls": urls,
"timeRange": timeRange,
"country": country,
"language": language,
"instructions": "Conduct comprehensive web research and return a JSON response with 'results' array containing objects with 'title', 'url', 'content', and 'analysis' fields. Provide detailed analysis and insights."
}
import json
prompt = json.dumps(promptData, indent=2)
# Call AI service through unified path
result = await self.services.ai.callAiDocuments(
prompt=prompt,
documents=None,
options=options,
outputFormat="json"
)
# Call web research service
logger.info(f"Performing comprehensive web research for: {user_prompt}")
logger.info(f"Max results: {max_results}, Max pages: {max_pages}")
if urls:
logger.info(f"Using provided URLs: {len(urls)}")
result = await self.services.ai.webResearch(request)
if not result.success:
return ActionResult.isFailure(error=result.error)
# Convert WebResearchResult to ActionResult format
documents = []
for doc in result.documents:
documents.append({
"documentName": doc.documentName,
"documentData": {
"user_prompt": doc.documentData.user_prompt,
"websites_analyzed": doc.documentData.websites_analyzed,
"additional_links_found": doc.documentData.additional_links_found,
"analysis_result": doc.documentData.analysis_result,
"sources": [{"title": s.title, "url": str(s.url)} for s in doc.documentData.sources],
"additional_links": doc.documentData.additional_links,
"debug_info": doc.documentData.debug_info
},
"mimeType": doc.mimeType
})
# Return result in the standard ActionResult format
return ActionResult.isSuccess(
documents=documents
# Create meaningful filename
meaningfulName = self._generateMeaningfulFileName(
base_name="web_research",
extension="json",
action_name="research"
)
from modules.datamodels.datamodelChat import ActionDocument
actionDocument = ActionDocument(
documentName=meaningfulName,
documentData=result,
mimeType="application/json"
)
return ActionResult.isSuccess(documents=[actionDocument])
except Exception as e:
logger.error(f"Error in web research: {str(e)}")
return ActionResult.isFailure(
error=str(e)
return ActionResult.isFailure(error=str(e))
@action
async def webQuestions(self, parameters: Dict[str, Any]) -> ActionResult:
"""
GENERAL:
- Purpose: Answer questions using web research and AI analysis.
- Input requirements: question (required); optional context, maxResults, timeRange, country, language.
- Output format: JSON with question answer and supporting sources.
Parameters:
- question (str, required): Question to be answered using web research.
- context (str, optional): Additional context for the question.
- maxResults (int, optional): Maximum search results. Default: 5.
- timeRange (str, optional): d | w | m | y for time filtering.
- country (str, optional): Country name for localized results.
- language (str, optional): Language code (e.g., de, en, fr).
"""
try:
question = parameters.get("question")
if not question:
return ActionResult.isFailure(error="Question is required")
# Extract optional parameters
context = parameters.get("context", "")
maxResults = parameters.get("maxResults", 5)
timeRange = parameters.get("timeRange")
country = parameters.get("country")
language = parameters.get("language")
# Build AI call options for web questions
options = AiCallOptions(
operationType=OperationTypeEnum.WEB_QUESTIONS,
resultFormat="json"
)
# Create unified prompt JSON for web questions
promptData = {
"question": question,
"context": context,
"maxResults": maxResults,
"timeRange": timeRange,
"country": country,
"language": language,
"instructions": "Answer the question using web research and return a JSON response with 'answer', 'sources' array containing objects with 'title', 'url', 'content', and 'relevance' fields."
}
import json
prompt = json.dumps(promptData, indent=2)
# Call AI service through unified path
result = await self.services.ai.callAiDocuments(
prompt=prompt,
documents=None,
options=options,
outputFormat="json"
)
# Create meaningful filename
meaningfulName = self._generateMeaningfulFileName(
base_name="web_questions",
extension="json",
action_name="questions"
)
from modules.datamodels.datamodelChat import ActionDocument
actionDocument = ActionDocument(
documentName=meaningfulName,
documentData=result,
mimeType="application/json"
)
return ActionResult.isSuccess(documents=[actionDocument])
except Exception as e:
logger.error(f"Error in web questions: {str(e)}")
return ActionResult.isFailure(error=str(e))
@action
async def webNews(self, parameters: Dict[str, Any]) -> ActionResult:
"""
GENERAL:
- Purpose: Search and analyze news articles on specific topics.
- Input requirements: newsPrompt (required); optional maxResults, timeRange, country, language.
- Output format: JSON with news articles, summaries, and analysis.
Parameters:
- newsPrompt (str, required): Natural language prompt describing what news to search for.
- maxResults (int, optional): Maximum news articles. Default: 5.
- timeRange (str, optional): d | w | m | y for time filtering. Default: w.
- country (str, optional): Country name for localized news.
- language (str, optional): Language code (e.g., de, en, fr).
"""
try:
newsPrompt = parameters.get("newsPrompt")
if not newsPrompt:
return ActionResult.isFailure(error="News prompt is required")
# Extract optional parameters
maxResults = parameters.get("maxResults", 5)
timeRange = parameters.get("timeRange", "w") # Default to week
country = parameters.get("country")
language = parameters.get("language")
# Build AI call options for web news
options = AiCallOptions(
operationType=OperationTypeEnum.WEB_NEWS,
resultFormat="json"
)
# Create unified prompt JSON for web news
promptData = {
"newsPrompt": newsPrompt,
"maxResults": maxResults,
"timeRange": timeRange,
"country": country,
"language": language,
"instructions": "Find and analyze recent news articles and return a JSON response with 'articles' array containing objects with 'title', 'url', 'content', 'date', 'source', and 'summary' fields."
}
import json
prompt = json.dumps(promptData, indent=2)
# Call AI service through unified path
result = await self.services.ai.callAiDocuments(
prompt=prompt,
documents=None,
options=options,
outputFormat="json"
)
# Create meaningful filename
meaningfulName = self._generateMeaningfulFileName(
base_name="web_news",
extension="json",
action_name="news"
)
from modules.datamodels.datamodelChat import ActionDocument
actionDocument = ActionDocument(
documentName=meaningfulName,
documentData=result,
mimeType="application/json"
)
return ActionResult.isSuccess(documents=[actionDocument])
except Exception as e:
logger.error(f"Error in web news: {str(e)}")
return ActionResult.isFailure(error=str(e))
@action
async def generateImage(self, parameters: Dict[str, Any]) -> ActionResult:
"""
GENERAL:
- Purpose: Generate images using AI based on text prompts.
- Input requirements: prompt (required); optional size, quality, style.
- Output format: Base64 encoded image data.
Parameters:
- prompt (str, required): Text description of the image to generate.
- size (str, optional): Image size. Options: 1024x1024, 1792x1024, 1024x1792. Default: 1024x1024.
- quality (str, optional): Image quality. Options: standard, hd. Default: standard.
- style (str, optional): Image style. Options: vivid, natural. Default: vivid.
"""
try:
prompt = parameters.get("prompt")
if not prompt:
return ActionResult.isFailure(error="Image prompt is required")
# Extract optional parameters
size = parameters.get("size", "1024x1024")
quality = parameters.get("quality", "standard")
style = parameters.get("style", "vivid")
# Build AI call options for image generation
options = AiCallOptions(
operationType=OperationTypeEnum.IMAGE_GENERATE,
resultFormat="base64"
)
# Create unified prompt JSON for image generation
promptData = {
"prompt": prompt,
"size": size,
"quality": quality,
"style": style,
"instructions": "Generate an image based on the prompt and return the base64 encoded image data."
}
import json
promptJson = json.dumps(promptData, indent=2)
# Call AI service through unified path
result = await self.services.ai.callAiDocuments(
prompt=promptJson,
documents=None,
options=options,
outputFormat="base64"
)
# Create meaningful filename
meaningfulName = self._generateMeaningfulFileName(
base_name="generated_image",
extension="png",
action_name="generate"
)
from modules.datamodels.datamodelChat import ActionDocument
actionDocument = ActionDocument(
documentName=meaningfulName,
documentData=result,
mimeType="image/png"
)
return ActionResult.isSuccess(documents=[actionDocument])
except Exception as e:
logger.error(f"Error in image generation: {str(e)}")
return ActionResult.isFailure(error=str(e))
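Finally, a hedged caller-side sketch for the action above: resultFormat="base64" suggests documentData carries base64-encoded PNG bytes, which is the assumption made here.

import asyncio
import base64

async def save_generated_image(method_ai, path="generated_image.png"):
    result = await method_ai.generateImage({
        "prompt": "a lighthouse at dawn, watercolor",
        "size": "1792x1024",
        "quality": "hd",
        "style": "natural",
    })
    if result.documents:
        raw = base64.b64decode(result.documents[0].documentData)  # assumed base64 payload
        with open(path, "wb") as fh:
            fh.write(raw)

# asyncio.run(save_generated_image(method_ai))  # needs a configured service stack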