import asyncio
import json
import logging
import re
from typing import Optional

from modules.aicore.aicorePluginTavily import WebResearchRequest, WebResearchResult
from modules.interfaces.interfaceAiObjects import AiObjects
from modules.shared.configuration import APP_CONFIG

logger = logging.getLogger(__name__)


class SubWebResearch:
    """Web research operations including search, crawling, and analysis."""

    def __init__(self, services, aiObjects: AiObjects):
        """Initialize the web research service.

        Args:
            services: Service center instance for accessing other services.
            aiObjects: Initialized AiObjects instance.
        """
        self.services = services
        self.aiObjects = aiObjects
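
    # Usage sketch (hypothetical wiring; the service-center object and the
    # exact WebResearchRequest fields depend on the host application):
    #
    #     research = SubWebResearch(services, aiObjects)
    #     result = await research.webResearch(
    #         WebResearchRequest(user_prompt="EV charging standards in Europe")
    #     )
    #     if result.success:
    #         print(result.documents[0]["documentName"])
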
    async def webResearch(self, request: WebResearchRequest) -> WebResearchResult:
        """Perform web research using interface functions."""
        try:
            logger.info("WEB RESEARCH STARTED")
            logger.info(f"User Query: {request.user_prompt}")
            logger.info(f"Max Results: {request.max_results}, Max Pages: {request.options.max_pages}")

            # Global URL index tracking all processed URLs across the research session
            global_processed_urls = set()

            # Step 1: Find relevant websites - either provided URLs or AI-determined main URLs
            logger.info("=== STEP 1: INITIAL MAIN URLS LIST ===")

            if request.urls:
                # Use the provided URLs as the initial main URLs
                websites = request.urls
                logger.info(f"Using provided URLs ({len(websites)}):")
                for i, url in enumerate(websites, 1):
                    logger.info(f"  {i}. {url}")
            else:
                # Use AI to determine main URLs based on the user's intention
                logger.info(f"AI analyzing user intent: '{request.user_prompt}'")

                # Use AI to generate an optimized Tavily search query and search parameters
                query_optimizer_prompt = f"""You are a search query optimizer.

USER QUERY: {request.user_prompt}

Your task: Create a search query and parameters for the USER QUERY given.

RULES:
1. The search query MUST be related to the user query above
2. Extract key terms from the user query
3. Determine the appropriate country/language based on the query context
4. Keep the search query short (2-6 words)

Return ONLY this JSON format:
{{
    "user_prompt": "search query based on the user query above",
    "country": "full English country name (e.g. Germany) or null",
    "language": "language_code_or_null",
    "topic": "general|news|academic_or_null",
    "time_range": "d|w|m|y_or_null",
    "selection_strategy": "single|multiple|specific_page",
    "selection_criteria": "what URLs to prioritize",
    "expected_url_patterns": ["pattern1", "pattern2"],
    "estimated_result_count": number
}}"""

                # Get the AI response for query optimization
                from modules.datamodels.datamodelAi import AiCallRequest, AiCallOptions
                ai_request = AiCallRequest(
                    prompt=query_optimizer_prompt,
                    options=AiCallOptions()
                )

                # Write the query optimization prompt to a debug file
                self.services.utils.writeDebugFile(query_optimizer_prompt, "web_research_query_optimizer_prompt")

                ai_response_obj = await self.aiObjects.call(ai_request)
                ai_response = ai_response_obj.content

                # Write the query optimization response to a debug file
                self.services.utils.writeDebugFile(ai_response, "web_research_query_optimizer_response")
                logger.debug(f"AI query optimizer response: {ai_response}")

                # Parse the AI response to extract the search query
                try:
                    # Clean the response by removing markdown code fences
                    cleaned_response = ai_response.strip()
                    if cleaned_response.startswith('```json'):
                        cleaned_response = cleaned_response[7:]  # Drop the leading ```json
                    if cleaned_response.endswith('```'):
                        cleaned_response = cleaned_response[:-3]  # Drop the trailing ```
                    cleaned_response = cleaned_response.strip()
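                    # Illustration of the cleanup above (hypothetical model output):
                    #   '```json\n{"user_prompt": "ev charging"}\n```'
                    #   -> '{"user_prompt": "ev charging"}'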

                    query_data = json.loads(cleaned_response)
                    search_query = query_data.get("user_prompt", request.user_prompt)
                    ai_country = query_data.get("country")
                    ai_language = query_data.get("language")
                    ai_topic = query_data.get("topic")
                    ai_time_range = query_data.get("time_range")
                    selection_strategy = query_data.get("selection_strategy", "multiple")
                    selection_criteria = query_data.get("selection_criteria", "relevant URLs")
                    expected_patterns = query_data.get("expected_url_patterns", [])
                    estimated_count = query_data.get("estimated_result_count", request.max_results)

                    logger.info(f"AI optimized search query: '{search_query}'")
                    logger.info(f"Selection strategy: {selection_strategy}")
                    logger.info(f"Selection criteria: {selection_criteria}")
                    logger.info(f"Expected URL patterns: {expected_patterns}")
                    logger.info(f"Estimated result count: {estimated_count}")

                except json.JSONDecodeError:
                    logger.warning("Failed to parse AI response as JSON, using original query")
                    search_query = request.user_prompt
                    ai_country = None
                    ai_language = None
                    ai_topic = None
                    ai_time_range = None
                    selection_strategy = "multiple"
                    selection_criteria = "relevant URLs"
                    # Default these as well so later references do not raise NameError
                    expected_patterns = []
                    estimated_count = request.max_results

                # Perform the web search with AI-determined parameters
                search_kwargs = {
                    "query": search_query,
                    "max_results": request.max_results,
                    "search_depth": request.options.search_depth,
                    "auto_parameters": False  # Use explicit parameters
                }

                # Add optional parameters only if they have valid values
                def _normalizeCountry(c: Optional[str]) -> Optional[str]:
                    if not c:
                        return None
                    s = str(c).strip()
                    if not s or s.lower() in ['null', 'none', 'undefined']:
                        return None
                    # Map common codes to full English names without extra dependencies
                    mapping = {
                        'ch': 'Switzerland', 'che': 'Switzerland',
                        'de': 'Germany', 'ger': 'Germany', 'deu': 'Germany',
                        'at': 'Austria', 'aut': 'Austria',
                        'us': 'United States', 'usa': 'United States', 'united states': 'United States',
                        'uk': 'United Kingdom', 'gb': 'United Kingdom', 'gbr': 'United Kingdom'
                    }
                    key = s.lower()
                    if key in mapping:
                        return mapping[key]
                    # Otherwise assume it is already a full English name (which Tavily accepts)
                    return s
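
                # Behaviour of the normalizer, for reference:
                #   _normalizeCountry("che")    -> "Switzerland"
                #   _normalizeCountry("null")   -> None
                #   _normalizeCountry("France") -> "France"  (passed through unchanged)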

                norm_ai_country = _normalizeCountry(ai_country)
                norm_req_country = _normalizeCountry(request.options.country)
                if norm_ai_country:
                    search_kwargs["country"] = norm_ai_country
                elif norm_req_country:
                    search_kwargs["country"] = norm_req_country

                if ai_language and ai_language not in ['null', '', 'none', 'undefined']:
                    search_kwargs["language"] = ai_language
                elif request.options.language and request.options.language not in ['null', '', 'none', 'undefined']:
                    search_kwargs["language"] = request.options.language

                if ai_topic and ai_topic in ['general', 'news', 'academic']:
                    search_kwargs["topic"] = ai_topic
                elif request.options.topic and request.options.topic in ['general', 'news', 'academic']:
                    search_kwargs["topic"] = request.options.topic

                if ai_time_range and ai_time_range in ['d', 'w', 'm', 'y']:
                    search_kwargs["time_range"] = ai_time_range
                elif request.options.time_range and request.options.time_range in ['d', 'w', 'm', 'y']:
                    search_kwargs["time_range"] = request.options.time_range

                # Constrain the search to expected domains if the AI provided any
                try:
                    include_domains = []
                    for p in expected_patterns or []:
                        if not isinstance(p, str):
                            continue
                        # Extract the bare domain from a pattern or URL
                        m = re.search(r"(?:https?://)?([^/\s]+)", p.strip())
                        if m:
                            domain = m.group(1).lower()
                            # Strip a leading "www."
                            if domain.startswith('www.'):
                                domain = domain[4:]
                            include_domains.append(domain)
                    # Deduplicate while preserving order
                    if include_domains:
                        seen = set()
                        uniq = []
                        for d in include_domains:
                            if d not in seen:
                                seen.add(d)
                                uniq.append(d)
                        search_kwargs["include_domains"] = uniq
                except Exception:
                    pass
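
                # Domain extraction above, illustrated:
                #   "https://www.example.com/path" -> "example.com"
                #   "docs.example.org/*"           -> "docs.example.org"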

                # Log the parameters being used
                logger.info(
                    f"Search parameters: country={search_kwargs.get('country', 'not_set')}, "
                    f"language={search_kwargs.get('language', 'not_set')}, "
                    f"topic={search_kwargs.get('topic', 'not_set')}, "
                    f"time_range={search_kwargs.get('time_range', 'not_set')}, "
                    f"include_domains={search_kwargs.get('include_domains', [])}"
                )
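
                # A fully populated search_kwargs would look roughly like this
                # (illustrative values only):
                #   {"query": "ev charging standards", "max_results": 5,
                #    "search_depth": "advanced", "auto_parameters": False,
                #    "country": "Germany", "language": "de", "topic": "news",
                #    "time_range": "w", "include_domains": ["example.com"]}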

                search_results = await self.aiObjects.search_websites(**search_kwargs)

                logger.debug(f"Web search returned {len(search_results)} results:")
                for i, result in enumerate(search_results, 1):
                    logger.debug(f"  {i}. {result.url} - {result.title}")

                # Deduplicate while preserving order
                seen = set()
                search_urls = []
                for r in search_results:
                    u = str(r.url)
                    if u not in seen:
                        seen.add(u)
                        search_urls.append(u)

                logger.info(f"After initial deduplication: {len(search_urls)} unique URLs from {len(search_results)} search results")

                if not search_urls:
                    logger.error("No relevant websites found")
                    return WebResearchResult(success=False, error="No relevant websites found")

                # Now use AI to select the main URLs based on the user's intention
                logger.info(f"AI selecting main URLs from {len(search_urls)} search results based on user intent")

                # Build a prompt asking the AI to identify the main URLs; the user
                # query is included so the model knows what "relevant" means here
                url_list = "\n".join(f"{i}. {url}" for i, url in enumerate(search_urls, 1))
                ai_prompt = f"""
Select the URLs most relevant to the user's query from these search results:

USER QUERY: {request.user_prompt}

{url_list}

Return only the URLs that are most relevant for the user's query.
One URL per line.
"""
                # Create the AI call request
                ai_request = AiCallRequest(
                    prompt=ai_prompt,
                    options=AiCallOptions()
                )

                # Write the URL selection prompt to a debug file
                self.services.utils.writeDebugFile(ai_prompt, "web_research_url_selection_prompt")

                ai_response_obj = await self.aiObjects.call(ai_request)
                ai_response = ai_response_obj.content

                # Write the URL selection response to a debug file
                self.services.utils.writeDebugFile(ai_response, "web_research_url_selection_response")
                logger.debug(f"AI response for main URL selection: {ai_response}")

                # Parse the AI response to extract URLs
                websites = []
                for line in ai_response.strip().split('\n'):
                    line = line.strip()
                    if line and ('http://' in line or 'https://' in line):
                        # Extract the first URL from the line
                        for word in line.split():
                            if word.startswith('http://') or word.startswith('https://'):
                                websites.append(word.rstrip('.,;'))
                                break
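
                # Parsing above, illustrated on a hypothetical response line:
                #   "1. https://example.com/page, plus context" -> "https://example.com/page"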

                if not websites:
                    logger.warning("AI did not identify any main URLs, using first few search results")
                    websites = search_urls[:3]  # Fall back to the first 3 search results

                # Deduplicate while preserving order
                seen = set()
                unique_websites = []
                for url in websites:
                    if url not in seen:
                        seen.add(url)
                        unique_websites.append(url)

                logger.info(f"After AI selection deduplication: {len(unique_websites)} unique URLs from {len(websites)} AI-selected URLs")
                websites = unique_websites

                logger.info(f"AI selected {len(websites)} main URLs (after deduplication):")
                for i, url in enumerate(websites, 1):
                    logger.info(f"  {i}. {url}")

            # Step 2: Smart website selection using the AI interface
            logger.info("=== STEP 2: FILTERED URL LIST BY USER PROMPT'S INTENTION ===")
            logger.info(f"AI analyzing {len(websites)} URLs for relevance to: '{request.user_prompt}'")

            selectedWebsites, aiResponse = await self.aiObjects.selectRelevantWebsites(websites, request.user_prompt)

            logger.debug(f"AI Response: {aiResponse}")
            logger.debug(f"AI selected {len(selectedWebsites)} most relevant URLs:")
            for i, url in enumerate(selectedWebsites, 1):
                logger.debug(f"  {i}. {url}")

            # Show which URLs were filtered out
            filtered_out = [url for url in websites if url not in selectedWebsites]
            if filtered_out:
                logger.debug(f"Filtered out {len(filtered_out)} less relevant URLs:")
                for i, url in enumerate(filtered_out, 1):
                    logger.debug(f"  {i}. {url}")

            # Steps 3+4+5: Recursive crawling with configurable depth
            # Read the configuration parameters
            max_depth = int(APP_CONFIG.get("Web_Research_MAX_DEPTH", "2"))
            max_links_per_domain = int(APP_CONFIG.get("Web_Research_MAX_LINKS_PER_DOMAIN", "4"))
            crawl_timeout_minutes = int(APP_CONFIG.get("Web_Research_CRAWL_TIMEOUT_MINUTES", "10"))
            crawl_timeout_seconds = crawl_timeout_minutes * 60
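
            # The corresponding configuration entries (hypothetical values shown;
            # the string values are parsed to int above):
            #   Web_Research_MAX_DEPTH = "2"
            #   Web_Research_MAX_LINKS_PER_DOMAIN = "4"
            #   Web_Research_CRAWL_TIMEOUT_MINUTES = "10"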

            # Use the configured max_depth or the request's pages_search_depth, whichever is smaller
            effective_depth = min(max_depth, request.options.pages_search_depth)

            logger.info(f"=== STEP 3+4+5: RECURSIVE CRAWLING (DEPTH {effective_depth}) ===")
            logger.info(f"Starting recursive crawl of {len(selectedWebsites)} main websites...")
            logger.info(f"Search depth: {effective_depth} levels (max configured: {max_depth})")
            logger.info(f"Max links per domain: {max_links_per_domain}")
            logger.info(f"Crawl timeout: {crawl_timeout_minutes} minutes")

            # Crawl recursively, using the global URL index to avoid duplicates
            try:
                allContent = await asyncio.wait_for(
                    self.aiObjects.crawlRecursively(
                        urls=selectedWebsites,
                        max_depth=effective_depth,
                        extract_depth=request.options.extract_depth,
                        max_per_domain=max_links_per_domain,
                        global_processed_urls=global_processed_urls
                    ),
                    timeout=crawl_timeout_seconds
                )
                logger.info(f"Crawling completed within timeout: {len(allContent)} pages crawled")
            except asyncio.TimeoutError:
                logger.warning(f"Crawling timed out after {crawl_timeout_minutes} minutes")
                # asyncio.wait_for cancels crawlRecursively on timeout, so any partial
                # results collected inside it are not reachable here; continue with an
                # empty result set, which is reported as a failure below
                allContent = {}

            if not allContent:
                logger.error("Could not extract content from any websites")
                return WebResearchResult(success=False, error="Could not extract content from any websites")

            logger.info("=== WEB RESEARCH COMPLETED ===")
            logger.info(f"Successfully crawled {len(allContent)} URLs total")
            logger.info(f"Crawl depth: {effective_depth} levels")

            # Build a simple result with the raw content
            # (no page titles are available here, so the URL doubles as the title)
            sources = [{"title": url, "url": url} for url in selectedWebsites]

            # Collect all additional links (every crawled URL except the main ones)
            additional_links = [url for url in allContent.keys() if url not in selectedWebsites]

            # Combine all content into a single string
            combinedContent = ""
            for url, content in allContent.items():
                combinedContent += f"\n\n=== {url} ===\n{content}\n"

            # Create a simplified document structure
            document = {
                "documentName": f"webResearch_{request.user_prompt[:50]}.json",
                "documentData": {
                    "user_prompt": request.user_prompt,
                    "analysis_result": combinedContent,
                    "sources": sources,
                    "additional_links": additional_links,
                    "metadata": {
                        "websites_analyzed": len(allContent),
                        "additional_links_found": len(additional_links),
                        "crawl_depth": effective_depth,
                        "max_configured_depth": max_depth,
                        "max_links_per_domain": max_links_per_domain,
                        "crawl_timeout_minutes": crawl_timeout_minutes,
                        "total_urls_crawled": len(allContent),
                        "main_urls": len(selectedWebsites),
                        "additional_urls": len(additional_links)
                    }
                },
                "mimeType": "application/json"
            }

            return WebResearchResult(
                success=True,
                documents=[document]
            )

        except Exception as e:
            logger.error(f"Error in web research: {e}")
            return WebResearchResult(success=False, error=str(e))