# gateway/modules/chat_agent_webcrawler.py
"""
Webcrawler agent for research and retrieval of information from the web.
Optimized for the new task-based processing.
"""
import logging
import json
import re
import time
from typing import Dict, Any, List
from urllib.parse import quote_plus, unquote
from bs4 import BeautifulSoup
import requests
import markdown
from modules.chat_registry import AgentBase
from modules.configuration import APP_CONFIG
logger = logging.getLogger(__name__)
class AgentWebcrawler(AgentBase):
    """Agent for web research and information retrieval."""

    def __init__(self):
        """Initialize the webcrawler agent."""
        super().__init__()
        self.name = "webcrawler"
        self.description = "Conducts web research and collects information from online sources"
        self.capabilities = [
            "web_search",
            "information_retrieval",
            "data_collection",
            "search_results_analysis",
            "webpage_content_extraction",
        ]
        # Web crawling configuration
        self.max_url = int(APP_CONFIG.get("Agent_Webcrawler_MAX_URLS", "5"))
        self.max_key = int(APP_CONFIG.get("Agent_Webcrawler_MAX_SEARCH_KEYWORDS", "3"))
        self.max_result = int(APP_CONFIG.get("Agent_Webcrawler_MAX_SEARCH_RESULTS", "5"))
        self.timeout = int(APP_CONFIG.get("Agent_Webcrawler_TIMEOUT", "30"))

    def set_dependencies(self, ai_service=None):
        """Set external dependencies for the agent."""
        self.ai_service = ai_service

    async def process_task(self, task: Dict[str, Any]) -> Dict[str, Any]:
        """
        Process a standardized task structure and conduct web research.

        Args:
            task: A dictionary containing:
                - task_id: Unique ID for this task
                - prompt: The main instruction for the agent
                - input_documents: List of documents to process
                - output_specifications: List of required output documents
                - context: Additional contextual information

        Returns:
            A dictionary containing:
                - feedback: Text response explaining the research results
                - documents: List of created document objects
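
        Example task (illustrative values, not taken from a real run):
            {
                "task_id": "t-001",
                "prompt": "Research https://example.com and summarize the findings",
                "input_documents": [],
                "output_specifications": [
                    {"label": "results.md", "description": "Markdown summary"}
                ],
                "context": {}
            }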
"""
        try:
            # Extract relevant task information
            prompt = task.get("prompt", "")
            output_specs = task.get("output_specifications", [])

            # Check if an AI service is available
            if not self.ai_service:
                logger.error("No AI service configured for the Webcrawler agent")
                return {
                    "feedback": "The Webcrawler agent is not properly configured.",
                    "documents": []
                }

            # Check if this is a web research request
            is_web_research = await self._is_web_research_request(prompt)
            if not is_web_research:
                logger.info("Request rejected: not a web research task")
                return {
                    "feedback": "This request doesn't appear to require web research.",
                    "documents": []
                }

            # Proceed with web research
            logger.info(f"Web research for: {prompt[:50]}...")

            # Create the search strategy
            search_strategy = await self._create_search_strategy(prompt)
            search_keys = search_strategy.get("skey", [])
            search_urls = search_strategy.get("url", [])
            if search_keys:
                logger.info(f"Searching for {len(search_keys)} key terms: {', '.join(search_keys[:2])}...")
            if search_urls:
                logger.info(f"Searching in {len(search_urls)} direct URLs: {', '.join(search_urls[:2])}...")

            # Execute the search
            results = []

            # Process search terms
            for keyword in search_keys:
                logger.info(f"Searching the web for: '{keyword}'")
                keyword_results = self._search_web(keyword)
                results.extend(keyword_results)
                logger.info(f"Found: {len(keyword_results)} results for '{keyword}'")

            # Process direct URLs
            for url in search_urls:
                logger.info(f"Extracting content from: {url}")
                soup = self._read_url(url)
                # Extract the title from the page, if available
                title = self._extract_title(soup, url)
                result = self._parse_result(soup, title, url)
                results.append(result)
                logger.info(f"Extracted: '{title}' from {url}")

            # Process the results for the final output
            logger.info(f"Analyzing {len(results)} web results")

            # Generate summaries for each result
            processed_results = []
            for i, result in enumerate(results):
                result_data_limited = self._limit_text(result['data'], max_chars=10000)
                logger.info(f"Analyzing result {i+1}/{len(results)}: {result['title'][:30]}...")
                # Defensive check: without an AI service, create a minimal summary
                if not self.ai_service:
                    content_summary = f"Extract from {result['url']} ({len(result_data_limited)} characters)"
                else:
                    # Generate the summary with AI
                    content_summary = await self._summarize_result(result_data_limited, prompt)
                processed_result = {
                    "title": result['title'],
                    "url": result['url'],
                    "snippet": result['snippet'],
                    "summary": content_summary
                }
                processed_results.append(processed_result)

            # Create the overall summary
            all_summaries = "\n\n".join([r["summary"] for r in processed_results])
            all_summaries_limited = self._limit_text(all_summaries, max_chars=10000)
            logger.info("Creating overall summary of web research")
            if not self.ai_service:
                final_summary = f"Summary of {len(processed_results)} web research results"
            else:
                final_summary = await self.ai_service.call_api([
                    {"role": "system", "content": "You create concise summaries of research results."},
                    {"role": "user", "content": f"Please summarize these findings in 5-6 sentences: {all_summaries_limited}\n"}
                ])

            # Get localized headers for the output
            headers = await self._get_localized_headers(prompt)

            # Create document objects based on the output specifications
            generated_documents = []

            # Generate the appropriate document for each requested output
            for spec in output_specs:
                output_label = spec.get("label", "")
                output_description = spec.get("description", "")
                # Determine the output format based on the file extension
                format_type = self._determine_format_type(output_label)
                # Generate content based on the format and requirements
                if format_type in ("markdown", "text"):
                    content = self._format_results_as_markdown(processed_results, final_summary, headers)
                elif format_type == "html":
                    md_content = self._format_results_as_markdown(processed_results, final_summary, headers)
                    content = markdown.markdown(md_content)
                elif format_type == "json":
                    content = json.dumps({
                        "summary": final_summary,
                        "results": processed_results
                    }, indent=2, ensure_ascii=False)
                elif format_type == "csv":
                    csv_lines = ["Title,URL,Snippet"]
                    for result in processed_results:
                        # Escape double quotes by doubling them; quoting each
                        # field also makes embedded commas safe
                        title = result["title"].replace('"', '""')
                        url = result["url"].replace('"', '""')
                        snippet = result["snippet"].replace('"', '""')
                        csv_line = f'"{title}","{url}","{snippet}"'
                        csv_lines.append(csv_line)
                    content = "\n".join(csv_lines)
                else:
                    # Default: Markdown
                    content = self._format_results_as_markdown(processed_results, final_summary, headers)
                # Add the document to the results list
                generated_documents.append({
                    "label": output_label,
                    "content": content
                })

            # If no specific outputs were requested, return a standard document
            if not output_specs:
                content = self._format_results_as_markdown(processed_results, final_summary, headers)
                generated_documents.append({
                    "label": "web_research_results.md",
                    "content": content
                })

            # Create the feedback for the response
            feedback = f"I conducted web research on '{prompt[:50]}...' and found {len(processed_results)} relevant results."
            logger.info("Web research completed successfully")
            return {
                "feedback": feedback,
                "documents": generated_documents
            }
        except Exception as e:
            error_msg = f"Error during web research: {str(e)}"
            logger.error(error_msg)
            return {
                "feedback": f"An error occurred during the web research: {str(e)}",
                "documents": []
            }

    def _determine_format_type(self, output_label: str) -> str:
        """
        Determine the format type based on the filename.

        Args:
            output_label: Output filename

        Returns:
            Format type (markdown, html, text, json, csv)
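
        Example (doctest-style illustration, assuming default configuration):
            >>> AgentWebcrawler()._determine_format_type("report.JSON")
            'json'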
"""
output_label_lower = output_label.lower()
if output_label_lower.endswith(".md"):
return "markdown"
elif output_label_lower.endswith(".html"):
return "html"
elif output_label_lower.endswith(".txt"):
return "text"
elif output_label_lower.endswith(".json"):
return "json"
elif output_label_lower.endswith(".csv"):
return "csv"
else:
# Default to markdown
return "markdown"

    def _format_results_as_markdown(self, results: List[Dict[str, Any]],
                                    summary: str, headers: Dict[str, str]) -> str:
        """
        Format research results as markdown.

        Args:
            results: List of results
            summary: Summary of all results
            headers: Localized headers

        Returns:
            Formatted markdown text
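
        Note:
            `headers` is expected to contain the keys web_research_results,
            summary, detailed_results, url, snippet, and content, as produced
            by _get_localized_headers.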
"""
md_content = f"# {headers['web_research_results']}\n\n"
md_content += f"## {headers['summary']}\n\n{summary}\n\n"
if results:
md_content += f"## {headers['detailed_results']}\n\n"
for i, result in enumerate(results, 1):
md_content += f"### {i}. {result['title']}\n\n"
md_content += f"**{headers['url']}**: {result['url']}\n\n"
md_content += f"**{headers['snippet']}**: {result['snippet']}\n\n"
md_content += f"**{headers['content']}**: {result['summary']}\n\n"
# Add separator between results (except for the last one)
if i < len(results):
md_content += "---\n\n"
return md_content

    async def _is_web_research_request(self, prompt: str) -> bool:
        """
        Use AI to determine whether a request requires web research.

        Args:
            prompt: The user request

        Returns:
            True if the request requires web research, False otherwise
        """
        if not self.ai_service:
            # Fall back to simpler detection if no AI service is available
            return self._simple_web_detection(prompt)
        try:
            # Create a prompt to analyze whether this is a web research request
            analysis_prompt = f"""
Analyze the following request and determine if it explicitly requires web research or online information.

REQUEST: {prompt}

A request requires web research if:
1. It explicitly asks for searching information online
2. It contains URLs or references to websites
3. It requests current information that would be available on the web
4. It asks for information from web sources
5. It implicitly requires current information from the internet

Reply ONLY with a single word - either "YES" if web research is required, or "NO" if not.
"""
            # Call AI for the analysis
            response = await self.ai_service.call_api([
                {"role": "system", "content": "You determine if a request requires web research. Always respond with just YES or NO."},
                {"role": "user", "content": analysis_prompt}
            ])
            # Clean the response and check it
            response = response.strip().upper()
            return "YES" in response
        except Exception as e:
            # Log the error but don't fail; fall back to simpler detection
            logger.warning(f"Error in AI detection of web research requests: {str(e)}")
            return self._simple_web_detection(prompt)

    def _simple_web_detection(self, prompt: str) -> bool:
        """
        Simpler fallback method for detecting web research requests, based on
        URL patterns and web-related keywords.

        Args:
            prompt: The user request

        Returns:
            True if there are clear URL or web-term indicators, False otherwise
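
        Example (doctest-style illustration, assuming default configuration):
            >>> AgentWebcrawler()._simple_web_detection("Please look up https://example.org")
            True
            >>> AgentWebcrawler()._simple_web_detection("Write a poem for my mother")
            False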
"""
# URLs in the request strongly indicate web research
url_indicators = ["http://", "https://", "www.", ".com", ".org", ".net", ".edu", ".gov"]
web_terms = ["search", "find online", "look up", "web", "internet", "website"]
# Check for URL patterns in the request
contains_url = any(indicator in prompt.lower() for indicator in url_indicators)
contains_web_term = any(term in prompt.lower() for term in web_terms)
return contains_url or contains_web_term

    async def _create_search_strategy(self, prompt: str) -> Dict[str, List[str]]:
        """
        Create a search strategy based on the request.

        Args:
            prompt: The user request

        Returns:
            Search strategy with URLs and search terms
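
        Example return value (illustrative; the keys match those read by
        process_task):
            {"skey": ["term one", "term two"], "url": ["https://example.com"]}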
"""
if not self.ai_service:
# Fallback to simple strategy
return {"skey": [prompt], "url": []}
try:
# AI prompt to create a search strategy
strategy_prompt = f"""Create a comprehensive web research strategy for the following task:
'{prompt.replace("'","")}'
Return the results as a Python dictionary with these specific keys:
'url': A list of up to {self.max_url} specific URLs extracted from the task.
'skey': A list of up to {self.max_key} key phrases to search for on the web. These should be precise, diverse, and targeted to get the most relevant information.
If specific URLs are given and the task only requires analyzing these URLs, leave 'skey' empty.
Format your response as a valid JSON object with these two keys. Don't add any explanatory text.
"""
# Call AI for search strategy
content_text = await self.ai_service.call_api([
{"role": "system", "content": "You are a web research expert who develops precise search strategies."},
{"role": "user", "content": strategy_prompt}
])
# Remove JSON code block markers if present
if content_text.startswith("```json"):
end_marker = "```"
end_index = content_text.rfind(end_marker)
if end_index != -1:
content_text = content_text[7:end_index].strip()
elif content_text.startswith("```"):
end_marker = "```"
end_index = content_text.rfind(end_marker)
if end_index != -1:
content_text = content_text[3:end_index].strip()
# Extract only the JSON part (if surrounded by text)
json_match = re.search(r'(\{.*\})', content_text, re.DOTALL)
if json_match:
content_text = json_match.group(1)
# Parse JSON and return
strategy = json.loads(content_text)
return strategy
except Exception as e:
logger.error(f"Error creating search strategy: {str(e)}")
# Simple fallback strategy
return {"skey": [prompt], "url": []}

    async def _summarize_result(self, result_data: str, original_prompt: str) -> str:
        """
        Create a summary of a search result using AI.

        Args:
            result_data: The data to summarize
            original_prompt: The original request

        Returns:
            Summary of the result
        """
        if not self.ai_service:
            return f"Summary of {len(result_data)} characters not available (AI service not available)"
        try:
            # Instructions for the summarization
            summary_prompt = f"""
Summarize this search result according to the original request in about 2000 characters.
Original request = '{original_prompt.replace("'", "")}'

Focus on the most important findings and connect them to the original request.
Extract only relevant and high-quality information.

Here's the search result:
{result_data}
"""
            # Call AI for the summary
            summary = await self.ai_service.call_api([
                {"role": "system", "content": "You are an information analyst who summarizes web content precisely and relevantly."},
                {"role": "user", "content": summary_prompt}
            ])
            # Limit to ~2000 characters
            return summary[:2000]
        except Exception as e:
            logger.error(f"Error summarizing result: {str(e)}")
            return "Error creating summary"

    async def _get_localized_headers(self, text: str) -> Dict[str, str]:
        """
        Determine localized headers for web research results based on the
        detected language.

        Args:
            text: Text for language detection

        Returns:
            Dictionary with localized headers
        """
        # Default English headers
        headers = {
            "web_research_results": "Web Research Results",
            "summary": "Summary",
            "detailed_results": "Detailed Results",
            "url": "URL",
            "snippet": "Snippet",
            "content": "Content"
        }
        if not self.ai_service:
            return headers
        try:
            # Detect the language
            language_prompt = f"What language is this text written in? Answer with just the language name: {text[:200]}"
            language = await self.ai_service.call_api([
                {"role": "system", "content": "You determine the language of a text and return only the language name."},
                {"role": "user", "content": language_prompt}
            ])
            language = language.strip().lower()
            # If the text is English or language detection failed, return the default headers
            if language in ["english", "en", ""]:
                return headers
            # Otherwise, ask the AI to translate the headers into the detected language
            translation_prompt = f"""
Translate these web research result headers to {language}:
Web Research Results
Summary
Detailed Results
URL
Snippet
Content

Return a JSON object with these keys:
web_research_results, summary, detailed_results, url, snippet, content
"""
            # Call AI for the translation
            response = await self.ai_service.call_api([
                {"role": "system", "content": "You translate headers to the specified language and return them as JSON."},
                {"role": "user", "content": translation_prompt}
            ])
            # Extract the JSON
            json_match = re.search(r'\{.*\}', response, re.DOTALL)
            if json_match:
                try:
                    translated_headers = json.loads(json_match.group(0))
                    return translated_headers
                except json.JSONDecodeError:
                    logger.warning("Error parsing translated headers JSON")
        except Exception as e:
            # Log the error but continue with English headers
            logger.warning(f"Error translating headers: {str(e)}")
        return headers

    def _search_web(self, query: str) -> List[Dict[str, str]]:
        """
        Conduct a web search and return the results.

        Args:
            query: The search query

        Returns:
            List of search results
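
        Each result is a dict with the keys title, url, snippet and data
        (the extracted page text), for example (illustrative values):
            {"title": "Example Domain", "url": "https://example.com",
             "snippet": "An example page", "data": "Example Domain ..."}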
"""
formatted_query = quote_plus(query)
url = f"{APP_CONFIG.get('Agent_Webcrawler_SEARCH_ENGINE', 'https://html.duckduckgo.com/html/?q=')}{formatted_query}"
search_results_soup = self._read_url(url)
if not isinstance(search_results_soup, BeautifulSoup) or not search_results_soup.select('.result'):
logger.warning(f"No search results found for: {query}")
return []
# Extract search results
results = []
# Find all result containers
result_elements = search_results_soup.select('.result')
for result in result_elements:
# Extract title
title_element = result.select_one('.result__a')
title = title_element.text.strip() if title_element else 'No title'
# Extract URL (DuckDuckGo uses redirects)
url_element = title_element.get('href') if title_element else ''
extracted_url = 'No URL'
if url_element:
# Extract actual URL from DuckDuckGo's redirect
if url_element.startswith('/d.js?q='):
start = url_element.find('?q=') + 3
end = url_element.find('&', start) if '&' in url_element[start:] else None
extracted_url = unquote(url_element[start:end])
# Ensure URL has correct protocol prefix
if not extracted_url.startswith(('http://', 'https://')):
if not extracted_url.startswith('//'):
extracted_url = 'https://' + extracted_url
else:
extracted_url = 'https:' + extracted_url
else:
extracted_url = url_element
# Extract snippet directly from search results page
snippet_element = result.select_one('.result__snippet')
snippet = snippet_element.text.strip() if snippet_element else 'No description'
# Get actual page content for the data field
target_page_soup = self._read_url(extracted_url)
# Use new content extraction method to limit content size
content = self._extract_main_content(target_page_soup)
results.append({
'title': title,
'url': extracted_url,
'snippet': snippet,
'data': content
})
# Limit number of results if needed
if len(results) >= self.max_result:
break
return results

    def _read_url(self, url: str) -> BeautifulSoup:
        """
        Read a URL and return a BeautifulSoup parser for the content.

        Args:
            url: The URL to read

        Returns:
            BeautifulSoup object with the content, or an empty one on errors
        """
        headers = {
            'User-Agent': APP_CONFIG.get("Agent_Webcrawler_USER_AGENT", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"),
            'Accept': 'text/html,application/xhtml+xml,application/xml',
            'Accept-Language': 'en-US,en;q=0.9',
        }
        try:
            # Initial request
            response = requests.get(url, headers=headers, timeout=self.timeout)
            # Handling for status 202 (accepted but not yet processed):
            # retry up to four times with increasing backoff
            if response.status_code == 202:
                backoff_times = [0.5, 1.0, 2.0, 5.0]
                for wait_time in backoff_times:
                    time.sleep(wait_time)  # Wait with increasing time
                    response = requests.get(url, headers=headers, timeout=self.timeout)
                    # If the status is no longer 202, stop retrying
                    if response.status_code != 202:
                        break
            # Raise for other error status codes
            response.raise_for_status()
            # Parse the HTML
            return BeautifulSoup(response.text, 'html.parser')
        except Exception as e:
            logger.error(f"Error reading URL {url}: {str(e)}")
            # Return an empty BeautifulSoup object
            return BeautifulSoup("<html><body></body></html>", 'html.parser')

    def _extract_title(self, soup: BeautifulSoup, url: str) -> str:
        """
        Extract the title from a webpage.

        Args:
            soup: BeautifulSoup object of the webpage
            url: URL of the webpage

        Returns:
            Extracted title
        """
        if not isinstance(soup, BeautifulSoup):
            return f"Error with {url}"
        # Extract the title from the title tag
        title_tag = soup.find('title')
        title = title_tag.text.strip() if title_tag else "No title"
        # Alternative: fall back to an h1 tag if the title tag is missing
        if title == "No title":
            h1_tag = soup.find('h1')
            if h1_tag:
                title = h1_tag.text.strip()
        return title

    def _extract_main_content(self, soup: BeautifulSoup, max_chars: int = 10000) -> str:
        """
        Extract the main content from an HTML page.

        Args:
            soup: BeautifulSoup object of the webpage
            max_chars: Maximum number of characters

        Returns:
            Extracted main content as a string
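
        Example (doctest-style illustration, assuming default configuration):
            >>> html = "<html><body><main><p>Hello</p></main><footer>x</footer></body></html>"
            >>> AgentWebcrawler()._extract_main_content(BeautifulSoup(html, "html.parser"))
            'Hello'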
"""
if not isinstance(soup, BeautifulSoup):
return str(soup)[:max_chars] if soup else ""
# Try to find main content elements in priority order
main_content = None
for selector in ['main', 'article', '#content', '.content', '#main', '.main']:
content = soup.select_one(selector)
if content:
main_content = content
break
# If no main content found, use the body
if not main_content:
main_content = soup.find('body') or soup
# Remove script, style, nav, footer elements that don't contribute to main content
for element in main_content.select('script, style, nav, footer, header, aside, .sidebar, #sidebar, .comments, #comments, .advertisement, .ads, iframe'):
element.extract()
# Extract text content
text_content = main_content.get_text(separator=' ', strip=True)
# Limit to max_chars
return text_content[:max_chars]

    def _parse_result(self, soup: BeautifulSoup, title: str, url: str) -> Dict[str, str]:
        """
        Parse a BeautifulSoup object into a result dictionary.

        Args:
            soup: BeautifulSoup object of the webpage
            title: Page title
            url: Page URL

        Returns:
            Dictionary with result data
        """
        # Extract the content
        content = self._extract_main_content(soup)
        result = {
            'title': title,
            'url': url,
            'snippet': 'No description',  # Default value
            'data': content
        }
        return result

    def _limit_text(self, text: str, max_chars: int = 10000) -> str:
        """
        Limit text to a maximum number of characters.

        Args:
            text: Input text
            max_chars: Maximum number of characters

        Returns:
            Limited text
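
        Example (doctest-style illustration, assuming default configuration):
            >>> AgentWebcrawler()._limit_text("abcdef", max_chars=3)
            'abc... [Content truncated due to length]'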
"""
if not text:
return ""
# If text is already under the limit, return unchanged
if len(text) <= max_chars:
return text
# Otherwise limit text to max_chars
return text[:max_chars] + "... [Content truncated due to length]"


# Factory function for the Webcrawler agent
def get_webcrawler_agent():
    """
    Factory function that returns an instance of the Webcrawler agent.

    Returns:
        An instance of the Webcrawler agent
    """
    return AgentWebcrawler()
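

# Minimal usage sketch (illustrative only; MockAIService and the task values
# below are hypothetical stand-ins for the gateway's real AI service, which
# is assumed to expose an async call_api(messages) coroutine as used above).
if __name__ == "__main__":
    import asyncio

    class MockAIService:
        async def call_api(self, messages):
            # Always approve web research; a real service would vary by prompt
            return "YES"

    async def demo():
        agent = get_webcrawler_agent()
        agent.set_dependencies(ai_service=MockAIService())
        result = await agent.process_task({
            "task_id": "demo-1",
            "prompt": "Search the web for background on https://example.com",
            "input_documents": [],
            "output_specifications": [{"label": "results.md", "description": "Markdown summary"}],
            "context": {},
        })
        print(result["feedback"])
        for doc in result["documents"]:
            print(doc["label"], len(doc["content"]), "characters")

    asyncio.run(demo())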