"""
|
|
Webcrawler agent for research and retrieval of information from the web.
|
|
Optimized for the new task-based processing.
|
|
"""
|
|
|
|
import logging
|
|
import json
|
|
import re
|
|
import time
|
|
from typing import Dict, Any, List
|
|
from urllib.parse import quote_plus, unquote
|
|
|
|
from bs4 import BeautifulSoup
|
|
import requests
|
|
import markdown
|
|
|
|
from modules.chat_registry import AgentBase
|
|
from modules.configuration import APP_CONFIG
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
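
# Configuration keys read from APP_CONFIG (defaults are given inline below):
#   Agent_Webcrawler_MAX_URLS, Agent_Webcrawler_MAX_SEARCH_KEYWORDS,
#   Agent_Webcrawler_MAX_SEARCH_RESULTS, Agent_Webcrawler_TIMEOUT,
#   Agent_Webcrawler_SEARCH_ENGINE, Agent_Webcrawler_USER_AGENT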


class AgentWebcrawler(AgentBase):
    """Agent for web research and information retrieval"""

    def __init__(self):
        """Initialize the webcrawler agent"""
        super().__init__()
        self.name = "webcrawler"
        self.description = "Conducts web research and collects information from online sources"
        self.capabilities = [
            "web_search",
            "information_retrieval",
            "data_collection",
            "search_results_analysis",
            "webpage_content_extraction"
        ]

        # Ensure the attribute exists even if set_dependencies() is never called
        self.ai_service = None

        # Web crawling configuration
        self.max_url = int(APP_CONFIG.get("Agent_Webcrawler_MAX_URLS", "5"))
        self.max_key = int(APP_CONFIG.get("Agent_Webcrawler_MAX_SEARCH_KEYWORDS", "3"))
        self.max_result = int(APP_CONFIG.get("Agent_Webcrawler_MAX_SEARCH_RESULTS", "5"))
        self.timeout = int(APP_CONFIG.get("Agent_Webcrawler_TIMEOUT", "30"))

    def set_dependencies(self, ai_service=None):
        """Set external dependencies for the agent."""
        self.ai_service = ai_service

    async def process_task(self, task: Dict[str, Any]) -> Dict[str, Any]:
        """
        Process a standardized task structure and conduct web research.

        Args:
            task: A dictionary containing:
                - task_id: Unique ID for this task
                - prompt: The main instruction for the agent
                - input_documents: List of documents to process
                - output_specifications: List of required output documents
                - context: Additional contextual information

        Returns:
            A dictionary containing:
                - feedback: Text response explaining the research results
                - documents: List of created document objects
        """
        try:
            # Extract relevant task information
            prompt = task.get("prompt", "")
            output_specs = task.get("output_specifications", [])

            # Check if AI service is available
            if not self.ai_service:
                logger.error("No AI service configured for the Webcrawler agent")
                return {
                    "feedback": "The Webcrawler agent is not properly configured.",
                    "documents": []
                }

            # Check if this is a web research request
            is_web_research = await self._is_web_research_request(prompt)
            if not is_web_research:
                logger.info("Request rejected: not a web research task")
                return {
                    "feedback": "This request doesn't appear to require web research.",
                    "documents": []
                }

            # Proceed with web research
            logger.info(f"Web research for: {prompt[:50]}...")

            # Create search strategy
            search_strategy = await self._create_search_strategy(prompt)
            search_keys = search_strategy.get("skey", [])
            search_urls = search_strategy.get("url", [])

            if search_keys:
                logger.info(f"Searching for {len(search_keys)} key terms: {', '.join(search_keys[:2])}...")

            if search_urls:
                logger.info(f"Searching in {len(search_urls)} direct URLs: {', '.join(search_urls[:2])}...")

            # Execute search
            results = []

            # Process search terms
            for keyword in search_keys:
                logger.info(f"Searching the web for: '{keyword}'")
                keyword_results = self._search_web(keyword)
                results.extend(keyword_results)
                logger.info(f"Found: {len(keyword_results)} results for '{keyword}'")

            # Process direct URLs
            for url in search_urls:
                logger.info(f"Extracting content from: {url}")
                soup = self._read_url(url)

                # Extract title from the page, if available
                title = self._extract_title(soup, url)

                result = self._parse_result(soup, title, url)
                results.append(result)
                logger.info(f"Extracted: '{title}' from {url}")

            # Process results for final output
            logger.info(f"Analyzing {len(results)} web results")

            # Generate summaries for each result
            processed_results = []
            for i, result in enumerate(results):
                result_data_limited = self._limit_text(result['data'], max_chars=10000)

                logger.info(f"Analyzing result {i+1}/{len(results)}: {result['title'][:30]}...")

                # No AI service available, create minimal summary
                if not self.ai_service:
                    content_summary = f"Extract from {result['url']} ({len(result_data_limited)} characters)"
                else:
                    # Generate summary with AI
                    content_summary = await self._summarize_result(result_data_limited, prompt)

                processed_result = {
                    "title": result['title'],
                    "url": result['url'],
                    "snippet": result['snippet'],
                    "summary": content_summary
                }

                processed_results.append(processed_result)

            # Create overall summary
            all_summaries = "\n\n".join([r["summary"] for r in processed_results])
            all_summaries_limited = self._limit_text(all_summaries, max_chars=10000)

            logger.info("Creating overall summary of web research")

            if not self.ai_service:
                final_summary = f"Summary of {len(processed_results)} web research results"
            else:
                final_summary = await self.ai_service.call_api([
                    {"role": "system", "content": "You create concise summaries of research results."},
                    {"role": "user", "content": f"Please summarize these findings in 5-6 sentences: {all_summaries_limited}\n"}
                ])

            # Get localized headers for output
            headers = await self._get_localized_headers(prompt)

            # Create document objects based on output specifications
            generated_documents = []

            # Generate appropriate document for each requested output
            for spec in output_specs:
                output_label = spec.get("label", "")
                output_description = spec.get("description", "")

                # Determine output format based on file extension
                format_type = self._determine_format_type(output_label)

                # Generate content based on format and requirements
                if format_type == "markdown" or format_type == "text":
                    content = self._format_results_as_markdown(processed_results, final_summary, headers)
                elif format_type == "html":
                    md_content = self._format_results_as_markdown(processed_results, final_summary, headers)
                    content = markdown.markdown(md_content)
                elif format_type == "json":
                    content = json.dumps({
                        "summary": final_summary,
                        "results": processed_results
                    }, indent=2, ensure_ascii=False)
                elif format_type == "csv":
                    csv_lines = ["Title,URL,Snippet"]
                    for result in processed_results:
                        # Wrap each field in double quotes and double any embedded
                        # quotes, so commas inside fields don't break the CSV
                        title = result["title"].replace('"', '""')
                        url = result["url"].replace('"', '""')
                        snippet = result["snippet"].replace('"', '""')
                        csv_line = f'"{title}","{url}","{snippet}"'
                        csv_lines.append(csv_line)
                    content = "\n".join(csv_lines)
                else:
                    # Default: Markdown
                    content = self._format_results_as_markdown(processed_results, final_summary, headers)

                # Add document to results list
                generated_documents.append({
                    "label": output_label,
                    "content": content
                })

            # If no specific outputs were requested, return a standard document
            if not output_specs:
                content = self._format_results_as_markdown(processed_results, final_summary, headers)
                generated_documents.append({
                    "label": "web_research_results.md",
                    "content": content
                })

            # Create feedback for response
            feedback = f"I conducted web research on '{prompt[:50]}...' and found {len(processed_results)} relevant results."

            logger.info("Web research completed successfully")

            return {
                "feedback": feedback,
                "documents": generated_documents
            }

        except Exception as e:
            error_msg = f"Error during web research: {str(e)}"
            logger.error(error_msg)
            return {
                "feedback": f"An error occurred during the web research: {str(e)}",
                "documents": []
            }

    def _determine_format_type(self, output_label: str) -> str:
        """
        Determine the format type based on the filename.

        Args:
            output_label: Output filename

        Returns:
            Format type (markdown, html, text, json, csv)
        """
        output_label_lower = output_label.lower()

        if output_label_lower.endswith(".md"):
            return "markdown"
        elif output_label_lower.endswith(".html"):
            return "html"
        elif output_label_lower.endswith(".txt"):
            return "text"
        elif output_label_lower.endswith(".json"):
            return "json"
        elif output_label_lower.endswith(".csv"):
            return "csv"
        else:
            # Default to markdown
            return "markdown"

    def _format_results_as_markdown(self, results: List[Dict[str, Any]],
                                    summary: str, headers: Dict[str, str]) -> str:
        """
        Format research results as markdown.

        Args:
            results: List of results
            summary: Summary of all results
            headers: Localized headers

        Returns:
            Formatted markdown text
        """
        md_content = f"# {headers['web_research_results']}\n\n"

        md_content += f"## {headers['summary']}\n\n{summary}\n\n"

        if results:
            md_content += f"## {headers['detailed_results']}\n\n"

            for i, result in enumerate(results, 1):
                md_content += f"### {i}. {result['title']}\n\n"
                md_content += f"**{headers['url']}**: {result['url']}\n\n"
                md_content += f"**{headers['snippet']}**: {result['snippet']}\n\n"
                md_content += f"**{headers['content']}**: {result['summary']}\n\n"

                # Add separator between results (except for the last one)
                if i < len(results):
                    md_content += "---\n\n"

        return md_content

    async def _is_web_research_request(self, prompt: str) -> bool:
        """
        Use AI to determine if a request requires web research.

        Args:
            prompt: The user request

        Returns:
            True if it is explicitly a web research request, False otherwise
        """
        if not self.ai_service:
            # Fall back to simpler detection if no AI service is available
            return self._simple_web_detection(prompt)

        try:
            # Create a prompt to analyze if this is a web research request
            analysis_prompt = f"""
            Analyze the following request and determine if it explicitly requires web research or online information.

            REQUEST: {prompt}

            A request requires web research if:
            1. It explicitly asks for searching information online
            2. It contains URLs or references to websites
            3. It requests current information that would be available on the web
            4. It asks for information from web sources
            5. It implicitly requires current information from the internet

            Reply ONLY with a single word - either "YES" if web research is required, or "NO" if not.
            """

            # Call AI for analysis
            response = await self.ai_service.call_api([
                {"role": "system", "content": "You determine if a request requires web research. Always respond with just YES or NO."},
                {"role": "user", "content": analysis_prompt}
            ])

            # Clean response and check
            response = response.strip().upper()

            return "YES" in response

        except Exception as e:
            # Log the error but don't fail; fall back to simpler detection
            logger.warning(f"Error in AI detection of web research requests: {str(e)}")
            return self._simple_web_detection(prompt)

    def _simple_web_detection(self, prompt: str) -> bool:
        """
        Simpler fallback method for detecting web research requests based on
        URL patterns and common web-search terms.

        Args:
            prompt: The user request

        Returns:
            True if there are clear URL or web-search indicators, False otherwise
        """
        # URLs in the request strongly indicate web research
        url_indicators = ["http://", "https://", "www.", ".com", ".org", ".net", ".edu", ".gov"]
        web_terms = ["search", "find online", "look up", "web", "internet", "website"]

        # Check for URL patterns and web-search terms in the request
        contains_url = any(indicator in prompt.lower() for indicator in url_indicators)
        contains_web_term = any(term in prompt.lower() for term in web_terms)

        return contains_url or contains_web_term
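
    # For reference, _create_search_strategy below returns a dict of the shape
    #   {"skey": ["key phrase", ...], "url": ["https://...", ...]}
    # (the keys come from the code; the example values are illustrative).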

    async def _create_search_strategy(self, prompt: str) -> Dict[str, List[str]]:
        """
        Create a search strategy based on the request.

        Args:
            prompt: The user request

        Returns:
            Search strategy with URLs and search terms
        """
        if not self.ai_service:
            # Fall back to a simple strategy
            return {"skey": [prompt], "url": []}

        try:
            # AI prompt to create a search strategy
            strategy_prompt = f"""Create a comprehensive web research strategy for the following task:
            '{prompt.replace("'", "")}'

            Return the results as a Python dictionary with these specific keys:

            'url': A list of up to {self.max_url} specific URLs extracted from the task.

            'skey': A list of up to {self.max_key} key phrases to search for on the web. These should be precise, diverse, and targeted to get the most relevant information.

            If specific URLs are given and the task only requires analyzing these URLs, leave 'skey' empty.

            Format your response as a valid JSON object with these two keys. Don't add any explanatory text.
            """

            # Call AI for the search strategy
            content_text = await self.ai_service.call_api([
                {"role": "system", "content": "You are a web research expert who develops precise search strategies."},
                {"role": "user", "content": strategy_prompt}
            ])

            # Remove JSON code block markers if present; requiring end_index > 0
            # guards against matching the opening fence when no closing fence exists
            if content_text.startswith("```json"):
                end_index = content_text.rfind("```")
                if end_index > 0:
                    content_text = content_text[7:end_index].strip()
            elif content_text.startswith("```"):
                end_index = content_text.rfind("```")
                if end_index > 0:
                    content_text = content_text[3:end_index].strip()

            # Extract only the JSON part (if surrounded by text)
            json_match = re.search(r'(\{.*\})', content_text, re.DOTALL)
            if json_match:
                content_text = json_match.group(1)

            # Parse JSON and return
            strategy = json.loads(content_text)
            return strategy

        except Exception as e:
            logger.error(f"Error creating search strategy: {str(e)}")
            # Simple fallback strategy
            return {"skey": [prompt], "url": []}

    async def _summarize_result(self, result_data: str, original_prompt: str) -> str:
        """
        Create a summary of a search result using AI.

        Args:
            result_data: The data to summarize
            original_prompt: The original request

        Returns:
            Summary of the result
        """
        if not self.ai_service:
            return f"Summary not available (no AI service configured); raw content: {len(result_data)} characters"

        try:
            # Instructions for summarization
            summary_prompt = f"""
            Summarize this search result according to the original request in about 2000 characters.

            Original request = '{original_prompt.replace("'", "")}'

            Focus on the most important findings and connect them to the original request.
            Extract only relevant and high-quality information.

            Here's the search result:
            {result_data}
            """

            # Call AI for the summary
            summary = await self.ai_service.call_api([
                {"role": "system", "content": "You are an information analyst who summarizes web content precisely and relevantly."},
                {"role": "user", "content": summary_prompt}
            ])

            # Limit to ~2000 characters
            return summary[:2000]

        except Exception as e:
            logger.error(f"Error summarizing result: {str(e)}")
            return "Error creating summary"

    async def _get_localized_headers(self, text: str) -> Dict[str, str]:
        """
        Determine localized headers for web research results based on the detected language.

        Args:
            text: Text for language detection

        Returns:
            Dictionary with localized headers
        """
        # Default English headers
        headers = {
            "web_research_results": "Web Research Results",
            "summary": "Summary",
            "detailed_results": "Detailed Results",
            "url": "URL",
            "snippet": "Snippet",
            "content": "Content"
        }

        if not self.ai_service:
            return headers

        try:
            # Detect language
            language_prompt = f"What language is this text written in? Answer with just the language name: {text[:200]}"
            language = await self.ai_service.call_api([
                {"role": "system", "content": "You determine the language of a text and return only the language name."},
                {"role": "user", "content": language_prompt}
            ])

            language = language.strip().lower()

            # English, or language detection failed: return the default headers
            if language in ["english", "en", ""]:
                return headers

            # Ask the AI to translate the headers into the detected language
            translation_prompt = f"""
            Translate these web research result headers to {language}:

            Web Research Results
            Summary
            Detailed Results
            URL
            Snippet
            Content

            Return a JSON object with these keys:
            web_research_results, summary, detailed_results, url, snippet, content
            """

            # Call AI for translation
            response = await self.ai_service.call_api([
                {"role": "system", "content": "You translate headers to the specified language and return them as JSON."},
                {"role": "user", "content": translation_prompt}
            ])

            # Extract JSON
            json_match = re.search(r'\{.*\}', response, re.DOTALL)

            if json_match:
                try:
                    translated_headers = json.loads(json_match.group(0))
                    return translated_headers
                except json.JSONDecodeError:
                    logger.warning("Error parsing translated headers JSON")

        except Exception as e:
            # Log the error but continue with English headers
            logger.warning(f"Error translating headers: {str(e)}")

        return headers

    def _search_web(self, query: str) -> List[Dict[str, str]]:
        """
        Conduct a web search and return the results.

        Args:
            query: The search query

        Returns:
            List of search results
        """
        formatted_query = quote_plus(query)
        url = f"{APP_CONFIG.get('Agent_Webcrawler_SEARCH_ENGINE', 'https://html.duckduckgo.com/html/?q=')}{formatted_query}"

        search_results_soup = self._read_url(url)
        if not isinstance(search_results_soup, BeautifulSoup) or not search_results_soup.select('.result'):
            logger.warning(f"No search results found for: {query}")
            return []

        # Extract search results
        results = []

        # Find all result containers
        result_elements = search_results_soup.select('.result')

        for result in result_elements:
            # Extract title
            title_element = result.select_one('.result__a')
            title = title_element.text.strip() if title_element else 'No title'

            # Extract URL (DuckDuckGo uses redirects)
            url_element = title_element.get('href') if title_element else ''
            extracted_url = 'No URL'

            if url_element:
                # Extract the actual URL from DuckDuckGo's redirect
                if url_element.startswith('/d.js?q='):
                    start = url_element.find('?q=') + 3
                    end = url_element.find('&', start) if '&' in url_element[start:] else None
                    extracted_url = unquote(url_element[start:end])

                    # Ensure the URL has a correct protocol prefix
                    if not extracted_url.startswith(('http://', 'https://')):
                        if not extracted_url.startswith('//'):
                            extracted_url = 'https://' + extracted_url
                        else:
                            extracted_url = 'https:' + extracted_url
                else:
                    extracted_url = url_element

            # Extract snippet directly from the search results page
            snippet_element = result.select_one('.result__snippet')
            snippet = snippet_element.text.strip() if snippet_element else 'No description'

            # Get the actual page content for the data field
            target_page_soup = self._read_url(extracted_url)

            # Use the content extraction method to limit content size
            content = self._extract_main_content(target_page_soup)

            results.append({
                'title': title,
                'url': extracted_url,
                'snippet': snippet,
                'data': content
            })

            # Limit the number of results if needed
            if len(results) >= self.max_result:
                break

        return results

    def _read_url(self, url: str) -> BeautifulSoup:
        """
        Read a URL and return a BeautifulSoup parser for the content.

        Args:
            url: The URL to read

        Returns:
            BeautifulSoup object with the content, or an empty document on errors
        """
        headers = {
            'User-Agent': APP_CONFIG.get("Agent_Webcrawler_USER_AGENT", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"),
            'Accept': 'text/html,application/xhtml+xml,application/xml',
            'Accept-Language': 'en-US,en;q=0.9',
        }

        try:
            # Initial request
            response = requests.get(url, headers=headers, timeout=self.timeout)

            # Handling for status 202 (accepted, but the response is not ready yet)
            if response.status_code == 202:
                # Up to four retries with increasing backoff intervals
                backoff_times = [0.5, 1.0, 2.0, 5.0]

                for wait_time in backoff_times:
                    time.sleep(wait_time)  # Wait with increasing time
                    response = requests.get(url, headers=headers, timeout=self.timeout)

                    # Stop retrying once the server no longer answers 202
                    if response.status_code != 202:
                        break

            # Raise for other error status codes
            response.raise_for_status()

            # Parse HTML
            return BeautifulSoup(response.text, 'html.parser')

        except Exception as e:
            logger.error(f"Error reading URL {url}: {str(e)}")
            # Return an empty BeautifulSoup object
            return BeautifulSoup("<html><body></body></html>", 'html.parser')

    def _extract_title(self, soup: BeautifulSoup, url: str) -> str:
        """
        Extract the title from a webpage.

        Args:
            soup: BeautifulSoup object of the webpage
            url: URL of the webpage

        Returns:
            Extracted title
        """
        if not isinstance(soup, BeautifulSoup):
            return f"Error with {url}"

        # Extract title from the title tag
        title_tag = soup.find('title')
        title = title_tag.text.strip() if title_tag else "No title"

        # Alternative: also look for h1 tags if the title tag is missing
        if title == "No title":
            h1_tag = soup.find('h1')
            if h1_tag:
                title = h1_tag.text.strip()

        return title

    def _extract_main_content(self, soup: BeautifulSoup, max_chars: int = 10000) -> str:
        """
        Extract the main content from an HTML page.

        Args:
            soup: BeautifulSoup object of the webpage
            max_chars: Maximum number of characters

        Returns:
            Extracted main content as a string
        """
        if not isinstance(soup, BeautifulSoup):
            return str(soup)[:max_chars] if soup else ""

        # Try to find main content elements in priority order
        main_content = None
        for selector in ['main', 'article', '#content', '.content', '#main', '.main']:
            content = soup.select_one(selector)
            if content:
                main_content = content
                break

        # If no main content found, use the body
        if not main_content:
            main_content = soup.find('body') or soup

        # Remove script, style, nav, footer elements that don't contribute to main content
        for element in main_content.select('script, style, nav, footer, header, aside, .sidebar, #sidebar, .comments, #comments, .advertisement, .ads, iframe'):
            element.extract()

        # Extract text content
        text_content = main_content.get_text(separator=' ', strip=True)

        # Limit to max_chars
        return text_content[:max_chars]

    def _parse_result(self, soup: BeautifulSoup, title: str, url: str) -> Dict[str, str]:
        """
        Parse a BeautifulSoup object into a result dictionary.

        Args:
            soup: BeautifulSoup object of the webpage
            title: Page title
            url: Page URL

        Returns:
            Dictionary with result data
        """
        # Extract content
        content = self._extract_main_content(soup)

        result = {
            'title': title,
            'url': url,
            'snippet': 'No description',  # Default value
            'data': content
        }
        return result

    def _limit_text(self, text: str, max_chars: int = 10000) -> str:
        """
        Limit text to a maximum number of characters.

        Args:
            text: Input text
            max_chars: Maximum number of characters

        Returns:
            Limited text
        """
        if not text:
            return ""

        # If text is already under the limit, return it unchanged
        if len(text) <= max_chars:
            return text

        # Otherwise limit text to max_chars
        return text[:max_chars] + "... [Content truncated due to length]"


# Factory function for the Webcrawler agent
def get_webcrawler_agent():
    """
    Factory function that returns an instance of the Webcrawler agent.

    Returns:
        An instance of the Webcrawler agent
    """
    return AgentWebcrawler()
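

if __name__ == "__main__":
    # Minimal smoke test (a sketch; the task values below are illustrative).
    # Real use requires set_dependencies() with an AI service that exposes an
    # async call_api(messages) method; with ai_service=None the agent simply
    # reports that it is not properly configured and returns no documents.
    import asyncio

    agent = get_webcrawler_agent()
    agent.set_dependencies(ai_service=None)
    demo_task = {
        "task_id": "demo-1",
        "prompt": "Research https://example.com and summarize what you find",
        "input_documents": [],
        "output_specifications": [{"label": "results.md", "description": "Markdown summary"}],
        "context": {},
    }
    print(asyncio.run(agent.process_task(demo_task)))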