# gateway/modules/agentservice_agent_webcrawler.py
"""
WebCrawler-Agent for research and retrieval of information from the web.
Adapted for the refactored Core-Module with language-agnostic detection.
"""
import json
import logging
import time
import traceback
from typing import List, Dict, Any, Optional
from urllib.parse import quote_plus, unquote
from bs4 import BeautifulSoup
import requests
from modules.agentservice_base import BaseAgent
from modules.agentservice_utils import MessageUtils, LoggingUtils
from modules.agentservice_protocol import AgentCommunicationProtocol
from modules.utility import APP_CONFIG
logger = logging.getLogger(__name__)
class WebcrawlerAgent(BaseAgent):
"""Agent for Web Research and Information Retrieval"""
def __init__(self):
"""Initialize the WebCrawler Agent"""
super().__init__()
self.id = "webcrawler"
self.name = "Webscraper"
self.type = "scraper"
self.description = "Researches information on the web"
self.capabilities = "web_search,information_retrieval,data_collection,source_verification,content_integration"
self.result_format = "SearchResults"
# Add enhanced document capabilities
self.supports_documents = True
self.document_capabilities = ["read", "create"]
self.required_context = ["workflow_id"]
self.document_handler = None
# Initialize AI service
self.ai_service = None
# Initialize protocol
self.protocol = AgentCommunicationProtocol()
# Initialize utility classes
self.message_utils = MessageUtils()
# Web-Crawling configuration
self.max_url = int(APP_CONFIG.get("Connector_AiWebscraping_MAX_URLS"))
self.max_key = int(APP_CONFIG.get("Connector_AiWebscraping_MAX_SEARCH_KEYWORDS"))
self.max_result = int(APP_CONFIG.get("Connector_AiWebscraping_MAX_SEARCH_RESULTS"))
self.timeout = int(APP_CONFIG.get("Connector_AiWebscraping_TIMEOUT"))
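# Illustrative values for the APP_CONFIG keys read above (assumed defaults, not taken from the source):
#   Connector_AiWebscraping_MAX_URLS            e.g. "3"
#   Connector_AiWebscraping_MAX_SEARCH_KEYWORDS e.g. "3"
#   Connector_AiWebscraping_MAX_SEARCH_RESULTS  e.g. "5"
#   Connector_AiWebscraping_TIMEOUT             e.g. "10"  (seconds)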
def get_agent_info(self) -> Dict[str, Any]:
"""Get agent information for agent registry"""
info = super().get_agent_info()
info.update({
"metadata": {
"max_url": self.max_url,
"max_result": self.max_result,
"timeout": self.timeout
}
})
return info
def set_document_handler(self, document_handler):
"""Set the document handler for file operations"""
self.document_handler = document_handler
async def process_message(self, message: Dict[str, Any], context: Dict[str, Any] = None) -> Dict[str, Any]:
"""
Process a message and conduct web research if appropriate.
Args:
message: The message to process
context: Additional context
Returns:
The generated response or rejection if not a web research request
"""
# Extract workflow_id from context or message
workflow_id = context.get("workflow_id") if context else message.get("workflow_id", "unknown")
# Get or create logging_utils
log_func = context.get("log_func") if context else None
logging_utils = LoggingUtils(workflow_id, log_func)
# Create response structure
response = {
"role": "assistant",
"content": "",
"agent_id": self.id,
"agent_type": self.type,
"agent_name": self.name,
"result_format": self.result_format,
"workflow_id": workflow_id
}
try:
# Get the query from the message
prompt = await self.get_prompt(message)
# Check if this is explicitly a web research request using AI
is_web_research = await self._is_web_research_request_ai(prompt)
if not is_web_research:
# Reject non-web research requests
logging_utils.info("Request rejected: not a web research task", "agents")
response["content"] = "This request doesn't appear to require web research. Redirecting to a more appropriate agent."
response["status"] = "rejected"
response["rejection_reason"] = "not_web_research"
return response
# Continue with web research process
logging_utils.info(f"Web research for: {prompt[:50]}...", "agents")
# Send status update using protocol
if log_func:
status_message = self.protocol.create_status_update_message(
status_description="Starting web research",
sender_id=self.id,
status="in_progress",
progress=0.0,
context_id=workflow_id
)
log_func(workflow_id, status_message.content, "info", self.id, self.name)
# Update progress using protocol - 10% for starting the query analysis
if log_func:
status_message = self.protocol.create_status_update_message(
status_description=f"Analyzing search strategy for: {prompt[:30]}...",
sender_id=self.id,
status="in_progress",
progress=0.1,
context_id=workflow_id
)
log_func(workflow_id, status_message.content, "info", self.id, self.name)
# Prepare the web query strategy
try:
# Log progress - 20% for query strategy preparation
if log_func:
status_message = self.protocol.create_status_update_message(
status_description="Creating search strategy",
sender_id=self.id,
status="in_progress",
progress=0.2,
context_id=workflow_id
)
log_func(workflow_id, status_message.content, "info", self.id, self.name)
# Get the query strategy
content_text = await self.ai_service.call_api(
messages=[
{
"role": "system",
"content": "You are a web research expert who develops precise search strategies."
},
{
"role": "user",
"content": f"""Create a comprehensive web research strategy for the task = '{prompt.replace("'","")}'. Return the results as a Python dictionary with these specific keys. If specific url are provided and the task requires analysis only on the provided url, then leave 'skey' open.
'url': A list of maximum {self.max_url} specific URLs extracted from the task string.
'skey': A list of maximum {self.max_key} key sentences to search for on the web. These should be precise, diverse, and targeted to get the most relevant information.
Format your response as a valid json object with these two keys. Do not include any explanatory text or markdown outside of the object definition.
"""
}
]
)
# Try to parse the JSON result
if content_text.startswith("```json"):
# Find the closing fence of the JSON block (search after the opening ```json marker)
end_marker = "```"
end_index = content_text.rfind(end_marker, 7)
if end_index != -1:
# Extract the JSON content without the markdown markers
content_text = content_text[7:end_index].strip()
try:
logger.info(f"Valid json received: {str(content_text)}")
pjson = json.loads(content_text)
# Log parsed search strategy
search_keys = pjson.get("skey", [])
search_urls = pjson.get("url", [])
if search_keys:
logging_utils.info(f"Searching for {len(search_keys)} key terms: {', '.join(search_keys[:2])}...", "agents")
if search_urls:
logging_utils.info(f"Searching in {len(search_urls)} direct URLs: {', '.join(search_urls[:2])}...", "agents")
# Log progress - 30% for starting the search
if log_func:
status_message = self.protocol.create_status_update_message(
status_description="Starting web search",
sender_id=self.id,
status="in_progress",
progress=0.3,
context_id=workflow_id
)
log_func(workflow_id, status_message.content, "info", self.id, self.name)
# Execute the search
results = []
total_tasks = len(search_keys) + len(search_urls)
tasks_completed = 0
# Process search keywords
for keyword in search_keys:
logging_utils.info(f"Searching web for: '{keyword}'", "agents")
# Log specific keyword search progress
if log_func:
progress_pct = 0.3 + (0.5 * (tasks_completed / total_tasks))
status_message = self.protocol.create_status_update_message(
status_description=f"Searching for: '{keyword}'",
sender_id=self.id,
status="in_progress",
progress=progress_pct,
context_id=workflow_id
)
log_func(workflow_id, status_message.content, "info", self.id, self.name)
keyword_results = self.search_web(keyword)
results.extend(keyword_results)
logging_utils.info(f"Found: {len(keyword_results)} results for '{keyword}'", "agents")
tasks_completed += 1
# Process direct URLs
for url in search_urls:
logging_utils.info(f"Extracting content from: {url}", "agents")
# Log specific URL extraction progress
if log_func:
progress_pct = 0.3 + (0.5 * (tasks_completed / total_tasks))
status_message = self.protocol.create_status_update_message(
status_description=f"Reading URL: {url}",
sender_id=self.id,
status="in_progress",
progress=progress_pct,
context_id=workflow_id
)
log_func(workflow_id, status_message.content, "info", self.id, self.name)
soup = self.read_url(url)
# Extract title from the page if it exists
if isinstance(soup, BeautifulSoup):
title_tag = soup.find('title')
title = title_tag.text.strip() if title_tag else "No title"
# Alternative: You could also look for h1 tags if the title tag is missing
if title == "No title":
h1_tag = soup.find('h1')
if h1_tag:
title = h1_tag.text.strip()
else:
# Handle the case where soup is an error message string
title = "Error fetching page"
result = self.parse_result(soup, title, url)
results.append(result)
logging_utils.info(f"Extracted: '{title}' from {url}", "agents")
tasks_completed += 1
# Log progress - 80% for processing results
if log_func:
status_message = self.protocol.create_status_update_message(
status_description=f"Analyzing {len(results)} search results",
sender_id=self.id,
status="in_progress",
progress=0.8,
context_id=workflow_id
)
log_func(workflow_id, status_message.content, "info", self.id, self.name)
# Process results for the final output
logging_utils.info(f"Analyzing {len(results)} web results", "agents")
# Generate summaries for each result
processed_results = []
for i, result in enumerate(results):
result_data_limited = self.limit_text_for_api(result['data'], max_tokens=int(APP_CONFIG.get("Connector_AiWebscraping_MAX_TOKENS")))
# Log individual result processing
logging_utils.info(f"Analyzing result {i+1}/{len(results)}: {result['title'][:30]}...", "agents")
web_answer_instructions = f"""
Summarize this search result according to the original request in approximately 2000 characters. Original request = '{prompt.replace("'","")}'
Focus on the most important insights and connect them to the original request. You can skip any introduction.
Extract only relevant and high-quality information related to the request, and present it in a clear format. Provide a balanced view of the researched information.
Here is the search result:
{result_data_limited}
"""
content_summary = await self.ai_service.call_api(
messages=[
{
"role": "system",
"content": "You are an information analyst who precisely and relevantly summarizes web content."
},
{
"role": "user",
"content": web_answer_instructions
}
]
)
# Limit summary to ~2000 characters
content_summary = content_summary[:2000]
processed_result = {
"title": result['title'],
"url": result['url'],
"snippet": result['snippet'],
"summary": content_summary
}
processed_results.append(processed_result)
# Log progress - 90% for creating final summary
if log_func:
status_message = self.protocol.create_status_update_message(
status_description="Creating overall summary",
sender_id=self.id,
status="in_progress",
progress=0.9,
context_id=workflow_id
)
log_func(workflow_id, status_message.content, "info", self.id, self.name)
# Create the final combined summary
all_summaries = "\n\n".join([r["summary"] for r in processed_results])
all_summaries_limited = self.limit_text_for_api(all_summaries, max_tokens=int(APP_CONFIG.get("Connector_AiWebscraping_MAX_TOKENS")))
logging_utils.info("Creating overall summary of web research", "agents")
final_summary = await self.ai_service.call_api(
messages=[
{
"role": "system",
"content": "You create concise summaries of research findings."
},
{
"role": "user",
"content": f"Please summarize these findings in 5-6 sentences: {all_summaries_limited}\n"
}
]
)
# Get the language of the request to use for result headers
request_language_analysis = await self.ai_service.call_api(
messages=[
{
"role": "system",
"content": "You determine the language of a text and return only the language name."
},
{
"role": "user",
"content": f"What language is this text in? Only respond with the language name: {prompt}"
}
]
)
# Get headers in the right language
headers = await self._get_localized_headers(request_language_analysis.strip())
# Format the final result
final_result = f"## {headers['web_research_results']}\n\n### {headers['summary']}\n{final_summary}\n\n### {headers['detailed_results']}\n"
for i, result in enumerate(processed_results, 1):
final_result += f"\n\n[{i}] {result['title']}\n{headers['url']}: {result['url']}\n{headers['snippet']}: {result['snippet']}\n{headers['content']}: {result['summary']}"
# Set the content in the response
response["content"] = final_result
# Log completion - 100% progress
if log_func:
status_message = self.protocol.create_status_update_message(
status_description="Web research completed",
sender_id=self.id,
status="completed",
progress=1.0,
context_id=workflow_id
)
log_func(workflow_id, status_message.content, "info", self.id, self.name)
logging_utils.info("Web research successfully completed", "agents")
return response
except json.JSONDecodeError as e:
logging_utils.error(f"Error parsing JSON data: {e}", "error")
# Fallback for JSON parse error
if log_func:
status_message = self.protocol.create_status_update_message(
status_description=f"Error parsing search strategy: {str(e)}",
sender_id=self.id,
status="error",
progress=0.0,
context_id=workflow_id
)
log_func(workflow_id, status_message.content, "error", self.id, self.name)
# Use a simple fallback approach
logging_utils.info("Using fallback search strategy with direct query", "agents")
# Perform a direct search with the original query
results = self.search_web(prompt)
# Process and format results directly
if results:
result_text = "## Web Research Results (Fallback Mode)\n\n"
for i, result in enumerate(results, 1):
result_text += f"### [{i}] {result['title']}\n"
result_text += f"URL: {result['url']}\n"
result_text += f"Snippet: {result['snippet']}\n\n"
response["content"] = result_text
else:
response["content"] = "## Web Research Results\n\nNo relevant results were found."
return response
except Exception as e:
error_msg = f"Error during web research: {str(e)}"
logging_utils.error(error_msg, "error")
# Create error response using protocol
error_message = self.protocol.create_error_message(
error_description=error_msg,
sender_id=self.id,
error_type="web_search",
error_details={"traceback": traceback.format_exc()},
context_id=workflow_id
)
# Log error status
if log_func:
status_message = self.protocol.create_status_update_message(
status_description=f"Error during web research: {str(e)}",
sender_id=self.id,
status="error",
progress=1.0,
context_id=workflow_id
)
log_func(workflow_id, status_message.content, "error", self.id, self.name)
response["content"] = f"## Error during web research\n\n{error_msg}\n\n```\n{traceback.format_exc()}\n```"
return response
except Exception as e:
error_msg = f"Error during web research: {str(e)}"
logging_utils.error(error_msg, "error")
# Create error response using protocol
error_message = self.protocol.create_error_message(
error_description=error_msg,
sender_id=self.id,
error_type="web_search",
error_details={"traceback": traceback.format_exc()},
context_id=workflow_id
)
# Log error status
if log_func:
status_message = self.protocol.create_status_update_message(
status_description=f"Error during web research: {str(e)}",
sender_id=self.id,
status="error",
progress=1.0,
context_id=workflow_id
)
log_func(workflow_id, status_message.content, "error", self.id, self.name)
response["content"] = f"## Error during web research\n\n{error_msg}\n\n```\n{traceback.format_exc()}\n```"
return response
async def _is_web_research_request_ai(self, prompt: str) -> bool:
"""
Uses AI to determine if a prompt requires web research, making it language-agnostic.
Args:
prompt: The user prompt
Returns:
True if this is explicitly a web research request, False otherwise
"""
if not self.ai_service:
# Fallback to simpler detection if AI service isn't available
return self._simple_web_detection(prompt)
try:
# Create a prompt to analyze whether this is a web research request
analysis_prompt = f"""
Analyze the following request and determine if it explicitly requires web research or online information.
REQUEST: {prompt}
A request requires web research if:
1. It explicitly asks to search for information online
2. It contains URLs or references to websites
3. It requests current information that would be available on the web
4. It asks to find information from web sources
5. It implicitly requires up-to-date information from the internet
ONLY respond with a single word - either "YES" if web research is required, or "NO" if it is not.
DO NOT include any explanation, just the answer YES or NO.
"""
# Call AI to analyze
response = await self.ai_service.call_api(
messages=[
{
"role": "system",
"content": "You determine if a request requires web research. Always answer with only YES or NO."
},
{
"role": "user",
"content": analysis_prompt
}
]
)
# Clean the response
response = response.strip().upper()
# Check if the response indicates it's a web research task
if "YES" in response:
return True
else:
return False
except Exception as e:
# Log error but don't fail, fall back to simpler detection
logger.warning(f"Error using AI to detect web research request: {str(e)}")
return self._simple_web_detection(prompt)
def _simple_web_detection(self, prompt: str) -> bool:
"""
Simpler fallback method to detect web research requests based on URLs.
Args:
prompt: The user prompt
Returns:
True if there are clear URL indicators, False otherwise
"""
# URLs in the prompt strongly indicate web research
url_indicators = ["http://", "https://", "www.", ".com", ".org", ".net", ".edu", ".gov"]
# Check for URL patterns in the prompt
contains_url = any(indicator in prompt.lower() for indicator in url_indicators)
return contains_url
async def _get_localized_headers(self, language: str) -> Dict[str, str]:
"""
Get localized headers for the web research results based on detected language.
Args:
language: The detected language
Returns:
Dictionary with localized headers
"""
# Default English headers
headers = {
"web_research_results": "Web Research Results",
"summary": "Summary",
"detailed_results": "Detailed Results",
"url": "URL",
"snippet": "Snippet",
"content": "Content"
}
# If language detection failed or is English, return defaults
if not language or language.lower() in ["english", "en"]:
return headers
try:
# Use AI to translate headers to the detected language
translation_prompt = f"""
Translate these web research result headers to {language}:
Web Research Results
Summary
Detailed Results
URL
Snippet
Content
Return a JSON object with these keys:
web_research_results, summary, detailed_results, url, snippet, content
"""
# Call AI for translation
response = await self.ai_service.call_api(
messages=[
{
"role": "system",
"content": "You translate headers to the specified language and return them as JSON."
},
{
"role": "user",
"content": translation_prompt
}
]
)
# Extract JSON
import re
json_match = re.search(r'\{.*\}', response, re.DOTALL)
if json_match:
translated_headers = json.loads(json_match.group(0))
return translated_headers
except Exception as e:
# Log error but continue with English headers
logger.warning(f"Error translating headers to {language}: {str(e)}")
return headers
async def get_prompt(self, message_context: Dict[str, Any]) -> str:
task = message_context.get("content", "")
return task.strip()
async def get_web_query(self, message_context: Dict[str, Any]) -> str:
prompt = await self.get_prompt(message_context)
result_json = await self.run_web_query(prompt)
result_data = ""
summary_src = ""
logger.info(f"Web analysis prompt '{prompt}' delivers {len(result_json)} results.")
if isinstance(result_json, list):
total_tokens = 0
for i, result in enumerate(result_json, 1):
# Limit content size for each result
result_data_limited = self.limit_text_for_api(result['data'], max_tokens=int(APP_CONFIG.get("Connector_AiWebscraping_MAX_TOKENS")))  # Limit per-result content to the configured token budget
web_answer_instructions = f"""
Summarize this search result according to the original request in approximately 2000 characters. Original request = '{prompt.replace("'","")}'
Focus on the most important insights and connect them to the original request. You can skip any introduction.
Extract only relevant and high-quality information related to the request, and present it in a clear format. Provide a balanced view of the researched information.
Here is the search result:
{result_data_limited}
"""
# Count tokens in the instructions to ensure we don't exceed API limits
instruction_tokens = self.count_tokens(web_answer_instructions)
if total_tokens + instruction_tokens > 60000:
logger.warning(f"Skipping result {i} to avoid exceeding token limit")
break
total_tokens += instruction_tokens
# Additional instructions for web research
content_text = await self.ai_service.call_api(
messages=[
{
"role": "system",
"content": "You are an information analyst who precisely and relevantly summarizes web content."
},
{
"role": "user",
"content": web_answer_instructions
}
]
)
# Create a summary but ensure we stay within token limits
content_summary = content_text[:2000] # Limit to ~2000 characters
result_data += f"\n\n[{i}] {result['title']}\nURL: {result['url']}\nSnippet: {result['snippet']}\nContent: {content_summary}"
summary_src += f"\n{content_summary}"
# Update token count
total_tokens += self.count_tokens(content_summary) + 100 # Add buffer for formatting
else:
result_data = "no data received"
logger.info(f"Web analysis result sent {len(result_data)}B")
# Additional summary
summary = ""
if len(summary_src) > 1:
# Limit summary source to ensure we don't exceed API limits
summary_src_limited = self.limit_text_for_api(summary_src, max_tokens=int(APP_CONFIG.get("Connector_AiWebscraping_MAX_TOKENS")))
summary = await self.ai_service.call_api(
messages=[
{
"role": "system",
"content": "You create concise summaries of research findings."
},
{
"role": "user",
"content": f"Please summarize these findings in 5-6 sentences: {summary_src_limited}\n"
}
]
)
# Format the final result
result = f"## Web Research Results\n\n### Summary\n{summary}\n\n### Detailed Results{result_data}"
return result
async def run_web_query(self, prompt: str) -> List[Dict]:
if prompt=="":
return []
ptext=f"""Create a comprehensive web research strategy for the task = '{prompt.replace("'","")}'. Return the results as a Python dictionary with these specific keys. If specific url are provided and the task requires analysis only on the provided url, then leave 'skey' open.
'url': A list of maximum {self.max_url} specific URLs extracted from the task string.
'skey': A list of maximum {self.max_key} key sentences to search for on the web. These should be precise, diverse, and targeted to get the most relevant information.
Format your response as a valid json object with these two keys. Do not include any explanatory text or markdown outside of the object definition.
"""
content_text = await self.ai_service.call_api(
messages=[
{
"role": "system",
"content": "You are a web research expert who develops precise search strategies."
},
{
"role": "user",
"content": ptext
}
]
)
# Remove markdown formatting if present
if content_text.startswith("```json"):
# Find the closing fence of the JSON block (search after the opening ```json marker)
end_marker = "```"
end_index = content_text.rfind(end_marker, 7)
if end_index != -1:
# Extract the JSON content without the markdown markers
content_text = content_text[7:end_index].strip()
# Now parse the JSON
try:
logger.info(f"Valid json received: {str(content_text)}")
pjson = json.loads(content_text)
# Now call scrape_json with the parsed dictionary
result_json = await self.scrape_json(pjson)
return result_json
except json.JSONDecodeError as e:
logger.error(f"Failed to parse JSON: {e}")
logger.error(f"Cleaned content: {content_text[:100]}...")
return []
async def scrape_json(self, research_strategy: Dict[str, List]) -> List[Dict]:
"""
Scrapes web content based on a research strategy JSON.
Args:
research_strategy: A dictionary containing:
- 'skey': List of search keywords
- 'url': List of direct URLs to scrape
Returns:
List of result dictionaries with 'title', 'url', 'snippet', and 'data' entries
"""
logger.info("Starting JSON-based web scraping")
results = []
# Validate input structure
if not isinstance(research_strategy, dict):
logger.error("Invalid research_strategy format: not a dictionary")
return {"error": "Invalid research_strategy format: not a dictionary"}
keys = research_strategy.get("skey", [])
direct_urls = research_strategy.get("url", [])
if not isinstance(keys, list) or not isinstance(direct_urls, list):
logger.error("Invalid research_strategy format: keys, or url is not a list")
return {"error": "Invalid research_strategy format: keys, or url is not a list"}
# Process search keywords through search engine
for keyword in keys:
logger.info(f"Processing keyword: {keyword}")
found_results = self.search_web(keyword) # List with Dict: title,url,snippet,data
logger.info(f"... {len(found_results)} results found")
results.extend(found_results)
# Process direct URLs
logger.info(f"Processing {len(direct_urls)} direct URLs")
for url in direct_urls:
if any(r.get('url') == url for r in results):
logger.info(f"Skipping already scraped URL: {url}")
continue
soup = self.read_url(url)
# Extract title from the page if it exists
if isinstance(soup, BeautifulSoup):
title_tag = soup.find('title')
title = title_tag.text.strip() if title_tag else "No title"
# Alternative: You could also look for h1 tags if the title tag is missing
if title == "No title":
h1_tag = soup.find('h1')
if h1_tag:
title = h1_tag.text.strip()
else:
# Handle the case where soup is an error message string
title = "Error fetching page"
results.append(self.parse_result(soup, title, url))
logger.info(f"JSON scraping completed. Scraped {len(results)} URLs in total")
return results
def extract_main_content(self, soup: BeautifulSoup, max_chars: int = int(APP_CONFIG.get("Connector_AiWebscraping_MAX_TOKENS"))) -> str:
"""
Extract the main content from an HTML page while limiting character count.
Args:
soup: BeautifulSoup object containing the page content
max_chars: Maximum number of characters to extract
Returns:
Extracted main content as string
"""
if not isinstance(soup, BeautifulSoup):
return str(soup)[:max_chars]
# Try to find main content elements in order of priority
main_content = None
for selector in ['main', 'article', '#content', '.content', '#main', '.main']:
content = soup.select_one(selector)
if content:
main_content = content
break
# If no main content found, use the body
if not main_content:
main_content = soup.find('body') or soup
# Remove script, style, nav, footer elements that don't contribute to main content
for element in main_content.select('script, style, nav, footer, header, aside, .sidebar, #sidebar, .comments, #comments, .advertisement, .ads, iframe'):
element.extract()
# Extract text content
text_content = main_content.get_text(separator=' ', strip=True)
# Limit to max_chars
return text_content[:max_chars]
def tokenize_for_counting(self, text: str) -> List[str]:
"""
Simple token counter for estimating token usage.
This is an approximation since the exact tokenization depends on the model.
Args:
text: Input text
Returns:
List of tokens
"""
# Simple tokenization by splitting on whitespace and punctuation
import re
return re.findall(r'\w+|[^\w\s]', text)
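# Illustrative behaviour of the regex-based approximation above:
#   tokenize_for_counting("Hello, world!") -> ["Hello", ",", "world", "!"]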
def count_tokens(self, text: str) -> int:
"""
Count the approximate number of tokens in a text.
Args:
text: Input text
Returns:
Estimated token count
"""
tokens = self.tokenize_for_counting(text)
return len(tokens)
def limit_text_for_api(self, text: str, max_tokens: int = int(APP_CONFIG.get("Connector_AiWebscraping_MAX_TOKENS"))) -> str:
"""
Limit the text to a maximum number of tokens.
Args:
text: Input text
max_tokens: Maximum number of tokens allowed
Returns:
Limited text
"""
if not text:
return ""
tokens = self.tokenize_for_counting(text)
# If text is already under the limit, return as is
if len(tokens) <= max_tokens:
return text
# Otherwise, truncate text to max_tokens
return " ".join(tokens[:max_tokens]) + "... [content truncated due to length]"
def search_web(self, query: str) -> List[Dict]:
formatted_query = quote_plus(query)
url = f"{APP_CONFIG("Connector_AiWebscraping_SEARCH_ENGINE")}{formatted_query}"
search_results_soup = self.read_url(url)
if not search_results_soup or not search_results_soup.select('.result'):
logger.warning(f"No search results found for: {query}")
return []
# Extract search results
results = []
# Find all result containers
result_elements = search_results_soup.select('.result')
for result in result_elements:
# Extract title
title_element = result.select_one('.result__a')
title = title_element.text.strip() if title_element else 'No title'
# Extract URL (DuckDuckGo uses redirects, need to extract from href param)
url_element = title_element.get('href') if title_element else ''
extracted_url = 'No URL'
if url_element:
# Extract the actual URL from DuckDuckGo's redirect
if url_element.startswith('/d.js?q='):
start = url_element.find('?q=') + 3 # Skip '?q='
end = url_element.find('&', start) if '&' in url_element[start:] else None
extracted_url = unquote(url_element[start:end])
# Make sure the URL has the correct protocol prefix
if not extracted_url.startswith(('http://', 'https://')):
if not extracted_url.startswith('//'):
extracted_url = 'https://' + extracted_url
else:
extracted_url = 'https:' + extracted_url
else:
extracted_url = url_element
# Extract snippet directly from search results page
snippet_element = result.select_one('.result__snippet')
snippet = snippet_element.text.strip() if snippet_element else 'No description'
# Now fetch the actual page content for the data field
target_page_soup = self.read_url(extracted_url)
# Use the new content extraction method to limit content size
content = self.extract_main_content(target_page_soup, max_chars=int(APP_CONFIG.get("Connector_AiWebscraping_MAX_TOKENS")))
results.append({
'title': title,
'url': extracted_url,
'snippet': snippet,
'data': content
})
# Limit the number of results if needed
if len(results) >= self.max_result:
break
return results
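# Each dictionary appended above has the following shape (values are illustrative):
#   {"title": "Page title", "url": "https://example.com/page",
#    "snippet": "Result snippet", "data": "Extracted main page text ..."}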
def read_url(self, url: str) -> BeautifulSoup:
"""
Reads a URL and returns a BeautifulSoup parser for the content.
Returns an empty BeautifulSoup object for errors.
Args:
url: The URL to read
Returns:
BeautifulSoup object with the content or empty for errors
"""
headers = {
'User-Agent': APP_CONFIG.get("Connector_AiWebscraping_USER_AGENT"),
'Accept': 'text/html,application/xhtml+xml,application/xml',
'Accept-Language': 'en-US,en;q=0.9',
}
try:
# Initial request
response = requests.get(url, headers=headers, timeout=int(APP_CONFIG.get("Connector_AiWebscraping_TIMEOUT")))
# Polling for status 202
if response.status_code == 202:
# Up to four retry attempts with increasing intervals
backoff_times = [0.5, 1.0, 2.0, 5.0]  # 0.5s, 1s, 2s, then 5s
for wait_time in backoff_times:
time.sleep(wait_time) # Wait with increasing time
response = requests.get(url, headers=headers, timeout=int(APP_CONFIG.get("Connector_AiWebscraping_TIMEOUT")))
# If no 202 anymore, then break
if response.status_code != 202:
break
# For other error statuses, raise an error
response.raise_for_status()
# Parse HTML
return BeautifulSoup(response.text, 'html.parser')
except Exception as e:
# Create empty BeautifulSoup object
return BeautifulSoup("<html><body></body></html>", 'html.parser')
def parse_result(self, data: BeautifulSoup, title: str, url: str) -> Dict[str, str]:
"""
Parse a BeautifulSoup object into a result dictionary.
Args:
data: BeautifulSoup object containing the page content
title: Page title
url: Page URL
Returns:
Dictionary with result data
"""
# Extract content using the main content extraction method
content = self.extract_main_content(data, max_chars=int(APP_CONFIG.get("Connector_AiWebscraping_MAX_TOKENS")))
result = {
'title': title,
'url': url,
'snippet': 'No description', # Default value
'data': content
}
return result
# Singleton instance
_webcrawler_agent = None
def get_webcrawler_agent():
"""Returns a singleton instance of the WebCrawler Agent"""
global _webcrawler_agent
if _webcrawler_agent is None:
_webcrawler_agent = WebcrawlerAgent()
return _webcrawler_agent
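# Minimal usage sketch (illustrative only): assumes an `ai_service` object exposing the async
# call_api(messages=...) method used throughout this module; `my_ai_service` is hypothetical.
#
#     import asyncio
#     agent = get_webcrawler_agent()
#     agent.ai_service = my_ai_service
#     reply = asyncio.run(agent.process_message(
#         {"content": "Find recent articles on solid-state batteries"},
#         context={"workflow_id": "wf-example-1"},
#     ))
#     print(reply["content"])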