1056 lines
No EOL
46 KiB
Python
1056 lines
No EOL
46 KiB
Python
"""
|
|
WebCrawler-Agent for research and retrieval of information from the web.
|
|
Adapted for the refactored Core-Module with language-agnostic detection.
|
|
"""
|
|
|
|
import json
|
|
import logging
|
|
import time
|
|
import traceback
|
|
from typing import List, Dict, Any, Optional
|
|
from urllib.parse import quote_plus, unquote
|
|
|
|
from bs4 import BeautifulSoup
|
|
import requests
|
|
from modules.agentservice_base import BaseAgent
|
|
from modules.agentservice_utils import MessageUtils, LoggingUtils
|
|
from modules.agentservice_protocol import AgentCommunicationProtocol
|
|
from modules.utility import APP_CONFIG
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
class WebcrawlerAgent(BaseAgent):
|
|
|
|
"""Agent for Web Research and Information Retrieval"""
|
|
|
|
def __init__(self):
|
|
"""Initialize the WebCrawler Agent"""
|
|
super().__init__()
|
|
self.id = "webcrawler"
|
|
self.name = "Webscraper"
|
|
self.type = "scraper"
|
|
self.description = "Researches information on the web"
|
|
self.capabilities = "web_search,information_retrieval,data_collection,source_verification,content_integration"
|
|
self.result_format = "SearchResults"
|
|
|
|
# Add enhanced document capabilities
|
|
self.supports_documents = True
|
|
self.document_capabilities = ["read", "create"]
|
|
self.required_context = ["workflow_id"]
|
|
self.document_handler = None
|
|
|
|
# Initialize AI service
|
|
self.ai_service = None
|
|
|
|
# Initialize protocol
|
|
self.protocol = AgentCommunicationProtocol()
|
|
|
|
# Initialize utility classes
|
|
self.message_utils = MessageUtils()
|
|
|
|
# Web-Crawling configuration
|
|
self.max_url = int(APP_CONFIG.get("Connector_AiWebscraping_MAX_URLS"))
|
|
self.max_key = int(APP_CONFIG.get("Connector_AiWebscraping_MAX_SEARCH_KEYWORDS"))
|
|
self.max_result = int(APP_CONFIG.get("Connector_AiWebscraping_MAX_SEARCH_RESULTS"))
|
|
self.timeout = int(APP_CONFIG.get("Connector_AiWebscraping_TIMEOUT"))
|
|
|
|
def get_agent_info(self) -> Dict[str, Any]:
|
|
"""Get agent information for agent registry"""
|
|
info = super().get_agent_info()
|
|
info.update({
|
|
"metadata": {
|
|
"max_url": self.max_url,
|
|
"max_result": self.max_result,
|
|
"timeout": self.timeout
|
|
}
|
|
})
|
|
return info
|
|
|
|
def set_document_handler(self, document_handler):
|
|
"""Set the document handler for file operations"""
|
|
self.document_handler = document_handler
|
|
|
|
async def process_message(self, message: Dict[str, Any], context: Dict[str, Any] = None) -> Dict[str, Any]:
|
|
"""
|
|
Process a message and conduct web research if appropriate.
|
|
|
|
Args:
|
|
message: The message to process
|
|
context: Additional context
|
|
|
|
Returns:
|
|
The generated response or rejection if not a web research request
|
|
"""
|
|
# Extract workflow_id from context or message
|
|
workflow_id = context.get("workflow_id") if context else message.get("workflow_id", "unknown")
|
|
|
|
# Get or create logging_utils
|
|
log_func = context.get("log_func") if context else None
|
|
logging_utils = LoggingUtils(workflow_id, log_func)
|
|
|
|
# Create response structure
|
|
response = {
|
|
"role": "assistant",
|
|
"content": "",
|
|
"agent_id": self.id,
|
|
"agent_type": self.type,
|
|
"agent_name": self.name,
|
|
"result_format": self.result_format,
|
|
"workflow_id": workflow_id
|
|
}
|
|
|
|
try:
|
|
# Get the query from the message
|
|
prompt = await self.get_prompt(message)
|
|
|
|
# Check if this is explicitly a web research request using AI
|
|
is_web_research = await self._is_web_research_request_ai(prompt)
|
|
|
|
if not is_web_research:
|
|
# Reject non-web research requests
|
|
logging_utils.info("Request rejected: not a web research task", "agents")
|
|
response["content"] = "This request doesn't appear to require web research. Redirecting to a more appropriate agent."
|
|
response["status"] = "rejected"
|
|
response["rejection_reason"] = "not_web_research"
|
|
return response
|
|
|
|
# Continue with web research process
|
|
logging_utils.info(f"Web research for: {prompt[:50]}...", "agents")
|
|
|
|
# Send status update using protocol
|
|
if log_func:
|
|
status_message = self.protocol.create_status_update_message(
|
|
status_description="Starting web research",
|
|
sender_id=self.id,
|
|
status="in_progress",
|
|
progress=0.0,
|
|
context_id=workflow_id
|
|
)
|
|
log_func(workflow_id, status_message.content, "info", self.id, self.name)
|
|
|
|
# Update progress using protocol - 10% for starting the query analysis
|
|
if log_func:
|
|
status_message = self.protocol.create_status_update_message(
|
|
status_description=f"Analyzing search strategy for: {prompt[:30]}...",
|
|
sender_id=self.id,
|
|
status="in_progress",
|
|
progress=0.1,
|
|
context_id=workflow_id
|
|
)
|
|
log_func(workflow_id, status_message.content, "info", self.id, self.name)
|
|
|
|
# Prepare the web query strategy
|
|
try:
|
|
# Log progress - 20% for query strategy preparation
|
|
if log_func:
|
|
status_message = self.protocol.create_status_update_message(
|
|
status_description="Creating search strategy",
|
|
sender_id=self.id,
|
|
status="in_progress",
|
|
progress=0.2,
|
|
context_id=workflow_id
|
|
)
|
|
log_func(workflow_id, status_message.content, "info", self.id, self.name)
|
|
|
|
# Get the query strategy
|
|
content_text = await self.ai_service.call_api(
|
|
messages=[
|
|
{
|
|
"role": "system",
|
|
"content": "You are a web research expert who develops precise search strategies."
|
|
},
|
|
{
|
|
"role": "user",
|
|
"content": f"""Create a comprehensive web research strategy for the task = '{prompt.replace("'","")}'. Return the results as a Python dictionary with these specific keys. If specific url are provided and the task requires analysis only on the provided url, then leave 'skey' open.
|
|
|
|
'url': A list of maximum {self.max_url} specific URLs extracted from the task string.
|
|
|
|
'skey': A list of maximum {self.max_key} key sentences to search for on the web. These should be precise, diverse, and targeted to get the most relevant information.
|
|
|
|
Format your response as a valid json object with these two keys. Do not include any explanatory text or markdown outside of the object definition.
|
|
"""
|
|
}
|
|
]
|
|
)
|
|
|
|
# Try to parse the JSON result
|
|
if content_text.startswith("```json"):
|
|
# Find the end of the JSON block
|
|
end_marker = "```"
|
|
end_index = content_text.rfind(end_marker)
|
|
if end_index != -1:
|
|
# Extract the JSON content without the markdown markers
|
|
content_text = content_text[7:end_index].strip()
|
|
|
|
try:
|
|
logger.info(f"Valid json received: {str(content_text)}")
|
|
pjson = json.loads(content_text)
|
|
|
|
# Log parsed search strategy
|
|
search_keys = pjson.get("skey", [])
|
|
search_urls = pjson.get("url", [])
|
|
|
|
if search_keys:
|
|
logging_utils.info(f"Searching for {len(search_keys)} key terms: {', '.join(search_keys[:2])}...", "agents")
|
|
|
|
if search_urls:
|
|
logging_utils.info(f"Searching in {len(search_urls)} direct URLs: {', '.join(search_urls[:2])}...", "agents")
|
|
|
|
# Log progress - 30% for starting the search
|
|
if log_func:
|
|
status_message = self.protocol.create_status_update_message(
|
|
status_description="Starting web search",
|
|
sender_id=self.id,
|
|
status="in_progress",
|
|
progress=0.3,
|
|
context_id=workflow_id
|
|
)
|
|
log_func(workflow_id, status_message.content, "info", self.id, self.name)
|
|
|
|
# Execute the search
|
|
results = []
|
|
total_tasks = len(search_keys) + len(search_urls)
|
|
tasks_completed = 0
|
|
|
|
# Process search keywords
|
|
for keyword in search_keys:
|
|
logging_utils.info(f"Searching web for: '{keyword}'", "agents")
|
|
|
|
# Log specific keyword search progress
|
|
if log_func:
|
|
progress_pct = 0.3 + (0.5 * (tasks_completed / total_tasks))
|
|
status_message = self.protocol.create_status_update_message(
|
|
status_description=f"Searching for: '{keyword}'",
|
|
sender_id=self.id,
|
|
status="in_progress",
|
|
progress=progress_pct,
|
|
context_id=workflow_id
|
|
)
|
|
log_func(workflow_id, status_message.content, "info", self.id, self.name)
|
|
|
|
keyword_results = self.search_web(keyword)
|
|
results.extend(keyword_results)
|
|
logging_utils.info(f"Found: {len(keyword_results)} results for '{keyword}'", "agents")
|
|
|
|
tasks_completed += 1
|
|
|
|
# Process direct URLs
|
|
for url in search_urls:
|
|
logging_utils.info(f"Extracting content from: {url}", "agents")
|
|
|
|
# Log specific URL extraction progress
|
|
if log_func:
|
|
progress_pct = 0.3 + (0.5 * (tasks_completed / total_tasks))
|
|
status_message = self.protocol.create_status_update_message(
|
|
status_description=f"Reading URL: {url}",
|
|
sender_id=self.id,
|
|
status="in_progress",
|
|
progress=progress_pct,
|
|
context_id=workflow_id
|
|
)
|
|
log_func(workflow_id, status_message.content, "info", self.id, self.name)
|
|
|
|
soup = self.read_url(url)
|
|
|
|
# Extract title from the page if it exists
|
|
if isinstance(soup, BeautifulSoup):
|
|
title_tag = soup.find('title')
|
|
title = title_tag.text.strip() if title_tag else "No title"
|
|
|
|
# Alternative: You could also look for h1 tags if the title tag is missing
|
|
if title == "No title":
|
|
h1_tag = soup.find('h1')
|
|
if h1_tag:
|
|
title = h1_tag.text.strip()
|
|
else:
|
|
# Handle the case where soup is an error message string
|
|
title = "Error fetching page"
|
|
|
|
result = self.parse_result(soup, title, url)
|
|
results.append(result)
|
|
logging_utils.info(f"Extracted: '{title}' from {url}", "agents")
|
|
|
|
tasks_completed += 1
|
|
|
|
# Log progress - 80% for processing results
|
|
if log_func:
|
|
status_message = self.protocol.create_status_update_message(
|
|
status_description=f"Analyzing {len(results)} search results",
|
|
sender_id=self.id,
|
|
status="in_progress",
|
|
progress=0.8,
|
|
context_id=workflow_id
|
|
)
|
|
log_func(workflow_id, status_message.content, "info", self.id, self.name)
|
|
|
|
# Process results for the final output
|
|
logging_utils.info(f"Analyzing {len(results)} web results", "agents")
|
|
|
|
# Generate summaries for each result
|
|
processed_results = []
|
|
for i, result in enumerate(results):
|
|
result_data_limited = self.limit_text_for_api(result['data'], max_tokens=int(APP_CONFIG("Connector_AiWebscraping_MAX_TOKENS")))
|
|
|
|
# Log individual result processing
|
|
logging_utils.info(f"Analyzing result {i+1}/{len(results)}: {result['title'][:30]}...", "agents")
|
|
|
|
web_answer_instructions = f"""
|
|
Summarize this search result according to the original request in approximately 2000 characters. Original request = '{prompt.replace("'","")}'
|
|
Focus on the most important insights and connect them to the original request. You can skip any introduction.
|
|
Extract only relevant and high-quality information related to the request, and present it in a clear format. Provide a balanced view of the researched information.
|
|
|
|
Here is the search result:
|
|
{result_data_limited}
|
|
"""
|
|
|
|
content_summary = await self.ai_service.call_api(
|
|
messages=[
|
|
{
|
|
"role": "system",
|
|
"content": "You are an information analyst who precisely and relevantly summarizes web content."
|
|
},
|
|
{
|
|
"role": "user",
|
|
"content": web_answer_instructions
|
|
}
|
|
]
|
|
)
|
|
|
|
# Limit summary to ~2000 characters
|
|
content_summary = content_summary[:2000]
|
|
|
|
processed_result = {
|
|
"title": result['title'],
|
|
"url": result['url'],
|
|
"snippet": result['snippet'],
|
|
"summary": content_summary
|
|
}
|
|
|
|
processed_results.append(processed_result)
|
|
|
|
# Log progress - 90% for creating final summary
|
|
if log_func:
|
|
status_message = self.protocol.create_status_update_message(
|
|
status_description="Creating overall summary",
|
|
sender_id=self.id,
|
|
status="in_progress",
|
|
progress=0.9,
|
|
context_id=workflow_id
|
|
)
|
|
log_func(workflow_id, status_message.content, "info", self.id, self.name)
|
|
|
|
# Create the final combined summary
|
|
all_summaries = "\n\n".join([r["summary"] for r in processed_results])
|
|
all_summaries_limited = self.limit_text_for_api(all_summaries, max_tokens=int(APP_CONFIG("Connector_AiWebscraping_MAX_TOKENS")))
|
|
|
|
logging_utils.info("Creating overall summary of web research", "agents")
|
|
|
|
final_summary = await self.ai_service.call_api(
|
|
messages=[
|
|
{
|
|
"role": "system",
|
|
"content": "You create concise summaries of research findings."
|
|
},
|
|
{
|
|
"role": "user",
|
|
"content": f"Please summarize these findings in 5-6 sentences: {all_summaries_limited}\n"
|
|
}
|
|
]
|
|
)
|
|
|
|
# Get the language of the request to use for result headers
|
|
request_language_analysis = await self.ai_service.call_api(
|
|
messages=[
|
|
{
|
|
"role": "system",
|
|
"content": "You determine the language of a text and return only the language name."
|
|
},
|
|
{
|
|
"role": "user",
|
|
"content": f"What language is this text in? Only respond with the language name: {prompt}"
|
|
}
|
|
]
|
|
)
|
|
|
|
# Get headers in the right language
|
|
headers = await self._get_localized_headers(request_language_analysis.strip())
|
|
|
|
# Format the final result
|
|
final_result = f"## {headers['web_research_results']}\n\n### {headers['summary']}\n{final_summary}\n\n### {headers['detailed_results']}\n"
|
|
|
|
for i, result in enumerate(processed_results, 1):
|
|
final_result += f"\n\n[{i}] {result['title']}\n{headers['url']}: {result['url']}\n{headers['snippet']}: {result['snippet']}\n{headers['content']}: {result['summary']}"
|
|
|
|
# Set the content in the response
|
|
response["content"] = final_result
|
|
|
|
# Log completion - 100% progress
|
|
if log_func:
|
|
status_message = self.protocol.create_status_update_message(
|
|
status_description="Web research completed",
|
|
sender_id=self.id,
|
|
status="completed",
|
|
progress=1.0,
|
|
context_id=workflow_id
|
|
)
|
|
log_func(workflow_id, status_message.content, "info", self.id, self.name)
|
|
|
|
logging_utils.info("Web research successfully completed", "agents")
|
|
|
|
return response
|
|
|
|
except json.JSONDecodeError as e:
|
|
logging_utils.error(f"Error parsing JSON data: {e}", "error")
|
|
|
|
# Fallback for JSON parse error
|
|
if log_func:
|
|
status_message = self.protocol.create_status_update_message(
|
|
status_description=f"Error parsing search strategy: {str(e)}",
|
|
sender_id=self.id,
|
|
status="error",
|
|
progress=0.0,
|
|
context_id=workflow_id
|
|
)
|
|
log_func(workflow_id, status_message.content, "error", self.id, self.name)
|
|
|
|
# Use a simple fallback approach
|
|
logging_utils.info("Using fallback search strategy with direct query", "agents")
|
|
|
|
# Perform a direct search with the original query
|
|
results = self.search_web(prompt)
|
|
|
|
# Process and format results directly
|
|
if results:
|
|
result_text = "## Web Research Results (Fallback Mode)\n\n"
|
|
|
|
for i, result in enumerate(results, 1):
|
|
result_text += f"### [{i}] {result['title']}\n"
|
|
result_text += f"URL: {result['url']}\n"
|
|
result_text += f"Snippet: {result['snippet']}\n\n"
|
|
|
|
response["content"] = result_text
|
|
else:
|
|
response["content"] = "## Web Research Results\n\nNo relevant results were found."
|
|
|
|
return response
|
|
|
|
except Exception as e:
|
|
error_msg = f"Error during web research: {str(e)}"
|
|
logging_utils.error(error_msg, "error")
|
|
|
|
# Create error response using protocol
|
|
error_message = self.protocol.create_error_message(
|
|
error_description=error_msg,
|
|
sender_id=self.id,
|
|
error_type="web_search",
|
|
error_details={"traceback": traceback.format_exc()},
|
|
context_id=workflow_id
|
|
)
|
|
|
|
# Log error status
|
|
if log_func:
|
|
status_message = self.protocol.create_status_update_message(
|
|
status_description=f"Error during web research: {str(e)}",
|
|
sender_id=self.id,
|
|
status="error",
|
|
progress=1.0,
|
|
context_id=workflow_id
|
|
)
|
|
log_func(workflow_id, status_message.content, "error", self.id, self.name)
|
|
|
|
response["content"] = f"## Error during web research\n\n{error_msg}\n\n```\n{traceback.format_exc()}\n```"
|
|
|
|
return response
|
|
|
|
except Exception as e:
|
|
error_msg = f"Error during web research: {str(e)}"
|
|
logging_utils.error(error_msg, "error")
|
|
|
|
# Create error response using protocol
|
|
error_message = self.protocol.create_error_message(
|
|
error_description=error_msg,
|
|
sender_id=self.id,
|
|
error_type="web_search",
|
|
error_details={"traceback": traceback.format_exc()},
|
|
context_id=workflow_id
|
|
)
|
|
|
|
# Log error status
|
|
if log_func:
|
|
status_message = self.protocol.create_status_update_message(
|
|
status_description=f"Error during web research: {str(e)}",
|
|
sender_id=self.id,
|
|
status="error",
|
|
progress=1.0,
|
|
context_id=workflow_id
|
|
)
|
|
log_func(workflow_id, status_message.content, "error", self.id, self.name)
|
|
|
|
response["content"] = f"## Error during web research\n\n{error_msg}\n\n```\n{traceback.format_exc()}\n```"
|
|
|
|
return response
|
|
|
|
async def _is_web_research_request_ai(self, prompt: str) -> bool:
|
|
"""
|
|
Uses AI to determine if a prompt requires web research, making it language-agnostic.
|
|
|
|
Args:
|
|
prompt: The user prompt
|
|
|
|
Returns:
|
|
True if this is explicitly a web research request, False otherwise
|
|
"""
|
|
if not self.ai_service:
|
|
# Fallback to simpler detection if AI service isn't available
|
|
return self._simple_web_detection(prompt)
|
|
|
|
try:
|
|
# Create a prompt to analyze whether this is a web research request
|
|
analysis_prompt = f"""
|
|
Analyze the following request and determine if it explicitly requires web research or online information.
|
|
|
|
REQUEST: {prompt}
|
|
|
|
A request requires web research if:
|
|
1. It explicitly asks to search for information online
|
|
2. It contains URLs or references to websites
|
|
3. It requests current information that would be available on the web
|
|
4. It asks to find information from web sources
|
|
5. It implicitly requires up-to-date information from the internet
|
|
|
|
ONLY respond with a single word - either "YES" if web research is required, or "NO" if it is not.
|
|
DO NOT include any explanation, just the answer YES or NO.
|
|
"""
|
|
|
|
# Call AI to analyze
|
|
response = await self.ai_service.call_api(
|
|
messages=[
|
|
{
|
|
"role": "system",
|
|
"content": "You determine if a request requires web research. Always answer with only YES or NO."
|
|
},
|
|
{
|
|
"role": "user",
|
|
"content": analysis_prompt
|
|
}
|
|
]
|
|
)
|
|
|
|
# Clean the response
|
|
response = response.strip().upper()
|
|
|
|
# Check if the response indicates it's a web research task
|
|
if "YES" in response:
|
|
return True
|
|
else:
|
|
return False
|
|
|
|
except Exception as e:
|
|
# Log error but don't fail, fall back to simpler detection
|
|
logger.warning(f"Error using AI to detect web research request: {str(e)}")
|
|
return self._simple_web_detection(prompt)
|
|
|
|
def _simple_web_detection(self, prompt: str) -> bool:
|
|
"""
|
|
Simpler fallback method to detect web research requests based on URLs.
|
|
|
|
Args:
|
|
prompt: The user prompt
|
|
|
|
Returns:
|
|
True if there are clear URL indicators, False otherwise
|
|
"""
|
|
# URLs in the prompt strongly indicate web research
|
|
url_indicators = ["http://", "https://", "www.", ".com", ".org", ".net", ".edu", ".gov"]
|
|
|
|
# Check for URL patterns in the prompt
|
|
contains_url = any(indicator in prompt.lower() for indicator in url_indicators)
|
|
|
|
return contains_url
|
|
|
|
async def _get_localized_headers(self, language: str) -> Dict[str, str]:
|
|
"""
|
|
Get localized headers for the web research results based on detected language.
|
|
|
|
Args:
|
|
language: The detected language
|
|
|
|
Returns:
|
|
Dictionary with localized headers
|
|
"""
|
|
# Default English headers
|
|
headers = {
|
|
"web_research_results": "Web Research Results",
|
|
"summary": "Summary",
|
|
"detailed_results": "Detailed Results",
|
|
"url": "URL",
|
|
"snippet": "Snippet",
|
|
"content": "Content"
|
|
}
|
|
|
|
# If language detection failed or is English, return defaults
|
|
if not language or language.lower() in ["english", "en"]:
|
|
return headers
|
|
|
|
try:
|
|
# Use AI to translate headers to the detected language
|
|
translation_prompt = f"""
|
|
Translate these web research result headers to {language}:
|
|
|
|
Web Research Results
|
|
Summary
|
|
Detailed Results
|
|
URL
|
|
Snippet
|
|
Content
|
|
|
|
Return a JSON object with these keys:
|
|
web_research_results, summary, detailed_results, url, snippet, content
|
|
"""
|
|
|
|
# Call AI for translation
|
|
response = await self.ai_service.call_api(
|
|
messages=[
|
|
{
|
|
"role": "system",
|
|
"content": "You translate headers to the specified language and return them as JSON."
|
|
},
|
|
{
|
|
"role": "user",
|
|
"content": translation_prompt
|
|
}
|
|
]
|
|
)
|
|
|
|
# Extract JSON
|
|
import re
|
|
json_match = re.search(r'\{.*\}', response, re.DOTALL)
|
|
|
|
if json_match:
|
|
translated_headers = json.loads(json_match.group(0))
|
|
return translated_headers
|
|
|
|
except Exception as e:
|
|
# Log error but continue with English headers
|
|
logger.warning(f"Error translating headers to {language}: {str(e)}")
|
|
|
|
return headers
|
|
|
|
async def get_prompt(self, message_context: Dict[str, Any]) -> str:
|
|
task = message_context.get("content", "")
|
|
return task.strip()
|
|
|
|
async def get_web_query(self, message_context: Dict[str, Any]) -> str:
|
|
prompt = await self.get_prompt(message_context)
|
|
result_json = await self.run_web_query(prompt)
|
|
result_data = ""
|
|
summary_src = ""
|
|
|
|
logger.info(f"Web analysis prompt '{prompt}' delivers {len(result_json)} results.")
|
|
if isinstance(result_json, list):
|
|
total_tokens = 0
|
|
|
|
for i, result in enumerate(result_json, 1):
|
|
# Limit content size for each result
|
|
result_data_limited = self.limit_text_for_api(result['data'], max_tokens=int(APP_CONFIG("Connector_AiWebscraping_MAX_TOKENS"))) # Allow ~15000 tokens per result
|
|
|
|
web_answer_instructions = f"""
|
|
Summarize this search result according to the original request in approximately 2000 characters. Original request = '{prompt.replace("'","")}'
|
|
Focus on the most important insights and connect them to the original request. You can skip any introduction.
|
|
Extract only relevant and high-quality information related to the request, and present it in a clear format. Provide a balanced view of the researched information.
|
|
|
|
Here is the search result:
|
|
{result_data_limited}
|
|
"""
|
|
|
|
# Count tokens in the instructions to ensure we don't exceed API limits
|
|
instruction_tokens = self.count_tokens(web_answer_instructions)
|
|
if total_tokens + instruction_tokens > 60000:
|
|
logger.warning(f"Skipping result {i} to avoid exceeding token limit")
|
|
break
|
|
|
|
total_tokens += instruction_tokens
|
|
|
|
# Additional instructions for web research
|
|
content_text = await self.ai_service.call_api(
|
|
messages=[
|
|
{
|
|
"role": "system",
|
|
"content": "You are an information analyst who precisely and relevantly summarizes web content."
|
|
},
|
|
{
|
|
"role": "user",
|
|
"content": web_answer_instructions
|
|
}
|
|
]
|
|
)
|
|
|
|
# Create a summary but ensure we stay within token limits
|
|
content_summary = content_text[:2000] # Limit to ~2000 characters
|
|
result_data += f"\n\n[{i}] {result['title']}\nURL: {result['url']}\nSnippet: {result['snippet']}\nContent: {content_summary}"
|
|
summary_src += f"\n{content_summary}"
|
|
|
|
# Update token count
|
|
total_tokens += self.count_tokens(content_summary) + 100 # Add buffer for formatting
|
|
else:
|
|
result_data = "no data received"
|
|
|
|
logger.info(f"Web analysis result sent {len(result_data)}B")
|
|
|
|
# Additional summary
|
|
summary = ""
|
|
if len(summary_src) > 1:
|
|
# Limit summary source to ensure we don't exceed API limits
|
|
summary_src_limited = self.limit_text_for_api(summary_src, max_tokens=int(APP_CONFIG("Connector_AiWebscraping_MAX_TOKENS")))
|
|
|
|
summary = await self.ai_service.call_api(
|
|
messages=[
|
|
{
|
|
"role": "system",
|
|
"content": "You create concise summaries of research findings."
|
|
},
|
|
{
|
|
"role": "user",
|
|
"content": f"Please summarize these findings in 5-6 sentences: {summary_src_limited}\n"
|
|
}
|
|
]
|
|
)
|
|
|
|
# Format the final result
|
|
result = f"## Web Research Results\n\n### Summary\n{summary}\n\n### Detailed Results{result_data}"
|
|
return result
|
|
|
|
async def run_web_query(self, prompt: str) -> List[Dict]:
|
|
if prompt=="":
|
|
return []
|
|
|
|
ptext=f"""Create a comprehensive web research strategy for the task = '{prompt.replace("'","")}'. Return the results as a Python dictionary with these specific keys. If specific url are provided and the task requires analysis only on the provided url, then leave 'skey' open.
|
|
|
|
'url': A list of maximum {self.max_url} specific URLs extracted from the task string.
|
|
|
|
'skey': A list of maximum {self.max_key} key sentences to search for on the web. These should be precise, diverse, and targeted to get the most relevant information.
|
|
|
|
Format your response as a valid json object with these two keys. Do not include any explanatory text or markdown outside of the object definition.
|
|
"""
|
|
|
|
content_text = await self.ai_service.call_api(
|
|
messages=[
|
|
{
|
|
"role": "system",
|
|
"content": "You are a web research expert who develops precise search strategies."
|
|
},
|
|
{
|
|
"role": "user",
|
|
"content": ptext
|
|
}
|
|
]
|
|
)
|
|
# Remove markdown formatting if present
|
|
if content_text.startswith("```json"):
|
|
# Find the end of the JSON block
|
|
end_marker = "```"
|
|
end_index = content_text.rfind(end_marker)
|
|
if end_index != -1:
|
|
# Extract the JSON content without the markdown markers
|
|
content_text = content_text[7:end_index].strip()
|
|
|
|
# Now parse the JSON
|
|
try:
|
|
logger.info(f"Valid json received: {str(content_text)}")
|
|
pjson = json.loads(content_text)
|
|
# Now call scrape_json with the parsed dictionary
|
|
result_json = await self.scrape_json(pjson)
|
|
return result_json
|
|
except json.JSONDecodeError as e:
|
|
logger.error(f"Failed to parse JSON: {e}")
|
|
logger.error(f"Cleaned content: {content_text[:100]}...")
|
|
return []
|
|
|
|
async def scrape_json(self, research_strategy: Dict[str, List]) -> List[Dict]:
|
|
"""
|
|
Scrapes web content based on a research strategy JSON.
|
|
|
|
Args:
|
|
research_strategy: A dictionary containing:
|
|
- 'skey': List of search keywords
|
|
- 'url': List of direct URLs to scrape
|
|
|
|
Returns:
|
|
Dictionary with URLs as keys and scraped content as values
|
|
"""
|
|
|
|
logger.info("Starting JSON-based web scraping")
|
|
results = []
|
|
|
|
# Validate input structure
|
|
if not isinstance(research_strategy, dict):
|
|
logger.error("Invalid research_strategy format: not a dictionary")
|
|
return {"error": "Invalid research_strategy format: not a dictionary"}
|
|
|
|
keys = research_strategy.get("skey", [])
|
|
direct_urls = research_strategy.get("url", [])
|
|
|
|
if not isinstance(keys, list) or not isinstance(direct_urls, list):
|
|
logger.error("Invalid research_strategy format: keys, or url is not a list")
|
|
return {"error": "Invalid research_strategy format: keys, or url is not a list"}
|
|
|
|
# Process search keywords through search engine
|
|
for keyword in keys:
|
|
logger.info(f"Processing keyword: {keyword}")
|
|
found_results = self.search_web(keyword) # List with Dict: title,url,snippet,data
|
|
logger.info(f"... {len(found_results)} results found")
|
|
results.extend(found_results)
|
|
|
|
# Process direct URLs
|
|
logger.info(f"Processing {len(direct_urls)} direct URLs")
|
|
for url in direct_urls:
|
|
if url in results:
|
|
logger.info(f"Skipping already scraped URL: {url}")
|
|
continue
|
|
soup = self.read_url(url)
|
|
|
|
# Extract title from the page if it exists
|
|
if isinstance(soup, BeautifulSoup):
|
|
title_tag = soup.find('title')
|
|
title = title_tag.text.strip() if title_tag else "No title"
|
|
|
|
# Alternative: You could also look for h1 tags if the title tag is missing
|
|
if title == "No title":
|
|
h1_tag = soup.find('h1')
|
|
if h1_tag:
|
|
title = h1_tag.text.strip()
|
|
else:
|
|
# Handle the case where soup is an error message string
|
|
title = "Error fetching page"
|
|
|
|
results.append(self.parse_result(soup, title, url))
|
|
logger.info(f"JSON scraping completed. Scraped {len(results)} URLs in total")
|
|
return results
|
|
|
|
def extract_main_content(self, soup: BeautifulSoup, max_chars: int = int(APP_CONFIG("Connector_AiWebscraping_MAX_TOKENS"))) -> str:
|
|
"""
|
|
Extract the main content from an HTML page while limiting character count.
|
|
|
|
Args:
|
|
soup: BeautifulSoup object containing the page content
|
|
max_chars: Maximum number of characters to extract
|
|
|
|
Returns:
|
|
Extracted main content as string
|
|
"""
|
|
if not isinstance(soup, BeautifulSoup):
|
|
return str(soup)[:max_chars]
|
|
|
|
# Try to find main content elements in order of priority
|
|
main_content = None
|
|
for selector in ['main', 'article', '#content', '.content', '#main', '.main']:
|
|
content = soup.select_one(selector)
|
|
if content:
|
|
main_content = content
|
|
break
|
|
|
|
# If no main content found, use the body
|
|
if not main_content:
|
|
main_content = soup.find('body') or soup
|
|
|
|
# Remove script, style, nav, footer elements that don't contribute to main content
|
|
for element in main_content.select('script, style, nav, footer, header, aside, .sidebar, #sidebar, .comments, #comments, .advertisement, .ads, iframe'):
|
|
element.extract()
|
|
|
|
# Extract text content
|
|
text_content = main_content.get_text(separator=' ', strip=True)
|
|
|
|
# Limit to max_chars
|
|
return text_content[:max_chars]
|
|
|
|
def tokenize_for_counting(self, text: str) -> List[str]:
|
|
"""
|
|
Simple token counter for estimating token usage.
|
|
This is an approximation since the exact tokenization depends on the model.
|
|
|
|
Args:
|
|
text: Input text
|
|
|
|
Returns:
|
|
List of tokens
|
|
"""
|
|
# Simple tokenization by splitting on whitespace and punctuation
|
|
import re
|
|
return re.findall(r'\w+|[^\w\s]', text)
|
|
|
|
def count_tokens(self, text: str) -> int:
|
|
"""
|
|
Count the approximate number of tokens in a text.
|
|
|
|
Args:
|
|
text: Input text
|
|
|
|
Returns:
|
|
Estimated token count
|
|
"""
|
|
tokens = self.tokenize_for_counting(text)
|
|
return len(tokens)
|
|
|
|
def limit_text_for_api(self, text: str, max_tokens: int = int(APP_CONFIG.get("Connector_AiWebscraping_MAX_TOKENS"))) -> str:
|
|
"""
|
|
Limit the text to a maximum number of tokens.
|
|
|
|
Args:
|
|
text: Input text
|
|
max_tokens: Maximum number of tokens allowed
|
|
|
|
Returns:
|
|
Limited text
|
|
"""
|
|
if not text:
|
|
return ""
|
|
|
|
tokens = self.tokenize_for_counting(text)
|
|
|
|
# If text is already under the limit, return as is
|
|
if len(tokens) <= max_tokens:
|
|
return text
|
|
|
|
# Otherwise, truncate text to max_tokens
|
|
return " ".join(tokens[:max_tokens]) + "... [content truncated due to length]"
|
|
|
|
def search_web(self, query: str) -> List[Dict]:
    """
    Run a web search and collect result metadata plus page content.

    Each hit's title, URL and snippet are taken from the search
    results page; the target page itself is then fetched and its main
    content extracted into the 'data' field.

    Args:
        query: Free-text search query.

    Returns:
        List of dicts with keys 'title', 'url', 'snippet' and 'data',
        capped at self.max_result entries. Empty list when the search
        yields no results.
    """
    formatted_query = quote_plus(query)
    # Bug fix: APP_CONFIG is used as a mapping everywhere else in this
    # module (APP_CONFIG.get(...)); calling it would raise TypeError.
    url = f"{APP_CONFIG.get('Connector_AiWebscraping_SEARCH_ENGINE')}{formatted_query}"

    search_results_soup = self.read_url(url)
    # select() always returns a (possibly empty) list, never None, so a
    # simple truthiness check on the result list is sufficient.
    result_elements = search_results_soup.select('.result') if search_results_soup else []
    if not result_elements:
        logger.warning(f"No search results found for: {query}")
        return []

    results = []
    for result in result_elements:
        # Extract title
        title_element = result.select_one('.result__a')
        title = title_element.text.strip() if title_element else 'No title'

        # Extract URL (DuckDuckGo uses redirects, need to extract from href param)
        url_element = title_element.get('href') if title_element else ''
        extracted_url = 'No URL'

        if url_element:
            # Extract the actual URL from DuckDuckGo's redirect
            if url_element.startswith('/d.js?q='):
                start = url_element.find('?q=') + 3  # Skip '?q='
                end = url_element.find('&', start) if '&' in url_element[start:] else None
                extracted_url = unquote(url_element[start:end])

                # Make sure the URL has the correct protocol prefix
                if not extracted_url.startswith(('http://', 'https://')):
                    if not extracted_url.startswith('//'):
                        extracted_url = 'https://' + extracted_url
                    else:
                        extracted_url = 'https:' + extracted_url
            else:
                extracted_url = url_element

        # Extract snippet directly from search results page
        snippet_element = result.select_one('.result__snippet')
        snippet = snippet_element.text.strip() if snippet_element else 'No description'

        # Fetch the actual page content for the data field, limited in size.
        target_page_soup = self.read_url(extracted_url)
        content = self.extract_main_content(
            target_page_soup,
            max_chars=int(APP_CONFIG.get("Connector_AiWebscraping_MAX_TOKENS")),
        )

        results.append({
            'title': title,
            'url': extracted_url,
            'snippet': snippet,
            'data': content,
        })

        # Limit the number of results if needed
        if len(results) >= self.max_result:
            break

    return results
|
|
|
|
def read_url(self, url: str) -> BeautifulSoup:
    """
    Reads a URL and returns a BeautifulSoup parser for the content.

    Retries with increasing backoff while the server answers with
    HTTP 202 (accepted, still processing). On any error an empty
    document is returned instead of raising, so callers can treat a
    failed fetch as "no content".

    Args:
        url: The URL to read

    Returns:
        BeautifulSoup object with the content, or an empty document
        for errors
    """
    headers = {
        # Bug fix: APP_CONFIG is a mapping (this same method already
        # uses APP_CONFIG.get for the timeout); calling it would
        # raise TypeError.
        'User-Agent': APP_CONFIG.get("Connector_AiWebscraping_USER_AGENT"),
        'Accept': 'text/html,application/xhtml+xml,application/xml',
        'Accept-Language': 'en-US,en;q=0.9',
    }
    # Read the timeout once instead of on every request.
    timeout = int(APP_CONFIG.get("Connector_AiWebscraping_TIMEOUT"))

    try:
        # Initial request
        response = requests.get(url, headers=headers, timeout=timeout)

        # Polling for status 202
        if response.status_code == 202:
            # Up to 4 retries with increasing intervals
            # (0.5s, 1s, 2s, 5s).
            for wait_time in (0.5, 1.0, 2.0, 5.0):
                time.sleep(wait_time)
                response = requests.get(url, headers=headers, timeout=timeout)

                # If no 202 anymore, then break
                if response.status_code != 202:
                    break

        # For other error statuses, raise (handled below)
        response.raise_for_status()

        # Parse HTML
        return BeautifulSoup(response.text, 'html.parser')

    except Exception as e:
        # Best-effort policy: log and return an empty document so a
        # single bad URL does not abort the whole search run.
        logger.warning(f"Failed to read URL {url}: {e}")
        return BeautifulSoup("<html><body></body></html>", 'html.parser')
|
|
|
|
def parse_result(self, data: BeautifulSoup, title: str, url: str) -> Dict[str, str]:
    """
    Parse a BeautifulSoup object into a result dictionary.

    Args:
        data: BeautifulSoup object containing the page content
        title: Page title
        url: Page URL

    Returns:
        Dictionary with keys 'title', 'url', 'snippet' and 'data'.
        The snippet is always the default placeholder, since no
        search-results snippet is available in this code path.
    """
    # Extract content using the main content extraction method.
    # Bug fix: APP_CONFIG is a mapping elsewhere in this module
    # (APP_CONFIG.get(...)); calling it would raise TypeError.
    content = self.extract_main_content(
        data,
        max_chars=int(APP_CONFIG.get("Connector_AiWebscraping_MAX_TOKENS")),
    )

    return {
        'title': title,
        'url': url,
        'snippet': 'No description',  # Default value
        'data': content,
    }
|
|
|
|
|
|
# Module-level holder for the shared WebcrawlerAgent instance.
_webcrawler_agent: Optional["WebcrawlerAgent"] = None


def get_webcrawler_agent():
    """Return the singleton WebcrawlerAgent, creating it on first use."""
    global _webcrawler_agent
    if _webcrawler_agent is None:
        _webcrawler_agent = WebcrawlerAgent()
    return _webcrawler_agent