"""
|
|
WebCrawler-Agent für die Recherche und Beschaffung von Informationen aus dem Web.
|
|
Angepasst für das refaktorisierte Core-Modul.
|
|
"""
|
|
|
|
import json
|
|
import logging
|
|
import random
|
|
import time
|
|
import traceback
|
|
from typing import List, Dict, Any, Optional, Union
|
|
import re
|
|
import uuid
|
|
from datetime import datetime
|
|
from urllib.parse import quote_plus, unquote
|
|
|
|
from bs4 import BeautifulSoup
|
|
import requests
|
|
from modules.agentservice_base import BaseAgent
|
|
from connectors.connector_aichat_openai import ChatService
|
|
from modules.agentservice_utils import WorkflowUtils, MessageUtils, LoggingUtils
|
|
from modules.agentservice_protocol import AgentMessage, AgentCommunicationProtocol
|
|
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
class WebcrawlerAgent(BaseAgent):
    """Agent for web research and information retrieval"""

    def __init__(self):
        """Initializes the WebCrawler agent"""
        super().__init__()
        self.id = "webcrawler"
        self.name = "Webscraper"
        self.type = "scraper"
        self.description = "Recherchiert Informationen im Web"
        self.capabilities = "web_search,information_retrieval,data_collection,source_verification,content_integration"
        self.result_format = "SearchResults"

        # Enhanced document capabilities
        self.supports_documents = True
        self.document_capabilities = ["read", "create"]
        self.required_context = ["workflow_id"]
        self.document_handler = None

        # Initialize protocol
        self.protocol = AgentCommunicationProtocol()

        # Initialize the chat service
        self.chat_service = ChatService()

        # Initialize utility classes
        self.message_utils = MessageUtils()

        # Web crawling configuration
        self.max_url = 3
        self.max_key = 3
        self.max_result = 3
        self.timeout = 10
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
            'Referer': 'https://www.google.com/',
            'DNT': '1',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
        }
        self.max_urls = 10
        self.max_content_length = 100000

    def get_agent_info(self) -> Dict[str, Any]:
        """Get agent information for agent registry"""
        info = super().get_agent_info()
        info.update({
            "metadata": {
                "max_url": self.max_url,
                "max_result": self.max_result,
                "timeout": self.timeout
            }
        })
        return info

    def set_document_handler(self, document_handler):
        """Set the document handler for file operations"""
        self.document_handler = document_handler

    async def process_message(self, message: Dict[str, Any], context: Dict[str, Any] = None) -> Dict[str, Any]:
        """
        Processes a message and performs a web search.

        Args:
            message: The message to process
            context: Additional context

        Returns:
            The generated response containing the web research results
        """
        # Extract workflow_id from context or message
        workflow_id = context.get("workflow_id") if context else message.get("workflow_id", "unknown")

        # Get or create logging_utils
        log_func = context.get("log_func") if context else None
        logging_utils = LoggingUtils(workflow_id, log_func)

        # Send status update using protocol
        if log_func:
            status_message = self.protocol.create_status_update_message(
                status_description="Starte Web-Recherche",
                sender_id=self.id,
                status="in_progress",
                progress=0.0,
                context_id=workflow_id
            )
            log_func(workflow_id, status_message.content, "info", self.id, self.name)

        # Create response structure
        response = {
            "role": "assistant",
            "content": "",
            "agent_id": self.id,
            "agent_type": self.type,
            "agent_name": self.name,
            "result_format": self.result_format,
            "workflow_id": workflow_id
        }

        try:
            # Get the query from the message
            prompt = await self.get_prompt(message)
            logging_utils.info(f"Web-Recherche für: {prompt[:50]}...", "agents")

            # Update progress using protocol
            if log_func:
                status_message = self.protocol.create_status_update_message(
                    status_description=f"Recherchiere: {prompt[:30]}...",
                    sender_id=self.id,
                    status="in_progress",
                    progress=0.3,
                    context_id=workflow_id
                )
                log_func(workflow_id, status_message.content, "info", self.id, self.name)

            # Perform the web research
            web_query_result = await self.get_web_query(message)

            # Final status update
            if log_func:
                status_message = self.protocol.create_status_update_message(
                    status_description="Web-Recherche abgeschlossen",
                    sender_id=self.id,
                    status="completed",
                    progress=1.0,
                    context_id=workflow_id
                )
                log_func(workflow_id, status_message.content, "info", self.id, self.name)

            # Set the content in the response
            response["content"] = web_query_result

            return response

        except Exception as e:
            error_msg = f"Fehler bei der Web-Recherche: {str(e)}"
            logging_utils.error(error_msg, "error")

            # Create error response using protocol
            error_message = self.protocol.create_error_message(
                error_description=error_msg,
                sender_id=self.id,
                error_type="web_search",
                error_details={"traceback": traceback.format_exc()},
                context_id=workflow_id
            )

            response["content"] = f"## Fehler bei der Web-Recherche\n\n{error_msg}\n\n```\n{traceback.format_exc()}\n```"

            return response

    def send_document_request(self, document_description: str, sender_id: str, receiver_id: str, filters: Dict[str, Any] = None, context_id: str = None) -> AgentMessage:
        """Send a document request using the protocol"""
        return self.protocol.create_document_request_message(
            document_description=document_description,
            sender_id=sender_id,
            receiver_id=receiver_id,
            filters=filters,
            context_id=context_id
        )

    def send_result_message(self, result_content: str, sender_id: str, receiver_id: str, task_id: str,
                            output_data: Dict[str, Any] = None, context_id: str = None) -> AgentMessage:
        """Send a result message using the protocol"""
        return self.protocol.create_result_message(
            result_content=result_content,
            sender_id=sender_id,
            receiver_id=receiver_id,
            task_id=task_id,
            output_data=output_data,
            result_format="SearchResults",
            context_id=context_id
        )

    async def get_prompt(self, message_context: Dict[str, Any]) -> str:
        """Extracts the research task from the message content."""
        task = message_context.get("content", "")
        return task.strip()

    async def get_web_query(self, message_context: Dict[str, Any]) -> str:
        """Runs the web research for the given message and formats the results as Markdown."""
        prompt = await self.get_prompt(message_context)
        result_json = await self.run_web_query(prompt)
        result_data = ""
        summary_src = ""

        logger.info(f"Web analysis prompt '{prompt}' delivers {len(result_json)} results.")
        if isinstance(result_json, list):
            total_tokens = 0

            for i, result in enumerate(result_json, 1):
                # Limit content size for each result (allow ~15000 tokens per result)
                result_data_limited = self.limit_text_for_api(result['data'], max_tokens=15000)

                web_answer_instructions = f"""
Fass das Resultat gemäss dem Auftrag zusammen in maximal rund 2000 Zeichen. Auftrag = '{prompt.replace("'","")}'
Fasse die wichtigsten Erkenntnisse zusammen und setze sie in Bezug zur ursprünglichen Anfrage. Die Einleitung kannst Du weglassen.
Achte darauf, nur relevante und qualitativ hochwertige Informationen zu extrahieren, welche einen Bezug zum Auftrag haben, und übersichtlich zu präsentieren. Vermittle ein ausgewogenes Bild der recherchierten Informationen.

Dies ist das Resultat:
{result_data_limited}
"""

                # Count tokens in the instructions to ensure we don't exceed API limits
                instruction_tokens = self.count_tokens(web_answer_instructions)
                if total_tokens + instruction_tokens > 60000:
                    logger.warning(f"Skipping result {i} to avoid exceeding token limit")
                    break

                total_tokens += instruction_tokens

                # Summarize this result via the chat service
                content_text = await self.chat_service.call_api(
                    messages=[
                        {
                            "role": "system",
                            "content": "Du bist ein Informationsanalyst, der Webinhalte präzise und relevant zusammenfasst."
                        },
                        {
                            "role": "user",
                            "content": web_answer_instructions
                        }
                    ]
                )

                # Create a summary but ensure we stay within token limits
                content_summary = content_text[:2000]  # Limit to ~2000 characters
                result_data += f"\n\n[{i}] {result['title']}\nURL: {result['url']}\nSnippet: {result['snippet']}\nContent: {content_summary}"
                summary_src += f"\n{content_summary}"

                # Update token count
                total_tokens += self.count_tokens(content_summary) + 100  # Add buffer for formatting
        else:
            result_data = "no data received"

        logger.info(f"Web analysis result sent {len(result_data)}B")

        # Additional overall summary
        summary = ""
        if len(summary_src) > 1:
            # Limit summary source to ensure we don't exceed API limits
            summary_src_limited = self.limit_text_for_api(summary_src, max_tokens=10000)

            summary = await self.chat_service.call_api(
                messages=[
                    {
                        "role": "system",
                        "content": "Du erstellst prägnante Zusammenfassungen von Rechercheergebnissen."
                    },
                    {
                        "role": "user",
                        "content": f"Bitte fasse diese Erkenntnisse in maximal 5-6 Sätzen zusammen: {summary_src_limited}\n"
                    }
                ]
            )

        # Format the final result
        result = f"## Web-Recherche Ergebnisse\n\n### Zusammenfassung\n{summary}\n\n### Detaillierte Ergebnisse{result_data}"
        return result

    async def run_web_query(self, prompt: str) -> List[Dict]:
        """Asks the chat service for a research strategy (URLs and search phrases) for the prompt and scrapes the web accordingly."""
        if prompt == "":
            return []

        ptext = f"""Create a comprehensive web research strategy for the task = '{prompt.replace("'","")}'. Return the results as a Python dictionary with these specific keys. If specific URLs are provided and the task requires analysis only of the provided URLs, then leave 'skey' empty.

'url': A list of maximum {self.max_url} specific URLs extracted from the task string.

'skey': A list of maximum {self.max_key} key sentences to search for on the web. These should be precise, diverse, and targeted to get the most relevant information.

Format your response as a valid JSON object with these two keys. Do not include any explanatory text or markdown outside of the object definition.
"""

        content_text = await self.chat_service.call_api(
            messages=[
                {
                    "role": "system",
                    "content": "Du bist ein Webrecherche-Experte, der präzise Suchstrategien entwickelt."
                },
                {
                    "role": "user",
                    "content": ptext
                }
            ]
        )
        # Remove markdown formatting if present
        if content_text.startswith("```json"):
            # Find the end of the JSON block
            end_marker = "```"
            end_index = content_text.rfind(end_marker)
            if end_index != -1:
                # Extract the JSON content without the markdown markers
                content_text = content_text[7:end_index].strip()

        # Now parse the JSON
        try:
            logger.info(f"Valid json received: {str(content_text)}")
            pjson = json.loads(content_text)
            # Call scrape_json with the parsed dictionary
            result_json = await self.scrape_json(pjson)
            return result_json
        except json.JSONDecodeError as e:
            logger.error(f"Failed to parse JSON: {e}")
            logger.error(f"Cleaned content: {content_text[:100]}...")
            return []

    async def scrape_json(self, research_strategy: Dict[str, List]) -> List[Dict]:
        """
        Scrapes web content based on a research strategy.

        Args:
            research_strategy: A dictionary containing:
                - 'skey': List of search keywords
                - 'url': List of direct URLs to scrape

        Returns:
            A list of result dictionaries with the keys 'title', 'url', 'snippet' and 'data'.
        """
        logger.info("Starting JSON-based web scraping")
        results = []

        # Validate input structure
        if not isinstance(research_strategy, dict):
            logger.error("Invalid research_strategy format: not a dictionary")
            return []

        keys = research_strategy.get("skey", [])
        direct_urls = research_strategy.get("url", [])

        if not isinstance(keys, list) or not isinstance(direct_urls, list):
            logger.error("Invalid research_strategy format: 'skey' or 'url' is not a list")
            return []

        # Process search keywords through the search engine
        for keyword in keys:
            logger.info(f"Processing keyword: {keyword}")
            found_results = self.search_web(keyword)  # List of dicts: title, url, snippet, data
            logger.info(f"... {len(found_results)} results found")
            results.extend(found_results)

        # Process direct URLs
        logger.info(f"Processing {len(direct_urls)} direct URLs")
        scraped_urls = {result.get('url') for result in results}
        for url in direct_urls:
            if url in scraped_urls:
                logger.info(f"Skipping already scraped URL: {url}")
                continue
            soup = self.read_url(url)

            # Extract the title from the page if it exists
            if isinstance(soup, BeautifulSoup):
                title_tag = soup.find('title')
                title = title_tag.text.strip() if title_tag else "No title"

                # Fall back to the first h1 tag if the title tag is missing
                if title == "No title":
                    h1_tag = soup.find('h1')
                    if h1_tag:
                        title = h1_tag.text.strip()
            else:
                # Handle the case where soup is an error message string
                title = "Error fetching page"

            results.append(self.parse_result(soup, title, url))
        logger.info(f"JSON scraping completed. Scraped {len(results)} URLs in total")
        return results

    def extract_main_content(self, soup: BeautifulSoup, max_chars: int = 30000) -> str:
        """
        Extract the main content from an HTML page while limiting character count.

        Args:
            soup: BeautifulSoup object containing the page content
            max_chars: Maximum number of characters to extract

        Returns:
            Extracted main content as string
        """
        if not isinstance(soup, BeautifulSoup):
            return str(soup)[:max_chars]

        # Try to find main content elements in order of priority
        main_content = None
        for selector in ['main', 'article', '#content', '.content', '#main', '.main']:
            content = soup.select_one(selector)
            if content:
                main_content = content
                break

        # If no main content found, use the body
        if not main_content:
            main_content = soup.find('body') or soup

        # Remove script, style, nav, footer elements that don't contribute to the main content
        for element in main_content.select('script, style, nav, footer, header, aside, .sidebar, #sidebar, .comments, #comments, .advertisement, .ads, iframe'):
            element.extract()

        # Extract text content
        text_content = main_content.get_text(separator=' ', strip=True)

        # Limit to max_chars
        return text_content[:max_chars]

    def tokenize_for_counting(self, text: str) -> List[str]:
        """
        Simple tokenizer for estimating token usage.
        This is an approximation since the exact tokenization depends on the model.

        Args:
            text: Input text

        Returns:
            List of tokens
        """
        # Simple tokenization by splitting on whitespace and punctuation
        return re.findall(r'\w+|[^\w\s]', text)

    def count_tokens(self, text: str) -> int:
        """
        Count the approximate number of tokens in a text.

        Args:
            text: Input text

        Returns:
            Estimated token count
        """
        tokens = self.tokenize_for_counting(text)
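        # Worked example of the approximation (regex-based, not a model tokenizer):
        #   tokenize_for_counting("Hello, world!") -> ["Hello", ",", "world", "!"], i.e. 4 tokens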
        return len(tokens)

    def limit_text_for_api(self, text: str, max_tokens: int = 60000) -> str:
        """
        Limit the text to a maximum number of tokens.

        Args:
            text: Input text
            max_tokens: Maximum number of tokens allowed

        Returns:
            Limited text
        """
        if not text:
            return ""

        tokens = self.tokenize_for_counting(text)

        # If the text is already under the limit, return it as is
        if len(tokens) <= max_tokens:
            return text

        # Otherwise, truncate the text to max_tokens
        return " ".join(tokens[:max_tokens]) + "... [content truncated due to length]"

    def search_web(self, query: str) -> List[Dict]:
        """Searches the DuckDuckGo HTML endpoint for the query and scrapes the content of the top results."""
        formatted_query = quote_plus(query)
        url = f"https://html.duckduckgo.com/html/?q={formatted_query}"

        search_results_soup = self.read_url(url)
        if not search_results_soup or not search_results_soup.select('.result'):
            logger.warning(f"Keine Suchergebnisse gefunden für: {query}")
            return []

        # Extract search results
        results = []

        # Find all result containers
        result_elements = search_results_soup.select('.result')

        for result in result_elements:
            # Extract title
            title_element = result.select_one('.result__a')
            title = title_element.text.strip() if title_element else 'No title'

            # Extract URL (DuckDuckGo uses redirects, so the target URL has to be taken from the href parameter)
            url_element = title_element.get('href') if title_element else ''
            extracted_url = 'No URL'

            if url_element:
                # Extract the actual URL from DuckDuckGo's redirect
                if url_element.startswith('/d.js?q='):
                    start = url_element.find('?q=') + 3  # Skip '?q='
                    end = url_element.find('&', start) if '&' in url_element[start:] else None
                    extracted_url = unquote(url_element[start:end])

                    # Make sure the URL has the correct protocol prefix
                    if not extracted_url.startswith(('http://', 'https://')):
                        if not extracted_url.startswith('//'):
                            extracted_url = 'https://' + extracted_url
                        else:
                            extracted_url = 'https:' + extracted_url
                else:
                    extracted_url = url_element

            # Extract snippet directly from the search results page
            snippet_element = result.select_one('.result__snippet')
            snippet = snippet_element.text.strip() if snippet_element else 'No description'

            # Now fetch the actual page content for the data field
            target_page_soup = self.read_url(extracted_url)

            # Use the content extraction method to limit content size
            content = self.extract_main_content(target_page_soup, max_chars=30000)

            results.append({
                'title': title,
                'url': extracted_url,
                'snippet': snippet,
                'data': content
            })

            # Limit the number of results if needed
            if len(results) >= self.max_result:
                break

        return results

    def read_url(self, url: str) -> BeautifulSoup:
        """
        Reads a URL and returns a BeautifulSoup parser for its content.
        On errors, an empty BeautifulSoup object is returned.

        Args:
            url: The URL to read

        Returns:
            BeautifulSoup object with the content, or an empty one on errors
        """
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml',
            'Accept-Language': 'en-US,en;q=0.9',
        }

        try:
            # Initial request
            response = requests.get(url, headers=headers, timeout=10)

            # Poll while the server responds with status 202
            if response.status_code == 202:
                # Up to four retries with increasing intervals
                backoff_times = [0.5, 1.0, 2.0, 5.0]

                for wait_time in backoff_times:
                    time.sleep(wait_time)  # Wait with an increasing delay
                    response = requests.get(url, headers=headers, timeout=10)

                    # Stop polling as soon as the status is no longer 202
                    if response.status_code != 202:
                        break

            # Raise an error for other failure status codes
            response.raise_for_status()

            # Parse the HTML
            return BeautifulSoup(response.text, 'html.parser')

        except Exception:
            # Return an empty BeautifulSoup object on any error
            return BeautifulSoup("<html><body></body></html>", 'html.parser')

    def parse_result(self, data: BeautifulSoup, title: str, url: str) -> Dict[str, str]:
        """Builds a result dictionary (title, url, snippet, data) from a parsed page."""
        # Extract snippet/description
        snippet_element = data.select_one('.result__snippet')
        snippet = snippet_element.text.strip() if snippet_element else 'No description'

        result = {
            'title': title,
            'url': url,
            'snippet': snippet,
            'data': data.prettify()
        }
        return result


# Singleton instance
_webcrawler_agent = None


def get_webcrawler_agent():
    """Returns a singleton instance of the WebCrawler agent"""
    global _webcrawler_agent
    if _webcrawler_agent is None:
        _webcrawler_agent = WebcrawlerAgent()
    return _webcrawler_agent
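

# Minimal manual smoke test: a hedged sketch that assumes ChatService credentials
# and the modules.* dependencies are available in this environment. The message
# payload below is illustrative only, not a prescribed schema.
if __name__ == "__main__":
    import asyncio

    async def _demo():
        agent = get_webcrawler_agent()
        message = {"content": "Aktuelle Entwicklungen bei Festkörperbatterien", "workflow_id": "demo"}
        result = await agent.process_message(message, context={"workflow_id": "demo"})
        print(result["content"][:500])

    asyncio.run(_demo())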