"""
WebCrawler agent for researching and retrieving information from the web.

Adapted for the refactored core module.
"""
|
|
|
|
import json
|
|
import logging
|
|
import random
|
|
import time
|
|
import traceback
|
|
from typing import List, Dict, Any, Optional, Union
|
|
import re
|
|
import uuid
|
|
from datetime import datetime
|
|
from urllib.parse import quote_plus, unquote
|
|
|
|
from bs4 import BeautifulSoup
|
|
import requests
|
|
from modules.agentservice_base import BaseAgent
|
|
from connectors.connector_aichat_openai import ChatService
|
|
from modules.agentservice_utils import WorkflowUtils, MessageUtils, LoggingUtils
|
|
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
class WebcrawlerAgent(BaseAgent):
|
|
"""Agent für Web-Recherche und Informationsbeschaffung"""
|
|
|
|
def __init__(self):
|
|
"""Initialisiert den WebCrawler-Agenten"""
|
|
super().__init__()
|
|
self.id = "webcrawler"
|
|
self.name = "Webscraper"
|
|
self.type = "scraper"
|
|
self.description = "Recherchiert Informationen im Web"
|
|
self.capabilities = "web_search,information_retrieval,data_collection,source_verification,content_integration"
|
|
self.result_format = "SearchResults"
|
|
|
|
# Chat-Service initialisieren
|
|
self.chat_service = ChatService()
|
|
|
|
# Utility-Klassen initialisieren
|
|
self.message_utils = MessageUtils()
|
|
|
|
# Web-Crawling-Konfiguration
|
|
self.max_url = 3
|
|
self.max_key = 3
|
|
self.max_result = 3
|
|
self.timeout = 10
|
|
self.headers = {
|
|
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
|
|
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
|
|
'Accept-Language': 'en-US,en;q=0.5',
|
|
'Referer': 'https://www.google.com/',
|
|
'DNT': '1',
|
|
'Connection': 'keep-alive',
|
|
'Upgrade-Insecure-Requests': '1',
|
|
}
|
|
self.max_urls = 10
|
|
self.max_content_length = 100000
|
|
|
|
def get_agent_info(self) -> Dict[str, Any]:
|
|
"""Get agent information for agent registry"""
|
|
return {
|
|
"id": self.id,
|
|
"type": self.type,
|
|
"name": self.name,
|
|
"description": self.description,
|
|
"capabilities": self.capabilities,
|
|
"result_format": self.result_format,
|
|
"metadata": {
|
|
"max_url": self.max_url,
|
|
"max_result": self.max_result,
|
|
"timeout": self.timeout
|
|
}
|
|
}
|
|
|
|
async def process_message(self, message: Dict[str, Any],
|
|
workflow: Dict[str, Any],
|
|
context: Dict[str, Any] = None,
|
|
log_func=None) -> Dict[str, Any]:
|
|
"""
|
|
Verarbeitet eine Nachricht und führt eine Web-Recherche durch.
|
|
|
|
Args:
|
|
message: Die zu verarbeitende Nachricht
|
|
workflow: Der aktuelle Workflow
|
|
context: Zusätzlicher Kontext
|
|
log_func: Funktion für Workflow-Logging
|
|
|
|
Returns:
|
|
Die generierte Antwort mit der Web-Recherche
|
|
"""
|
|
# Initialize logging
|
|
workflow_id = workflow.get("id", "unknown")
|
|
logging_utils = LoggingUtils(workflow_id, log_func)
|
|
logging_utils.info(f"WebcrawlerAgent startet Web-Recherche", "agents")
|
|
|
|
# Create response message
|
|
response = self.message_utils.create_message(workflow_id, role="assistant")
|
|
response["agent_type"] = self.type
|
|
response["agent_name"] = self.name
|
|
response["parent_message_id"] = message.get("id")
|
|
|
|
try:
|
|
# Get the query from the message
|
|
prompt = await self.get_prompt(message)
|
|
logging_utils.info(f"Web-Recherche für: {prompt[:50]}...", "agents")
|
|
|
|
# Führe die Web-Recherche durch und warte auf das Ergebnis mit await
|
|
web_query_result = await self.get_web_query(message)
|
|
logging_utils.info("Web-Recherche abgeschlossen", "agents")
|
|
|
|
# Set the content in the response
|
|
response["content"] = web_query_result
|
|
|
|
# Finalize the message
|
|
self.message_utils.finalize_message(response)
|
|
response["result_format"] = self.result_format
|
|
|
|
return response
|
|
|
|
except Exception as e:
|
|
error_msg = f"Fehler bei der Web-Recherche: {str(e)}"
|
|
logging_utils.error(error_msg, "error")
|
|
|
|
# Create error response
|
|
response["content"] = f"## Fehler bei der Web-Recherche\n\n{error_msg}\n\n```\n{traceback.format_exc()}\n```"
|
|
self.message_utils.finalize_message(response)
|
|
|
|
return response
|
|
|
|
async def get_prompt(self, message_context: Dict[str, Any]) -> str:
|
|
task = message_context.get("content", "")
|
|
return task.strip()
|
|
|
|
async def get_web_query(self, message_context: Dict[str, Any]) -> str:
|
|
prompt = await self.get_prompt(message_context)
|
|
result_json = await self.run_web_query(prompt)
|
|
result_data = ""
|
|
summary_src = ""
|
|
|
|
logger.info(f"Web analysis prompt '{prompt}' delivers {len(result_json)} results.")
|
|
if isinstance(result_json, list):
|
|
for i, result in enumerate(result_json, 1):
|
|
|
|
web_answer_instructions = f"""
|
|
Fass das Resultat gemäss dem Auftrag zusammen in maximal rund 2000 Zeichen. Auftrag = '{prompt.replace("'","")}'
|
|
Fasse die wichtigsten Erkenntnisse zusammen und setze sie in Bezug zur ursprünglichen Anfrage. Die Einleitung kannst Du weglassen.
|
|
Achte darauf, nur relevante und qualitativ hochwertige Informationen zu extrahieren, welche einen Bezug zum Auftrag haben, und übersichtlich zu präsentieren. Vermittle ein ausgewogenes Bild der recherchierten Informationen.
|
|
|
|
Dies ist das Resultat:
|
|
{result['data']}
|
|
"""
|
|
|
|
# Zusätzliche Anweisungen für Web-Recherche
|
|
content_text = await self.chat_service.call_api(
|
|
messages=[
|
|
{
|
|
"role": "system",
|
|
"content": "Du bist ein Informationsanalyst, der Webinhalte präzise und relevant zusammenfasst."
|
|
},
|
|
{
|
|
"role": "user",
|
|
"content": web_answer_instructions
|
|
}
|
|
]
|
|
)
|
|
result_data += f"\n\n[{i}] {result['title']}\nURL: {result['url']}\nSnippet: {result['snippet']}\nContent: {content_text}"
|
|
summary_src+=f"\n{content_text}"
|
|
else:
|
|
result_data = "no data received"
|
|
|
|
logger.info(f"Web analysis result sent {len(result_data)}B")
|
|
|
|
# Zusätzliche Zusammenfassung
|
|
summary=""
|
|
if len(summary_src)>1:
|
|
summary = await self.chat_service.call_api(
|
|
messages=[
|
|
{
|
|
"role": "system",
|
|
"content": "Du erstellst prägnante Zusammenfassungen von Rechercheergbnissen."
|
|
},
|
|
{
|
|
"role": "user",
|
|
"content": f"Bitte fasse diese Erkenntnisse in maximal 5-6 Sätzen zusammen: {summary_src}\n"
|
|
}
|
|
]
|
|
)
|
|
|
|
# Format the final result
|
|
result = f"## Web-Recherche Ergebnisse\n\n### Zusammenfassung\n{summary}\n\n### Detaillierte Ergebnisse{result_data}"
|
|
return result
|
|
|
|
async def run_web_query(self, prompt: str) -> List[Dict]:
|
|
if prompt=="":
|
|
return []
|
|
|
|
ptext=f"""Create a comprehensive web research strategy for the task = '{prompt.replace("'","")}'. Return the results as a Python dictionary with these specific keys. If specific url are provided and the task requires analysis only on the provided url, then leave 'skey' open.
|
|
|
|
'url': A list of maximum {self.max_url} specific URLs extracted from the task string.
|
|
|
|
'skey': A list of maximum {self.max_key} key sentences to search for on the web. These should be precise, diverse, and targeted to get the most relevant information.
|
|
|
|
Format your response as a valid json object with these two keys. Do not include any explanatory text or markdown outside of the object definition.
|
|
"""
|
|
|
|
content_text = await self.chat_service.call_api(
|
|
messages=[
|
|
{
|
|
"role": "system",
|
|
"content": "Du bist ein Webrecherche-Experte, der präzise Suchstrategien entwickelt."
|
|
},
|
|
{
|
|
"role": "user",
|
|
"content": ptext
|
|
}
|
|
]
|
|
)
|
|
# Remove markdown formatting if present
|
|
if content_text.startswith("```json"):
|
|
# Find the end of the JSON block
|
|
end_marker = "```"
|
|
end_index = content_text.rfind(end_marker)
|
|
if end_index != -1:
|
|
# Extract the JSON content without the markdown markers
|
|
content_text = content_text[7:end_index].strip()
|
|
|
|
# Now parse the JSON
|
|
try:
|
|
logger.info(f"Valid json received: {str(content_text)}")
|
|
pjson = json.loads(content_text)
|
|
# Now call scrape_json with the parsed dictionary
|
|
result_json = await self.scrape_json(pjson)
|
|
return result_json
|
|
except json.JSONDecodeError as e:
|
|
logger.error(f"Failed to parse JSON: {e}")
|
|
logger.error(f"Cleaned content: {content_text[:100]}...")
|
|
return []
|
|
|
|
async def scrape_json(self, research_strategy: Dict[str, List]) -> List[Dict]:
|
|
"""
|
|
Scrapes web content based on a research strategy JSON.
|
|
|
|
Args:
|
|
research_strategy: A dictionary containing:
|
|
- 'skey': List of search keywords
|
|
- 'url': List of direct URLs to scrape
|
|
|
|
Returns:
|
|
Dictionary with URLs as keys and scraped content as values
|
|
"""
|
|
|
|
logger.info("Starting JSON-based web scraping")
|
|
results = []
|
|
|
|
# Validate input structure
|
|
if not isinstance(research_strategy, dict):
|
|
logger.error("Invalid research_strategy format: not a dictionary")
|
|
return {"error": "Invalid research_strategy format: not a dictionary"}
|
|
|
|
keys = research_strategy.get("skey", [])
|
|
direct_urls = research_strategy.get("url", [])
|
|
|
|
if not isinstance(keys, list) or not isinstance(direct_urls, list):
|
|
logger.error("Invalid research_strategy format: keys, or url is not a list")
|
|
return {"error": "Invalid research_strategy format: keys, or url is not a list"}
|
|
|
|
# Process search keywords through search engine
|
|
for keyword in keys:
|
|
logger.info(f"Processing keyword: {keyword}")
|
|
found_results = self.search_web(keyword) # List with Dict: title,url,snippet,data
|
|
logger.info(f"... {len(found_results)} results found")
|
|
results.extend(found_results)
|
|
|
|
# Process direct URLs
|
|
logger.info(f"Processing {len(direct_urls)} direct URLs")
|
|
for url in direct_urls:
|
|
if url in results:
|
|
logger.info(f"Skipping already scraped URL: {url}")
|
|
continue
|
|
soup=self.read_url(url)
|
|
|
|
# Extract title from the page if it exists
|
|
if isinstance(soup, BeautifulSoup):
|
|
title_tag = soup.find('title')
|
|
title = title_tag.text.strip() if title_tag else "No title"
|
|
|
|
# Alternative: You could also look for h1 tags if the title tag is missing
|
|
if title == "No title":
|
|
h1_tag = soup.find('h1')
|
|
if h1_tag:
|
|
title = h1_tag.text.strip()
|
|
else:
|
|
# Handle the case where soup is an error message string
|
|
title = "Error fetching page"
|
|
|
|
results.append(self.parse_result(soup,"No title",url))
|
|
logger.info(f"JSON scraping completed. Scraped {len(results)} URLs in total")
|
|
return results
|
|
|
|
    def search_web(self, query: str) -> List[Dict]:
        """
        Search DuckDuckGo's HTML endpoint for *query* and scrape each hit.

        Returns up to ``self.max_result`` dicts with keys 'title', 'url',
        'snippet' and 'data' (the fetched target page's HTML, or an error
        placeholder when the fetch failed).

        NOTE(review): the redirect unwrapping below assumes DuckDuckGo hrefs
        of the form '/d.js?q=...' and '.result'/'.result__a'/'.result__snippet'
        CSS classes — confirm against the current markup if results come back
        empty.
        """
        formatted_query = quote_plus(query)
        url = f"https://html.duckduckgo.com/html/?q={formatted_query}"

        search_results_soup = self.read_url(url)
        # read_url always returns a BeautifulSoup object and .select() always
        # returns a list, so the None checks here are belt-and-braces.
        if not search_results_soup or search_results_soup.select('.result') is None or len(search_results_soup.select('.result')) == 0:
            logger.warning(f"Keine Suchergebnisse gefunden für: {query}")
            return []

        # Extracted search results
        results = []

        # All result containers on the search page
        result_elements = search_results_soup.select('.result')

        for result in result_elements:
            # Result title (link text)
            title_element = result.select_one('.result__a')
            title = title_element.text.strip() if title_element else 'No title'

            # Target URL (DuckDuckGo wraps it in a redirect; extract from the href param)
            url_element = title_element.get('href') if title_element else ''
            extracted_url = 'No URL'

            if url_element:
                # Unwrap DuckDuckGo's redirect: take the '?q=' query parameter value
                if url_element.startswith('/d.js?q='):
                    start = url_element.find('?q=') + 3  # Skip '?q='
                    end = url_element.find('&', start) if '&' in url_element[start:] else None
                    extracted_url = unquote(url_element[start:end])

                    # Ensure the unwrapped URL carries a protocol prefix
                    if not extracted_url.startswith(('http://', 'https://')):
                        if not extracted_url.startswith('//'):
                            extracted_url = 'https://' + extracted_url
                        else:
                            # Protocol-relative URL: just prepend the scheme
                            extracted_url = 'https:' + extracted_url
                else:
                    # Direct (non-redirect) href: use as-is
                    extracted_url = url_element

            # Snippet straight from the search-results page
            snippet_element = result.select_one('.result__snippet')
            snippet = snippet_element.text.strip() if snippet_element else 'No description'

            # Fetch the actual target page for the 'data' field
            target_page_soup = self.read_url(extracted_url)

            results.append({
                'title': title,
                'url': extracted_url,
                'snippet': snippet,
                'data': str(target_page_soup) if isinstance(target_page_soup, BeautifulSoup) else "Error fetching page"
            })

            # Stop once the configured result cap is reached
            if len(results) >= self.max_result:
                break

        return results
|
|
|
|
def read_url(self, url: str) -> BeautifulSoup:
|
|
"""
|
|
Liest eine URL und gibt einen BeautifulSoup-Parser für den Inhalt zurück.
|
|
Bei Fehlern wird ein leeres BeautifulSoup-Objekt zurückgegeben.
|
|
|
|
Args:
|
|
url: Die zu lesende URL
|
|
|
|
Returns:
|
|
BeautifulSoup-Objekt mit dem Inhalt oder leer bei Fehlern
|
|
"""
|
|
headers = {
|
|
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
|
|
'Accept': 'text/html,application/xhtml+xml,application/xml',
|
|
'Accept-Language': 'en-US,en;q=0.9',
|
|
}
|
|
|
|
try:
|
|
import time
|
|
|
|
# Initialer Request
|
|
response = requests.get(url, headers=headers, timeout=10)
|
|
|
|
# Polling für Status 202
|
|
if response.status_code == 202:
|
|
# Maximal 3 Versuche mit steigenden Intervallen
|
|
backoff_times = [0.5, 1.0, 2.0, 5.0] # 0.5s, dann 1s, dann 2s
|
|
|
|
for wait_time in backoff_times:
|
|
time.sleep(wait_time) # Warten mit steigender Zeit
|
|
response = requests.get(url, headers=headers, timeout=10)
|
|
|
|
# Wenn kein 202 mehr, dann abbrechen
|
|
if response.status_code != 202:
|
|
break
|
|
|
|
# Für andere Fehler-Status einen Fehler auslösen
|
|
response.raise_for_status()
|
|
|
|
# HTML parsen
|
|
return BeautifulSoup(response.text, 'html.parser')
|
|
|
|
except Exception as e:
|
|
# Leeres BeautifulSoup-Objekt erstellen
|
|
return BeautifulSoup("<html><body></body></html>", 'html.parser')
|
|
|
|
def parse_result(self, data: BeautifulSoup, title: str, url: str) -> Dict[str, str]:
|
|
# Extract snippet/description
|
|
snippet_element = data.select_one('.result__snippet')
|
|
snippet = snippet_element.text.strip() if snippet_element else 'No description'
|
|
|
|
result={
|
|
'title': title,
|
|
'url': url,
|
|
'snippet': snippet,
|
|
'data': data.prettify()
|
|
}
|
|
return result
|
|
|
|
# Singleton-Instanz
|
|
# Module-level singleton holder for the shared WebcrawlerAgent instance
_webcrawler_agent = None


def get_webcrawler_agent():
    """Return the lazily created singleton instance of the WebCrawler agent."""
    global _webcrawler_agent
    # Create on first access; all later calls reuse the same instance
    if _webcrawler_agent is None:
        _webcrawler_agent = WebcrawlerAgent()
    return _webcrawler_agent