# gateway/gwserver/modules/BAKwebcrawler.py
"""
WebCrawler-Agent für die Recherche und Beschaffung von Informationen aus dem Web.
Angepasst für das refaktorisierte Core-Modul.
"""
import json
import logging
import random
import time
import traceback
from typing import List, Dict, Any, Optional, Union
import re
import uuid
from datetime import datetime
from urllib.parse import quote_plus, unquote
from bs4 import BeautifulSoup
import requests
from modules.agentservice_base import BaseAgent
from connectors.connector_aichat_openai import ChatService
from modules.agentservice_utils import WorkflowUtils, MessageUtils, LoggingUtils
from modules.agentservice_protocol import AgentMessage, AgentCommunicationProtocol
logger = logging.getLogger(__name__)
class WebcrawlerAgent(BaseAgent):
"""Agent für Web-Recherche und Informationsbeschaffung"""
def __init__(self):
"""Initialisiert den WebCrawler-Agenten"""
super().__init__()
self.id = "webcrawler"
self.name = "Webscraper"
self.type = "scraper"
self.description = "Recherchiert Informationen im Web"
self.capabilities = "web_search,information_retrieval,data_collection,source_verification,content_integration"
self.result_format = "SearchResults"
# Add enhanced document capabilities
self.supports_documents = True
self.document_capabilities = ["read", "create"]
self.required_context = ["workflow_id"]
self.document_handler = None
# Initialize protocol
self.protocol = AgentCommunicationProtocol()
        # Initialize the chat service
        self.chat_service = ChatService()
        # Initialize utility classes
        self.message_utils = MessageUtils()
        # Web crawling configuration
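        # max_url / max_key bound the research strategy the model may return
        # (direct URLs and search phrases); max_result caps the hits kept per search.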
self.max_url = 3
self.max_key = 3
self.max_result = 3
self.timeout = 10
self.headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
'Accept-Language': 'en-US,en;q=0.5',
'Referer': 'https://www.google.com/',
'DNT': '1',
'Connection': 'keep-alive',
'Upgrade-Insecure-Requests': '1',
}
self.max_urls = 10
self.max_content_length = 100000
def get_agent_info(self) -> Dict[str, Any]:
"""Get agent information for agent registry"""
info = super().get_agent_info()
info.update({
"metadata": {
"max_url": self.max_url,
"max_result": self.max_result,
"timeout": self.timeout
}
})
return info
def set_document_handler(self, document_handler):
"""Set the document handler for file operations"""
self.document_handler = document_handler
async def process_message(self, message: Dict[str, Any], context: Dict[str, Any] = None) -> Dict[str, Any]:
"""
Verarbeitet eine Nachricht und führt eine Web-Recherche durch.
Args:
message: Die zu verarbeitende Nachricht
context: Zusätzlicher Kontext
Returns:
Die generierte Antwort mit der Web-Recherche
"""
# Extract workflow_id from context or message
workflow_id = context.get("workflow_id") if context else message.get("workflow_id", "unknown")
# Get or create logging_utils
log_func = context.get("log_func") if context else None
logging_utils = LoggingUtils(workflow_id, log_func)
# Send status update using protocol
if log_func:
status_message = self.protocol.create_status_update_message(
status_description="Starte Web-Recherche",
sender_id=self.id,
status="in_progress",
progress=0.0,
context_id=workflow_id
)
log_func(workflow_id, status_message.content, "info", self.id, self.name)
# Create response structure
response = {
"role": "assistant",
"content": "",
"agent_id": self.id,
"agent_type": self.type,
"agent_name": self.name,
"result_format": self.result_format,
"workflow_id": workflow_id
}
try:
# Get the query from the message
prompt = await self.get_prompt(message)
logging_utils.info(f"Web-Recherche für: {prompt[:50]}...", "agents")
# Update progress using protocol
if log_func:
status_message = self.protocol.create_status_update_message(
status_description=f"Recherchiere: {prompt[:30]}...",
sender_id=self.id,
status="in_progress",
progress=0.3,
context_id=workflow_id
)
log_func(workflow_id, status_message.content, "info", self.id, self.name)
            # Run the web search
web_query_result = await self.get_web_query(message)
# Final status update
if log_func:
status_message = self.protocol.create_status_update_message(
status_description="Web-Recherche abgeschlossen",
sender_id=self.id,
status="completed",
progress=1.0,
context_id=workflow_id
)
log_func(workflow_id, status_message.content, "info", self.id, self.name)
# Set the content in the response
response["content"] = web_query_result
return response
except Exception as e:
error_msg = f"Fehler bei der Web-Recherche: {str(e)}"
logging_utils.error(error_msg, "error")
# Create error response using protocol
error_message = self.protocol.create_error_message(
error_description=error_msg,
sender_id=self.id,
error_type="web_search",
error_details={"traceback": traceback.format_exc()},
context_id=workflow_id
)
response["content"] = f"## Fehler bei der Web-Recherche\n\n{error_msg}\n\n```\n{traceback.format_exc()}\n```"
return response
def send_document_request(self, document_description: str, sender_id: str, receiver_id: str, filters: Dict[str, Any] = None, context_id: str = None) -> AgentMessage:
"""Send a document request using the protocol"""
return self.protocol.create_document_request_message(
document_description=document_description,
sender_id=sender_id,
receiver_id=receiver_id,
filters=filters,
context_id=context_id
)
def send_result_message(self, result_content: str, sender_id: str, receiver_id: str, task_id: str,
output_data: Dict[str, Any] = None, context_id: str = None) -> AgentMessage:
"""Send a result message using the protocol"""
return self.protocol.create_result_message(
result_content=result_content,
sender_id=sender_id,
receiver_id=receiver_id,
task_id=task_id,
output_data=output_data,
result_format="SearchResults",
context_id=context_id
)
async def get_prompt(self, message_context: Dict[str, Any]) -> str:
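        """Extract the plain-text task (prompt) from the message content."""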
task = message_context.get("content", "")
return task.strip()
async def get_web_query(self, message_context: Dict[str, Any]) -> str:
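        """
        Run the full research pipeline for a message: derive the prompt, execute the
        web query, summarize each result via the chat model, and return a formatted
        Markdown report with an overall summary.
        """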
prompt = await self.get_prompt(message_context)
result_json = await self.run_web_query(prompt)
result_data = ""
summary_src = ""
logger.info(f"Web analysis prompt '{prompt}' delivers {len(result_json)} results.")
if isinstance(result_json, list):
total_tokens = 0
for i, result in enumerate(result_json, 1):
# Limit content size for each result
result_data_limited = self.limit_text_for_api(result['data'], max_tokens=15000) # Allow ~15000 tokens per result
web_answer_instructions = f"""
Fass das Resultat gemäss dem Auftrag zusammen in maximal rund 2000 Zeichen. Auftrag = '{prompt.replace("'","")}'
Fasse die wichtigsten Erkenntnisse zusammen und setze sie in Bezug zur ursprünglichen Anfrage. Die Einleitung kannst Du weglassen.
Achte darauf, nur relevante und qualitativ hochwertige Informationen zu extrahieren, welche einen Bezug zum Auftrag haben, und übersichtlich zu präsentieren. Vermittle ein ausgewogenes Bild der recherchierten Informationen.
Dies ist das Resultat:
{result_data_limited}
"""
# Count tokens in the instructions to ensure we don't exceed API limits
instruction_tokens = self.count_tokens(web_answer_instructions)
if total_tokens + instruction_tokens > 60000:
logger.warning(f"Skipping result {i} to avoid exceeding token limit")
break
total_tokens += instruction_tokens
                # Additional instructions for the web research summarization
content_text = await self.chat_service.call_api(
messages=[
{
"role": "system",
"content": "Du bist ein Informationsanalyst, der Webinhalte präzise und relevant zusammenfasst."
},
{
"role": "user",
"content": web_answer_instructions
}
]
)
# Create a summary but ensure we stay within token limits
content_summary = content_text[:2000] # Limit to ~2000 characters
result_data += f"\n\n[{i}] {result['title']}\nURL: {result['url']}\nSnippet: {result['snippet']}\nContent: {content_summary}"
summary_src += f"\n{content_summary}"
# Update token count
total_tokens += self.count_tokens(content_summary) + 100 # Add buffer for formatting
else:
result_data = "no data received"
logger.info(f"Web analysis result sent {len(result_data)}B")
        # Additional overall summary
summary = ""
if len(summary_src) > 1:
# Limit summary source to ensure we don't exceed API limits
summary_src_limited = self.limit_text_for_api(summary_src, max_tokens=10000)
summary = await self.chat_service.call_api(
messages=[
{
"role": "system",
"content": "Du erstellst prägnante Zusammenfassungen von Rechercheergbnissen."
},
{
"role": "user",
"content": f"Bitte fasse diese Erkenntnisse in maximal 5-6 Sätzen zusammen: {summary_src_limited}\n"
}
]
)
# Format the final result
result = f"## Web-Recherche Ergebnisse\n\n### Zusammenfassung\n{summary}\n\n### Detaillierte Ergebnisse{result_data}"
return result
async def run_web_query(self, prompt: str) -> List[Dict]:
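        """
        Ask the chat model for a research strategy (direct URLs plus search phrases)
        for the prompt, then scrape the web accordingly via scrape_json. Returns a
        list of result dictionaries, or an empty list on failure.
        """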
        if prompt == "":
return []
ptext=f"""Create a comprehensive web research strategy for the task = '{prompt.replace("'","")}'. Return the results as a Python dictionary with these specific keys. If specific url are provided and the task requires analysis only on the provided url, then leave 'skey' open.
'url': A list of maximum {self.max_url} specific URLs extracted from the task string.
'skey': A list of maximum {self.max_key} key sentences to search for on the web. These should be precise, diverse, and targeted to get the most relevant information.
Format your response as a valid json object with these two keys. Do not include any explanatory text or markdown outside of the object definition.
"""
content_text = await self.chat_service.call_api(
messages=[
{
"role": "system",
"content": "Du bist ein Webrecherche-Experte, der präzise Suchstrategien entwickelt."
},
{
"role": "user",
"content": ptext
}
]
)
        # Remove markdown code-fence formatting if present (with or without a "json" tag)
        content_text = content_text.strip()
        if content_text.startswith("```"):
            content_text = re.sub(r'^```(?:json)?\s*', '', content_text)
            content_text = re.sub(r'\s*```$', '', content_text)
# Now parse the JSON
try:
logger.info(f"Valid json received: {str(content_text)}")
pjson = json.loads(content_text)
# Now call scrape_json with the parsed dictionary
result_json = await self.scrape_json(pjson)
return result_json
except json.JSONDecodeError as e:
logger.error(f"Failed to parse JSON: {e}")
logger.error(f"Cleaned content: {content_text[:100]}...")
return []
async def scrape_json(self, research_strategy: Dict[str, List]) -> List[Dict]:
"""
Scrapes web content based on a research strategy JSON.
Args:
research_strategy: A dictionary containing:
- 'skey': List of search keywords
- 'url': List of direct URLs to scrape
Returns:
Dictionary with URLs as keys and scraped content as values
"""
logger.info("Starting JSON-based web scraping")
results = []
# Validate input structure
if not isinstance(research_strategy, dict):
logger.error("Invalid research_strategy format: not a dictionary")
return {"error": "Invalid research_strategy format: not a dictionary"}
keys = research_strategy.get("skey", [])
direct_urls = research_strategy.get("url", [])
if not isinstance(keys, list) or not isinstance(direct_urls, list):
logger.error("Invalid research_strategy format: keys, or url is not a list")
return {"error": "Invalid research_strategy format: keys, or url is not a list"}
# Process search keywords through search engine
for keyword in keys:
logger.info(f"Processing keyword: {keyword}")
found_results = self.search_web(keyword) # List with Dict: title,url,snippet,data
logger.info(f"... {len(found_results)} results found")
results.extend(found_results)
# Process direct URLs
logger.info(f"Processing {len(direct_urls)} direct URLs")
        for url in direct_urls:
            if any(r.get('url') == url for r in results):
                logger.info(f"Skipping already scraped URL: {url}")
                continue
            soup = self.read_url(url)
            # Extract the title from the page if it exists
            if isinstance(soup, BeautifulSoup):
                title_tag = soup.find('title')
                title = title_tag.text.strip() if title_tag else "No title"
                # Fall back to the first h1 tag if the title tag is missing
                if title == "No title":
                    h1_tag = soup.find('h1')
                    if h1_tag:
                        title = h1_tag.text.strip()
            else:
                # Handle the case where soup is an error message string
                title = "Error fetching page"
            results.append(self.parse_result(soup, title, url))
logger.info(f"JSON scraping completed. Scraped {len(results)} URLs in total")
return results
def extract_main_content(self, soup: BeautifulSoup, max_chars: int = 30000) -> str:
"""
Extract the main content from an HTML page while limiting character count.
Args:
soup: BeautifulSoup object containing the page content
max_chars: Maximum number of characters to extract
Returns:
Extracted main content as string
"""
if not isinstance(soup, BeautifulSoup):
return str(soup)[:max_chars]
# Try to find main content elements in order of priority
main_content = None
for selector in ['main', 'article', '#content', '.content', '#main', '.main']:
content = soup.select_one(selector)
if content:
main_content = content
break
# If no main content found, use the body
if not main_content:
main_content = soup.find('body') or soup
# Remove script, style, nav, footer elements that don't contribute to main content
for element in main_content.select('script, style, nav, footer, header, aside, .sidebar, #sidebar, .comments, #comments, .advertisement, .ads, iframe'):
element.extract()
# Extract text content
text_content = main_content.get_text(separator=' ', strip=True)
# Limit to max_chars
return text_content[:max_chars]
def tokenize_for_counting(self, text: str) -> List[str]:
"""
Simple token counter for estimating token usage.
This is an approximation since the exact tokenization depends on the model.
Args:
text: Input text
Returns:
List of tokens
"""
        # Simple tokenization by splitting on whitespace and punctuation
        return re.findall(r'\w+|[^\w\s]', text)
def count_tokens(self, text: str) -> int:
"""
Count the approximate number of tokens in a text.
Args:
text: Input text
Returns:
Estimated token count
"""
tokens = self.tokenize_for_counting(text)
return len(tokens)
def limit_text_for_api(self, text: str, max_tokens: int = 60000) -> str:
"""
Limit the text to a maximum number of tokens.
Args:
text: Input text
max_tokens: Maximum number of tokens allowed
Returns:
Limited text
"""
if not text:
return ""
tokens = self.tokenize_for_counting(text)
# If text is already under the limit, return as is
if len(tokens) <= max_tokens:
return text
# Otherwise, truncate text to max_tokens
return " ".join(tokens[:max_tokens]) + "... [content truncated due to length]"
def search_web(self, query: str) -> List[Dict]:
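        """
        Query the DuckDuckGo HTML endpoint for the given search phrase and return up
        to max_result hits as dictionaries with title, url, snippet and the extracted
        main content of the target page ('data').
        """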
formatted_query = quote_plus(query)
url = f"https://html.duckduckgo.com/html/?q={formatted_query}"
search_results_soup = self.read_url(url)
        if not search_results_soup or not search_results_soup.select('.result'):
logger.warning(f"Keine Suchergebnisse gefunden für: {query}")
return []
# Extract search results
results = []
# Find all result containers
result_elements = search_results_soup.select('.result')
for result in result_elements:
# Extract title
title_element = result.select_one('.result__a')
title = title_element.text.strip() if title_element else 'No title'
# Extract URL (DuckDuckGo uses redirects, need to extract from href param)
url_element = title_element.get('href') if title_element else ''
extracted_url = 'No URL'
if url_element:
# Extract the actual URL from DuckDuckGo's redirect
if url_element.startswith('/d.js?q='):
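                    # The code expects redirect links of the form '/d.js?q=<encoded target>&...',
                    # e.g. '/d.js?q=https%3A%2F%2Fexample.org%2Fpage&rut=...' -> 'https://example.org/page' (illustrative)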
start = url_element.find('?q=') + 3 # Skip '?q='
end = url_element.find('&', start) if '&' in url_element[start:] else None
extracted_url = unquote(url_element[start:end])
# Make sure the URL has the correct protocol prefix
if not extracted_url.startswith(('http://', 'https://')):
if not extracted_url.startswith('//'):
extracted_url = 'https://' + extracted_url
else:
extracted_url = 'https:' + extracted_url
else:
extracted_url = url_element
# Extract snippet directly from search results page
snippet_element = result.select_one('.result__snippet')
snippet = snippet_element.text.strip() if snippet_element else 'No description'
# Now fetch the actual page content for the data field
target_page_soup = self.read_url(extracted_url)
# Use the new content extraction method to limit content size
content = self.extract_main_content(target_page_soup, max_chars=30000)
results.append({
'title': title,
'url': extracted_url,
'snippet': snippet,
'data': content
})
# Limit the number of results if needed
if len(results) >= self.max_result:
break
return results
def read_url(self, url: str) -> BeautifulSoup:
"""
Liest eine URL und gibt einen BeautifulSoup-Parser für den Inhalt zurück.
Bei Fehlern wird ein leeres BeautifulSoup-Objekt zurückgegeben.
Args:
url: Die zu lesende URL
Returns:
BeautifulSoup-Objekt mit dem Inhalt oder leer bei Fehlern
"""
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
'Accept': 'text/html,application/xhtml+xml,application/xml',
'Accept-Language': 'en-US,en;q=0.9',
}
        try:
            # Initial request
            response = requests.get(url, headers=headers, timeout=10)
            # Poll while the server responds with status 202 (accepted but not ready)
            if response.status_code == 202:
                # Up to four retries with increasing intervals
                backoff_times = [0.5, 1.0, 2.0, 5.0]
                for wait_time in backoff_times:
                    time.sleep(wait_time)  # Wait with increasing delay
                    response = requests.get(url, headers=headers, timeout=10)
                    # Stop polling once the status is no longer 202
                    if response.status_code != 202:
                        break
            # Raise an error for other failure status codes
            response.raise_for_status()
            # Parse the HTML
            return BeautifulSoup(response.text, 'html.parser')
        except Exception:
            # Return an empty BeautifulSoup object on any failure
            return BeautifulSoup("<html><body></body></html>", 'html.parser')
def parse_result(self, data: BeautifulSoup, title: str, url: str) -> Dict[str, str]:
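        """Build a result dictionary (title, url, snippet, data); 'data' holds the prettified HTML of the page."""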
# Extract snippet/description
snippet_element = data.select_one('.result__snippet')
snippet = snippet_element.text.strip() if snippet_element else 'No description'
result={
'title': title,
'url': url,
'snippet': snippet,
'data': data.prettify()
}
return result
# Singleton instance
_webcrawler_agent = None
def get_webcrawler_agent():
    """Returns a singleton instance of the WebCrawler agent"""
global _webcrawler_agent
if _webcrawler_agent is None:
_webcrawler_agent = WebcrawlerAgent()
return _webcrawler_agent
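
# Minimal usage sketch (illustrative): drives the agent from an asyncio entry point
# with a hand-built message and context; assumes the ChatService connector and
# outbound network access are configured, and uses a hypothetical task string.
if __name__ == "__main__":
    import asyncio

    async def _demo():
        agent = get_webcrawler_agent()
        message = {"content": "Aktuelle Entwicklungen bei Open-Source-LLMs"}  # hypothetical task
        context = {"workflow_id": str(uuid.uuid4())}
        response = await agent.process_message(message, context)
        print(response["content"])

    asyncio.run(_demo())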