"""
|
|
WebCrawler-Agent für die Recherche und Beschaffung von Informationen aus dem Web.
|
|
"""
|
|
|
|
import json
import logging
import time
from typing import List, Dict, Any, Optional

import urllib.parse
from urllib.parse import quote_plus, unquote

from bs4 import BeautifulSoup
import requests

from modules.agentservice_base import BaseAgent
from connectors.connector_aichat_openai import ChatService


logger = logging.getLogger(__name__)


class WebcrawlerAgent(BaseAgent):
"""Agent für Web-Recherche und Informationsbeschaffung"""
|
|
|
|
_instance = None
|
|
|
|
chat_service = ChatService()
|
|
|
|
#INIT --> should go to config
|
|
max_url=3
|
|
max_key=3
|
|
|
|
max_result=3
|
|
|
|
timeout = 10
|
|
headers = {
|
|
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
|
|
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
|
|
'Accept-Language': 'en-US,en;q=0.5',
|
|
'Referer': 'https://www.google.com/',
|
|
'DNT': '1',
|
|
'Connection': 'keep-alive',
|
|
'Upgrade-Insecure-Requests': '1',
|
|
}
|
|
max_urls = 10
|
|
max_content_length=100000
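
    # The INIT values above are hard-coded for now; a possible follow-up to that note
    # (illustrative sketch, not wired into the application) would be to override them
    # from a plain config mapping:
    @classmethod
    def configure(cls, config: Dict[str, Any]) -> None:
        """Sketch: override the crawler limits from a config mapping (unknown keys are ignored)."""
        for key in ("max_url", "max_key", "max_result", "timeout", "max_urls", "max_content_length"):
            if key in config:
                setattr(cls, key, config[key])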

    @classmethod
    def get_instance(cls):
        """Returns a singleton instance"""
        if cls._instance is None:
            cls._instance = cls()
        return cls._instance

    def __init__(self):
        """Initializes the WebCrawler agent"""
        super().__init__()
        self.id = "webcrawler_agent"
        self.name = "Webscraper"
        self.type = "scraper"
        self.description = "Recherchiert Informationen im Web"
        self.capabilities = "Informationsrecherche, Datenbeschaffung aus dem Web, Quellenbewertung und Zusammenführung von Online-Informationen"
        self.instructions = ""

    async def process_message(self, message: Dict[str, Any], context: Dict[str, Any] = None) -> Dict[str, Any]:
        """Handles an incoming message by running the web research and returning the agent response."""
        try:
            # Run the web research and await its result
            web_query_result = await self.get_web_query(message)

            # Build the response object
            response = {
                "role": "assistant",
                "content": f"{web_query_result} [STATUS: ERGEBNIS]",
                "agent_type": self.type
            }

            # Extract the status from the response and update the content
            content, status = self.extract_status(response["content"])
            response["content"] = content

            # Store the status in the context, if one was provided
            if context is not None:
                context["status"] = status

            return response

        except Exception as e:
            logger.error(f"Fehler bei der Web-Recherche: {str(e)}", exc_info=True)

            # Return an error response
            return {
                "role": "assistant",
                "content": f"Bei der Web-Recherche ist ein Fehler aufgetreten: {str(e)}",
                "agent_type": self.type
            }

    async def get_web_query(self, message_context: Dict[str, Any]) -> str:
        """Runs the full research pipeline for a message and returns the summarized result text."""
        prompt = await self.get_prompt(message_context)
        result_json = await self.run_web_query(prompt)
        result_data = ""
        summary_src = ""

        logger.info(f"Web analysis prompt '{prompt}' delivers {len(result_json)} results.")
        if isinstance(result_json, list):
            for i, result in enumerate(result_json, 1):

                web_answer_instructions = f"""
                Fass das Resultat gemäss dem Auftrag zusammen in maximal rund 2000 Zeichen. Auftrag = '{prompt.replace("'", "")}'
                Fasse die wichtigsten Erkenntnisse zusammen und setze sie in Bezug zur ursprünglichen Anfrage. Die Einleitung kannst Du weglassen.
                Achte darauf, nur relevante und qualitativ hochwertige Informationen zu extrahieren, welche einen Bezug zum Auftrag haben, und übersichtlich zu präsentieren. Vermittle ein ausgewogenes Bild der recherchierten Informationen.

                Dies ist das Resultat:
                {result['data']}
                """

                # Summarize this single web result with respect to the task
                content_text = await self.chat_service.call_api(
                    messages=[
                        {
                            "role": "system",
                            "content": "Du bist ein Informationsanalyst, der Webinhalte präzise und relevant zusammenfasst."
                        },
                        {
                            "role": "user",
                            "content": web_answer_instructions
                        }
                    ]
                )
                result_data += f"\n\n[{i}] {result['title']}\nURL: {result['url']}\nSnippet: {result['snippet']}\nContent: {content_text}"
                summary_src += f"\n{content_text}"
        else:
            result_data = "no data received"

        logger.info(f"Web analysis result sent {len(result_data)}B")

        # Additional overall summary across all results
        summary = ""
        if len(summary_src) > 1:
            summary = await self.chat_service.call_api(
                messages=[
                    {
                        "role": "system",
                        "content": "Du erstellst prägnante Zusammenfassungen von Rechercheergebnissen."
                    },
                    {
                        "role": "user",
                        "content": f"Bitte fasse diese Erkenntnisse in maximal 5-6 Sätzen zusammen: {summary_src}\n"
                    }
                ]
            )

        result = f"{summary}\n\n{result_data}"
        return result

    async def get_prompt(self, message_context: Dict[str, Any]) -> str:
        """Extracts the task text from the message context."""
        task = message_context.get("content", "")
        return task.strip()

    async def run_web_query(self, prompt: str) -> List[Dict]:
        """Asks the chat model for a research strategy (direct URLs and search keys) and scrapes it."""
        if prompt == "":
            return []

        ptext = f"""Create a comprehensive web research strategy for the task = '{prompt.replace("'", "")}'. Return the results as a Python dictionary with these specific keys. If specific URLs are provided and the task requires analysis only of the provided URLs, then leave 'skey' empty.

        'url': A list of maximum {self.max_url} specific URLs extracted from the task string.

        'skey': A list of maximum {self.max_key} key sentences to search for on the web. These should be precise, diverse, and targeted to get the most relevant information.

        Format your response as a valid json object with these two keys. Do not include any explanatory text or markdown outside of the object definition.
        """

        content_text = await self.chat_service.call_api(
            messages=[
                {
                    "role": "system",
                    "content": "Du bist ein Webrecherche-Experte, der präzise Suchstrategien entwickelt."
                },
                {
                    "role": "user",
                    "content": ptext
                }
            ]
        )

        # Remove markdown formatting if present
        if content_text.startswith("```json"):
            # Find the end of the JSON block
            end_marker = "```"
            end_index = content_text.rfind(end_marker)
            if end_index != -1:
                # Extract the JSON content without the markdown markers
                content_text = content_text[7:end_index].strip()

        # Now parse the JSON
        try:
            logger.info(f"Valid json received: {str(content_text)}")
            pjson = json.loads(content_text)
            # Now call scrape_json with the parsed dictionary
            result_json = await self.scrape_json(pjson)
            return result_json
        except json.JSONDecodeError as e:
            logger.error(f"Failed to parse JSON: {e}")
            logger.error(f"Cleaned content: {content_text[:100]}...")
            return []

    async def scrape_json(self, research_strategy: Dict[str, List]) -> List[Dict]:
        """
        Scrapes web content based on a research strategy dictionary.

        Args:
            research_strategy: A dictionary containing:
                - 'skey': List of search keywords
                - 'url': List of direct URLs to scrape

        Returns:
            A list of result dictionaries with 'title', 'url', 'snippet' and 'data'.
        """

        logger.info("Starting JSON-based web scraping")
        results = []

        # Validate input structure
        if not isinstance(research_strategy, dict):
            logger.error("Invalid research_strategy format: not a dictionary")
            return []

        keys = research_strategy.get("skey", [])
        direct_urls = research_strategy.get("url", [])

        if not isinstance(keys, list) or not isinstance(direct_urls, list):
            logger.error("Invalid research_strategy format: 'skey' or 'url' is not a list")
            return []

        # Process search keywords through the search engine
        for keyword in keys:
            logger.info(f"Processing keyword: {keyword}")
            found_results = self.search_web(keyword)  # List of dicts: title, url, snippet, data
            logger.info(f"... {len(found_results)} results found")
            results.extend(found_results)

        # Process direct URLs
        logger.info(f"Processing {len(direct_urls)} direct URLs")
        for url in direct_urls:
            # Skip URLs that were already collected via the keyword search
            if any(r.get('url') == url for r in results):
                logger.info(f"Skipping already scraped URL: {url}")
                continue
            soup = self.read_url(url)

            # Extract the title from the page if it exists
            title = "No title"
            if isinstance(soup, BeautifulSoup):
                title_tag = soup.find('title')
                title = title_tag.text.strip() if title_tag else "No title"

                # Fall back to the first h1 tag if the title tag is missing
                if title == "No title":
                    h1_tag = soup.find('h1')
                    if h1_tag:
                        title = h1_tag.text.strip()
            else:
                # read_url returns an empty BeautifulSoup object on errors, so this branch is only a safeguard
                title = "Error fetching page"

            results.append(self.parse_result(soup, title, url))
        logger.info(f"JSON scraping completed. Scraped {len(results)} URLs in total")
        return results

    def search_web(self, query: str) -> List[Dict]:
        """Runs a DuckDuckGo HTML search for the query and scrapes up to max_result hits."""
        formatted_query = quote_plus(query)
        url = f"https://html.duckduckgo.com/html/?q={formatted_query}"

        search_results_soup = self.read_url(url)
        if not search_results_soup or not search_results_soup.select('.result'):
            logger.warning(f"Keine Suchergebnisse gefunden für: {query}")
            return []

        # Extract search results
        results = []

        # Find all result containers
        result_elements = search_results_soup.select('.result')

        for result in result_elements:
            # Extract title
            title_element = result.select_one('.result__a')
            title = title_element.text.strip() if title_element else 'No title'

            # Extract URL (DuckDuckGo uses redirects, so the target has to be pulled out of the href parameter)
            url_element = title_element.get('href') if title_element else ''
            extracted_url = 'No URL'

            if url_element:
                # Extract the actual URL from DuckDuckGo's redirect
                if url_element.startswith('/d.js?q='):
                    start = url_element.find('?q=') + 3  # Skip '?q='
                    end = url_element.find('&', start) if '&' in url_element[start:] else None
                    extracted_url = unquote(url_element[start:end])

                    # Make sure the URL has the correct protocol prefix
                    if not extracted_url.startswith(('http://', 'https://')):
                        if not extracted_url.startswith('//'):
                            extracted_url = 'https://' + extracted_url
                        else:
                            extracted_url = 'https:' + extracted_url
                else:
                    extracted_url = url_element

            # Extract the snippet directly from the search results page
            snippet_element = result.select_one('.result__snippet')
            snippet = snippet_element.text.strip() if snippet_element else 'No description'

            # Now fetch the actual page content for the data field
            target_page_soup = self.read_url(extracted_url)

            results.append({
                'title': title,
                'url': extracted_url,
                'snippet': snippet,
                'data': str(target_page_soup) if isinstance(target_page_soup, BeautifulSoup) else "Error fetching page"
            })

            # Limit the number of results if needed
            if len(results) >= self.max_result:
                break

        return results

    def read_url(self, url: str) -> BeautifulSoup:
        """
        Reads a URL and returns a BeautifulSoup parser for its content.
        On errors an empty BeautifulSoup object is returned.

        Args:
            url: The URL to read

        Returns:
            BeautifulSoup object with the content, or an empty one on errors
        """
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml',
            'Accept-Language': 'en-US,en;q=0.9',
        }

        try:
            # Initial request
            response = requests.get(url, headers=headers, timeout=10)

            # Poll while the server answers with status 202 (accepted but not ready)
            if response.status_code == 202:
                # Up to four retries with increasing intervals
                backoff_times = [0.5, 1.0, 2.0, 5.0]

                for wait_time in backoff_times:
                    time.sleep(wait_time)  # Wait with an increasing delay
                    response = requests.get(url, headers=headers, timeout=10)

                    # Stop polling once the status is no longer 202
                    if response.status_code != 202:
                        break

            # Raise an error for other failure statuses
            response.raise_for_status()

            # Parse the HTML
            return BeautifulSoup(response.text, 'html.parser')

        except Exception as e:
            logger.warning(f"Fehler beim Lesen von {url}: {str(e)}")
            # Return an empty BeautifulSoup object so callers can proceed
            return BeautifulSoup("<html><body></body></html>", 'html.parser')

    def parse_result(self, data: BeautifulSoup, title: str, url: str) -> Dict[str, str]:
        """Builds a result dictionary (title, url, snippet, data) from a parsed page."""
        # Extract snippet/description
        snippet_element = data.select_one('.result__snippet')
        snippet = snippet_element.text.strip() if snippet_element else 'No description'

        result = {
            'title': title,
            'url': url,
            'snippet': snippet,
            'data': data.prettify()
        }
        return result

    def _old_scrape_url(self, url: str) -> str:
        """Legacy scraper: fetches a URL and returns its cleaned main text content."""
        try:
            logger.info(f"Requesting URL: {url}")
            response = requests.get(url, headers=self.headers, timeout=self.timeout)
            response.raise_for_status()

            soup = BeautifulSoup(response.text, 'html.parser')
            title = soup.title.string if soup.title else "No title"
            for element in soup.select('script, style, meta, noscript, iframe, nav, footer, header, aside'):
                element.extract()
            main_content = ""

            # Common content containers
            content_selectors = [
                'main', '#main', '.main',
                'article', '.article',
                '#content', '.content',
                '.post', '#post',
                '.entry-content', '.post-content',
                '.page-content', '.article-content'
            ]

            # Try each selector
            for selector in content_selectors:
                elements = soup.select(selector)
                if elements:
                    main_content = elements[0].get_text(separator='\n', strip=True)
                    logger.info(f"Found content using selector: {selector}")
                    break

            # If no main content found, use body text
            if not main_content:
                main_content = soup.body.get_text(separator='\n', strip=True)
                logger.info("Using body text as no main content container found")

            # Clean up the text
            lines = []
            for line in main_content.split('\n'):
                line = line.strip()
                if line and len(line) > 15:  # Skip very short lines
                    lines.append(line)

            main_content = '\n'.join(lines)

            # Truncate if too long
            if len(main_content) > self.max_content_length:
                main_content = main_content[:self.max_content_length] + "...\n[Inhalt gekürzt]"

            return main_content.strip()

        except Exception as e:
            logger.error(f"Fehler beim Scrapen von {url}: {str(e)}")
            return f"[Fehler beim Scrapen von {url}: {str(e)}]"

    def _old_extract_urls_from_search_results(self, html_content: str) -> List[str]:
        """
        Extracts URLs from search engine results.

        Args:
            html_content: HTML content of the search results page

        Returns:
            List of extracted URLs
        """

        soup = BeautifulSoup(html_content, 'html.parser')
        urls = []

        # Different search engines have different HTML structures
        # Google links
        for a_tag in soup.select('a[href^="/url?"]'):
            href = a_tag.get('href', '')
            if '/url?q=' in href:
                url = href.split('/url?q=')[1].split('&')[0]
                url = urllib.parse.unquote(url)
                if url.startswith('http') and url not in urls:
                    urls.append(url)

        # Bing links
        for a_tag in soup.select('a[href^="http"]'):
            url = a_tag.get('href', '')
            excluded_domains = getattr(self, 'excluded_domains', [])
            if (url.startswith('http') and
                    not any(domain in url for domain in excluded_domains) and
                    url not in urls):
                urls.append(url)

        # Yahoo links
        for a_tag in soup.select('a.d-ib'):
            url = a_tag.get('href', '')
            if url.startswith('http') and url not in urls:
                urls.append(url)

        # If no URLs found, try a more generic approach
        if not urls:
            for a_tag in soup.find_all('a', href=True):
                url = a_tag['href']
                excluded_domains = getattr(self, 'excluded_domains', [])
                if (url.startswith('http') and
                        not any(domain in url for domain in excluded_domains) and
                        url not in urls):
                    urls.append(url)

        # Limit the number of results
        return urls[:self.max_urls]


# Singleton instance
_webcrawler_agent = None


def get_webcrawler_agent():
    """Returns a singleton instance of the WebCrawler agent"""
    global _webcrawler_agent
    if _webcrawler_agent is None:
        _webcrawler_agent = WebcrawlerAgent()
    return _webcrawler_agent
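

# Minimal usage sketch (illustrative only): a quick way to drive the agent end to end.
# It assumes ChatService is fully configured (API key etc.), the modules/ and connectors/
# packages are importable, and outbound HTTP requests are allowed; the query below is just an example.
if __name__ == "__main__":
    import asyncio

    async def _demo():
        agent = get_webcrawler_agent()
        message = {"role": "user", "content": "Aktuelle Entwicklungen bei Open-Source-Sprachmodellen"}
        context = {}
        response = await agent.process_message(message, context)
        print(response["content"])
        print(f"Status: {context.get('status')}")

    asyncio.run(_demo())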