""" WebCrawler-Agent für die Recherche und Beschaffung von Informationen aus dem Web. """ import json import logging import random import time from typing import List, Dict, Any, Optional import urllib from urllib.parse import quote_plus, unquote from bs4 import BeautifulSoup import requests from modules.agentservice_base import BaseAgent from connectors.connector_aichat_openai import ChatService logger = logging.getLogger(__name__) class WebcrawlerAgent(BaseAgent): """Agent für Web-Recherche und Informationsbeschaffung""" _instance = None chat_service = ChatService() #INIT --> should go to config max_url=3 max_key=3 max_result=3 timeout = 10 headers = { 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36', 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8', 'Accept-Language': 'en-US,en;q=0.5', 'Referer': 'https://www.google.com/', 'DNT': '1', 'Connection': 'keep-alive', 'Upgrade-Insecure-Requests': '1', } max_urls = 10 max_content_length=100000 @classmethod def get_instance(cls): """Gibt eine Singleton-Instanz zurück""" if cls._instance is None: cls._instance = cls() return cls._instance def __init__(self): """Initialisiert den WebCrawler-Agenten""" super().__init__() self.id = "webcrawler_agent" self.name = "Webscraper" self.type = "scraper" self.description = "Recherchiert Informationen im Web" self.capabilities = "Informationsrecherche, Datenbeschaffung aus dem Web, Quellenbewertung und Zusammenführung von Online-Informationen" self.instructions = "" async def process_message(self, message: Dict[str, Any], context: Dict[str, Any] = None) -> Dict[str, Any]: try: # Führe die Web-Recherche durch und warte auf das Ergebnis mit await web_query_result = await self.get_web_query(message) # Antwort-Objekt erstellen response = { "role": "assistant", "content": f"{web_query_result} [STATUS: ERGEBNIS]", "agent_type": self.type } # Extrahiere den Status aus der Antwort und aktualisiere den Inhalt content, status = self.extract_status(response["content"]) response["content"] = content # Setze den Status im Kontext, falls vorhanden if context is not None: context["status"] = status return response except Exception as e: logger.error(f"Fehler bei der Web-Recherche: {str(e)}", exc_info=True) # Fehlerantwort zurückgeben return { "role": "assistant", "content": f"Bei der Web-Recherche ist ein Fehler aufgetreten: {str(e)}", "agent_type": self.type } async def get_web_query(self, message_context: Dict[str, Any]) -> str: prompt = await self.get_prompt(message_context) result_json = await self.run_web_query(prompt) result_data = "" summary_src = "" logger.info(f"Web analysis prompt '{prompt}' delivers {len(result_json)} results.") if isinstance(result_json, list): for i, result in enumerate(result_json, 1): web_answer_instructions = f""" Fass das Resultat gemäss dem Auftrag zusammen in maximal rund 2000 Zeichen. Auftrag = '{prompt.replace("'","")}' Fasse die wichtigsten Erkenntnisse zusammen und setze sie in Bezug zur ursprünglichen Anfrage. Die Einleitung kannst Du weglassen. Achte darauf, nur relevante und qualitativ hochwertige Informationen zu extrahieren, welche einen Bezug zum Auftrag haben, und übersichtlich zu präsentieren. Vermittle ein ausgewogenes Bild der recherchierten Informationen. 
                Dies ist das Resultat: {result['data']}
                """

                # Summarize the fetched content for this single result
                content_text = await self.chat_service.call_api(
                    messages=[
                        {
                            "role": "system",
                            "content": "Du bist ein Informationsanalyst, der Webinhalte präzise und relevant zusammenfasst."
                        },
                        {
                            "role": "user",
                            "content": web_answer_instructions
                        }
                    ]
                )

                result_data += f"\n\n[{i}] {result['title']}\nURL: {result['url']}\nSnippet: {result['snippet']}\nContent: {content_text}"
                summary_src += f"\n{content_text}"
        else:
            result_data = "no data received"

        logger.info(f"Web analysis result sent {len(result_data)}B")

        # Additional overall summary across all results
        summary = ""
        if len(summary_src) > 1:
            summary = await self.chat_service.call_api(
                messages=[
                    {
                        "role": "system",
                        "content": "Du erstellst prägnante Zusammenfassungen von Rechercheergebnissen."
                    },
                    {
                        "role": "user",
                        "content": f"Bitte fasse diese Erkenntnisse in maximal 5-6 Sätzen zusammen: {summary_src}\n"
                    }
                ]
            )

        result = f"{summary}\n\n{result_data}"
        return result

    async def get_prompt(self, message_context: Dict[str, Any]) -> str:
        """Extracts the research task from the message content."""
        task = message_context.get("content", "")
        return task.strip()

    async def run_web_query(self, prompt: str) -> List[Dict]:
        """Asks the LLM for a research strategy (URLs and search keys) and scrapes it."""
        if prompt == "":
            return []

        ptext = f"""Create a comprehensive web research strategy for the task = '{prompt.replace("'", "")}'.
        Return the results as a Python dictionary with these specific keys.
        If specific URLs are provided and the task requires analysis only of the provided URLs, then leave 'skey' empty.
        'url': A list of maximum {self.max_url} specific URLs extracted from the task string.
        'skey': A list of maximum {self.max_key} key sentences to search for on the web.
        These should be precise, diverse, and targeted to get the most relevant information.
        Format your response as a valid json object with these two keys.
        Do not include any explanatory text or markdown outside of the object definition.
        """

        content_text = await self.chat_service.call_api(
            messages=[
                {
                    "role": "system",
                    "content": "Du bist ein Webrecherche-Experte, der präzise Suchstrategien entwickelt."
                },
                {
                    "role": "user",
                    "content": ptext
                }
            ]
        )

        # Remove markdown formatting if present
        if content_text.startswith("```json"):
            # Find the end of the JSON block
            end_marker = "```"
            end_index = content_text.rfind(end_marker)
            if end_index != -1:
                # Extract the JSON content without the markdown markers
                content_text = content_text[7:end_index].strip()

        # Now parse the JSON
        try:
            logger.info(f"Research strategy received: {str(content_text)}")
            pjson = json.loads(content_text)
            # Call scrape_json with the parsed dictionary
            result_json = await self.scrape_json(pjson)
            return result_json
        except json.JSONDecodeError as e:
            logger.error(f"Failed to parse JSON: {e}")
            logger.error(f"Cleaned content: {content_text[:100]}...")
            return []

    async def scrape_json(self, research_strategy: Dict[str, List]) -> List[Dict]:
        """
        Scrapes web content based on a research strategy JSON.
        Args:
            research_strategy: A dictionary containing:
                - 'skey': List of search keywords
                - 'url': List of direct URLs to scrape

        Returns:
            List of result dictionaries with title, url, snippet and data
        """
        logger.info("Starting JSON-based web scraping")
        results = []

        # Validate input structure
        if not isinstance(research_strategy, dict):
            logger.error("Invalid research_strategy format: not a dictionary")
            return []

        keys = research_strategy.get("skey", [])
        direct_urls = research_strategy.get("url", [])
        if not isinstance(keys, list) or not isinstance(direct_urls, list):
            logger.error("Invalid research_strategy format: 'skey' or 'url' is not a list")
            return []

        # Process search keywords through the search engine
        for keyword in keys:
            logger.info(f"Processing keyword: {keyword}")
            found_results = self.search_web(keyword)  # List of dicts: title, url, snippet, data
            logger.info(f"... {len(found_results)} results found")
            results.extend(found_results)

        # Process direct URLs
        logger.info(f"Processing {len(direct_urls)} direct URLs")
        for url in direct_urls:
            # Skip URLs that were already collected via the keyword search
            if any(r.get('url') == url for r in results):
                logger.info(f"Skipping already scraped URL: {url}")
                continue

            soup = self.read_url(url)

            # Extract the title from the page if it exists
            if isinstance(soup, BeautifulSoup):
                title_tag = soup.find('title')
                title = title_tag.text.strip() if title_tag else "No title"
                # Fall back to the first h1 tag if the title tag is missing
                if title == "No title":
                    h1_tag = soup.find('h1')
                    if h1_tag:
                        title = h1_tag.text.strip()
            else:
                # read_url returned something unusable
                title = "Error fetching page"

            results.append(self.parse_result(soup, title, url))

        logger.info(f"JSON scraping completed. Scraped {len(results)} URLs in total")
        return results

    def search_web(self, query: str) -> List[Dict]:
        """Searches DuckDuckGo for the query and fetches the content of each hit."""
        formatted_query = quote_plus(query)
        url = f"https://html.duckduckgo.com/html/?q={formatted_query}"
        search_results_soup = self.read_url(url)

        result_elements = search_results_soup.select('.result')
        if not result_elements:
            logger.warning(f"Keine Suchergebnisse gefunden für: {query}")
            return []

        # Extract search results
        results = []
        for result in result_elements:
            # Extract title
            title_element = result.select_one('.result__a')
            title = title_element.text.strip() if title_element else 'No title'

            # Extract URL (DuckDuckGo uses redirects, need to extract from href param)
            url_element = title_element.get('href') if title_element else ''
            extracted_url = 'No URL'
            if url_element:
                # Extract the actual URL from DuckDuckGo's redirect
                if url_element.startswith('/d.js?q='):
                    start = url_element.find('?q=') + 3  # Skip '?q='
                    end = url_element.find('&', start) if '&' in url_element[start:] else None
                    extracted_url = unquote(url_element[start:end])
                    # Make sure the URL has the correct protocol prefix
                    if not extracted_url.startswith(('http://', 'https://')):
                        if not extracted_url.startswith('//'):
                            extracted_url = 'https://' + extracted_url
                        else:
                            extracted_url = 'https:' + extracted_url
                else:
                    extracted_url = url_element

            # Extract the snippet directly from the search results page
            snippet_element = result.select_one('.result__snippet')
            snippet = snippet_element.text.strip() if snippet_element else 'No description'

            # Fetch the actual page content for the data field
            target_page_soup = self.read_url(extracted_url)
            results.append({
                'title': title,
                'url': extracted_url,
                'snippet': snippet,
                'data': str(target_page_soup) if isinstance(target_page_soup, BeautifulSoup) else "Error fetching page"
            })

            # Limit the number of results if needed
            if len(results) >= self.max_result:
                break

        return results

    def read_url(self, url: str) -> BeautifulSoup:
        """
        Reads a URL and returns a BeautifulSoup parser for its content.
        On errors an empty BeautifulSoup object is returned.
        Args:
            url: The URL to read

        Returns:
            BeautifulSoup object with the content, or an empty one on errors
        """
        # Lightweight header set used only for this request
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml',
            'Accept-Language': 'en-US,en;q=0.9',
        }

        try:
            # Initial request
            response = requests.get(url, headers=headers, timeout=self.timeout)

            # Poll while the server answers with status 202 (accepted but not ready yet)
            if response.status_code == 202:
                # Up to four retries with increasing back-off intervals
                backoff_times = [0.5, 1.0, 2.0, 5.0]
                for wait_time in backoff_times:
                    time.sleep(wait_time)
                    response = requests.get(url, headers=headers, timeout=self.timeout)
                    # Stop polling as soon as the status is no longer 202
                    if response.status_code != 202:
                        break

            # Raise for any other error status
            response.raise_for_status()

            # Parse the HTML
            return BeautifulSoup(response.text, 'html.parser')
        except Exception as e:
            logger.warning(f"Fehler beim Lesen von {url}: {str(e)}")
            # Return an empty BeautifulSoup object on any failure
            return BeautifulSoup("", 'html.parser')

    def parse_result(self, data: BeautifulSoup, title: str, url: str) -> Dict[str, str]:
        """Builds a result dictionary (title, url, snippet, data) from a parsed page."""
        # Extract snippet/description
        snippet_element = data.select_one('.result__snippet')
        snippet = snippet_element.text.strip() if snippet_element else 'No description'

        result = {
            'title': title,
            'url': url,
            'snippet': snippet,
            'data': data.prettify()
        }
        return result

    def _old_scrape_url(self, url: str) -> str:
        """Legacy helper: scrapes the main text content of a single URL."""
        try:
            logger.info(f"Requesting URL: {url}")
            response = requests.get(url, headers=self.headers, timeout=self.timeout)
            response.raise_for_status()

            soup = BeautifulSoup(response.text, 'html.parser')
            title = soup.title.string if soup.title else "No title"

            # Remove non-content elements
            for element in soup.select('script, style, meta, noscript, iframe, nav, footer, header, aside'):
                element.extract()

            main_content = ""

            # Common content containers
            content_selectors = [
                'main', '#main', '.main',
                'article', '.article',
                '#content', '.content',
                '.post', '#post',
                '.entry-content', '.post-content',
                '.page-content', '.article-content'
            ]

            # Try each selector
            for selector in content_selectors:
                elements = soup.select(selector)
                if elements:
                    main_content = elements[0].get_text(separator='\n', strip=True)
                    logger.info(f"Found content using selector: {selector}")
                    break

            # If no main content found, use the body text
            if not main_content and soup.body:
                main_content = soup.body.get_text(separator='\n', strip=True)
                logger.info("Using body text as no main content container found")

            # Clean up the text
            lines = []
            for line in main_content.split('\n'):
                line = line.strip()
                if line and len(line) > 15:  # Skip very short lines
                    lines.append(line)
            main_content = '\n'.join(lines)

            # Truncate if too long
            if len(main_content) > self.max_content_length:
                main_content = main_content[:self.max_content_length] + "...\n[Inhalt gekürzt]"

            return main_content.strip()
        except Exception as e:
            logger.error(f"Fehler beim Scrapen von {url}: {str(e)}")
            return f"[Fehler beim Scrapen von {url}: {str(e)}]"

    def _old_extract_urls_from_search_results(self, html_content: str) -> List[str]:
        """
        Extracts URLs from search engine results.
        Args:
            html_content: HTML content of the search results page

        Returns:
            List of extracted URLs
        """
        soup = BeautifulSoup(html_content, 'html.parser')
        urls = []

        # Different search engines have different HTML structures
        # Google links
        for a_tag in soup.select('a[href^="/url?"]'):
            href = a_tag.get('href', '')
            if '/url?q=' in href:
                url = href.split('/url?q=')[1].split('&')[0]
                url = urllib.parse.unquote(url)
                if url.startswith('http') and url not in urls:
                    urls.append(url)

        # Bing links
        for a_tag in soup.select('a[href^="http"]'):
            url = a_tag.get('href', '')
            excluded_domains = getattr(self, 'excluded_domains', [])
            if (url.startswith('http')
                    and not any(domain in url for domain in excluded_domains)
                    and url not in urls):
                urls.append(url)

        # Yahoo links
        for a_tag in soup.select('a.d-ib'):
            url = a_tag.get('href', '')
            if url.startswith('http') and url not in urls:
                urls.append(url)

        # If no URLs were found, try a more generic approach
        if not urls:
            for a_tag in soup.find_all('a', href=True):
                url = a_tag['href']
                excluded_domains = getattr(self, 'excluded_domains', [])
                if (url.startswith('http')
                        and not any(domain in url for domain in excluded_domains)
                        and url not in urls):
                    urls.append(url)

        # Limit the number of results
        return urls[:self.max_urls]


# Singleton instance
_webcrawler_agent = None


def get_webcrawler_agent():
    """Returns a singleton instance of the WebCrawler agent."""
    global _webcrawler_agent
    if _webcrawler_agent is None:
        _webcrawler_agent = WebcrawlerAgent()
    return _webcrawler_agent
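

# --- Minimal usage sketch (illustrative only, not part of the agent API) ---
# process_message is a coroutine, so it has to be driven by an event loop.
# Assumptions: the message dict carries the research task in its "content"
# field (see get_prompt) and the optional context dict receives the extracted
# status; running this requires the ChatService configuration used by
# connectors.connector_aichat_openai to be in place.
if __name__ == "__main__":
    import asyncio

    async def _demo():
        agent = get_webcrawler_agent()
        context = {}
        message = {"role": "user", "content": "Recherchiere aktuelle Entwicklungen zum Thema Web-Scraping"}
        response = await agent.process_message(message, context)
        print(f"Status: {context.get('status')}")
        print(response["content"])

    asyncio.run(_demo())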