import logging
import re
import requests
from typing import List, Dict, Any, Optional, Tuple
from bs4 import BeautifulSoup
import json
import os
import configload
import urllib.parse
import time
import random
import pandas as pd

# Configure logger
logger = logging.getLogger(__name__)


# Load configuration data
def load_config_data():
    config = configload.load_config()

    # Get search engines as comma-separated list
    search_engines_str = config.get('Connector_AiWebscraping', 'SEARCH_ENGINES')
    search_engines = [engine.strip() for engine in search_engines_str.split(',')]

    # Get excluded domains as comma-separated list
    excluded_domains_str = config.get('Connector_AiWebscraping', 'EXCLUDED_DOMAINS')
    excluded_domains = [domain.strip() for domain in excluded_domains_str.split(',')]

    return {
        "timeout": int(config.get('Connector_AiWebscraping', 'TIMEOUT')),
        "max_urls": int(config.get('Connector_AiWebscraping', 'MAX_URLS')),
        "max_content_length": int(config.get('Connector_AiWebscraping', 'MAX_CONTENT_LENGTH')),
        "user_agent": config.get('Connector_AiWebscraping', 'USER_AGENT'),
        "search_engines": search_engines,
        "min_delay": float(config.get('Connector_AiWebscraping', 'MIN_DELAY')),
        "max_delay": float(config.get('Connector_AiWebscraping', 'MAX_DELAY')),
        "excluded_domains": excluded_domains,
        "max_search_results": int(config.get('Connector_AiWebscraping', 'MAX_SEARCH_RESULTS'))
    }
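
# Illustrative example of the configuration section read above (values are placeholders,
# not taken from the original source); configload.load_config() is assumed to return a
# configparser-style object exposing get(section, key):
#
#   [Connector_AiWebscraping]
#   TIMEOUT = 15
#   MAX_URLS = 5
#   MAX_CONTENT_LENGTH = 8000
#   USER_AGENT = Mozilla/5.0 (compatible; ExampleBot/1.0)
#   SEARCH_ENGINES = google, bing
#   MIN_DELAY = 1.0
#   MAX_DELAY = 3.0
#   EXCLUDED_DOMAINS = google.com, bing.com, youtube.com
#   MAX_SEARCH_RESULTS = 10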

class WebScrapingService:
    """
    Connector for web scraping functionality.
    """

    def __init__(self):
        # Load configuration
        self.config = load_config_data()

        # Assign configuration values to instance attributes
        self.timeout = self.config["timeout"]
        self.max_urls = self.config["max_urls"]
        self.max_content_length = self.config["max_content_length"]
        self.user_agent = self.config["user_agent"]
        self.min_delay = self.config["min_delay"]
        self.max_delay = self.config["max_delay"]
        self.excluded_domains = self.config["excluded_domains"]
        self.max_search_results = self.config["max_search_results"]

        # Initialize search engines based on config
        self.search_engines = {}
        if "google" in self.config["search_engines"]:
            self.search_engines["google"] = "https://www.google.com/search?q={query}"
        if "bing" in self.config["search_engines"]:
            self.search_engines["bing"] = "https://www.bing.com/search?q={query}"

        # Headers for requests
        self.headers = {
            'User-Agent': self.user_agent,
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
            'Referer': 'https://www.google.com/',
            'DNT': '1',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
        }

        logger.info(f"WebScraping connector initialized with timeout: {self.timeout}s")

    def scrape_url(self, url: str) -> str:
        """
        Scrapes the content of a URL and extracts the relevant text.

        Args:
            url: The URL to scrape

        Returns:
            The extracted content

        Raises:
            Exception: On errors during the scraping process
        """
        try:
            logger.info(f"Requesting URL: {url}")
            response = requests.get(url, headers=self.headers, timeout=self.timeout)
            response.raise_for_status()

            soup = BeautifulSoup(response.text, 'html.parser')

            # Get page title
            title = soup.title.string if soup.title and soup.title.string else "No title"

            # Remove unwanted elements
            for element in soup.select('script, style, meta, noscript, iframe, nav, footer, header, aside'):
                element.extract()

            # Try to find main content
            main_content = ""

            # Common content containers
            content_selectors = [
                'main', '#main', '.main',
                'article', '.article',
                '#content', '.content',
                '.post', '#post',
                '.entry-content', '.post-content',
                '.page-content', '.article-content'
            ]

            # Try each selector
            for selector in content_selectors:
                elements = soup.select(selector)
                if elements:
                    main_content = elements[0].get_text(separator='\n', strip=True)
                    logger.info(f"Found content using selector: {selector}")
                    break

            # If no main content found, use body text (fall back to the whole document
            # if the page has no <body> element)
            if not main_content:
                body = soup.body if soup.body else soup
                main_content = body.get_text(separator='\n', strip=True)
                logger.info("Using body text as no main content container found")

            # Clean up the text
            lines = []
            for line in main_content.split('\n'):
                line = line.strip()
                if line and len(line) > 15:  # Skip very short lines
                    lines.append(line)

            main_content = '\n'.join(lines)

            # Truncate if too long
            if len(main_content) > self.max_content_length:
                main_content = main_content[:self.max_content_length] + "...\n[Content truncated]"

            # Add metadata
            result = f"# {title}\nURL: {url}\n\n{main_content}"

            return result.strip()

        except Exception as e:
            logger.error(f"Error scraping {url}: {str(e)}")
            return f"[Error scraping {url}: {str(e)}]"

    def extract_urls_from_search_results(self, html_content: str) -> List[str]:
        """
        Extracts URLs from the search results.

        Args:
            html_content: HTML of the search results page

        Returns:
            List of found URLs
        """
        soup = BeautifulSoup(html_content, 'html.parser')
        urls = []

        # Different search engines have different HTML structures

        # Google links
        for a_tag in soup.select('a[href^="/url?"]'):
            href = a_tag.get('href', '')
            if '/url?q=' in href:
                url = href.split('/url?q=')[1].split('&')[0]
                url = urllib.parse.unquote(url)
                if url.startswith('http') and url not in urls:
                    urls.append(url)

        # Bing links
        for a_tag in soup.select('a[href^="http"]'):
            url = a_tag.get('href', '')
            if (url.startswith('http') and
                    not any(domain in url for domain in self.excluded_domains) and
                    url not in urls):
                urls.append(url)

        # If no URLs found, try a more generic approach
        if not urls:
            for a_tag in soup.find_all('a', href=True):
                url = a_tag['href']
                if (url.startswith('http') and
                        not any(domain in url for domain in self.excluded_domains) and
                        url not in urls):
                    urls.append(url)

        return urls[:self.max_search_results]  # Limit to max_search_results
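
    # Illustrative note (not from the original source): Google result pages typically
    # wrap target links in redirect hrefs of the form
    #   /url?q=https%3A%2F%2Fexample.com%2Fpage&sa=U&ved=...
    # The Google branch above splits on '/url?q=' and '&' and unquotes the remainder,
    # which for the hypothetical href above yields "https://example.com/page".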

    def extract_urls(self, text: str) -> List[str]:
        """
        Extracts URLs from a text.

        Args:
            text: The text to analyze

        Returns:
            List of found URLs
        """
        # URL pattern with improved regex
        url_pattern = re.compile(r'https?://(?:[-\w.]|(?:%[\da-fA-F]{2}))+(?:/[^)\s]*)?')
        found_urls = url_pattern.findall(text)

        # Basic URL cleanup and validation
        valid_urls = []
        for url in found_urls:
            # Remove trailing punctuation
            url = re.sub(r'[.,;:!?]$', '', url)

            # Skip excluded domains
            if not any(domain in url for domain in self.excluded_domains):
                valid_urls.append(url)

        return valid_urls[:self.max_urls]  # Limit to max_urls

    def extract_keywords(self, text: str) -> str:
        """
        Extracts keywords from a text.

        Args:
            text: The text to analyze

        Returns:
            Extracted keywords as a string
        """
        # Define German stopwords
        stopwords = [
            "der", "die", "das", "den", "dem", "des", "ein", "eine", "einer", "eines", "einem", "einen",
            "und", "oder", "aber", "wenn", "weil", "obwohl", "für", "mit", "von", "zu", "aus", "bei",
            "nach", "über", "unter", "vor", "hinter", "neben", "zwischen", "nicht", "kein", "keine",
            "keiner", "keines", "keinem", "keinen", "ist", "sind", "war", "waren", "wird", "werden",
            "wurde", "wurden", "kann", "können", "darf", "dürfen", "soll", "sollen", "muss", "müssen",
            "hat", "haben", "dass", "noch", "schon", "auch", "nur", "sehr", "mehr", "durch", "gegen",
            "ohne", "um", "heute", "morgen", "gestern"
        ]

        # Normalize text
        text = text.lower()

        # Remove special characters and replace them with spaces
        text = re.sub(r'[^\w\s]', ' ', text)

        # Split into words
        words = text.split()

        # Filter words
        filtered_words = []
        for word in words:
            if (len(word) > 3 and  # Skip very short words
                    word not in stopwords and
                    not word.isdigit()):  # Skip numbers
                filtered_words.append(word)

        # Get common words by frequency
        word_freq = {}
        for word in filtered_words:
            if word in word_freq:
                word_freq[word] += 1
            else:
                word_freq[word] = 1

        # Sort by frequency
        sorted_words = sorted(word_freq.items(), key=lambda x: x[1], reverse=True)

        # Take top 10 words
        keywords = [word for word, freq in sorted_words[:10]]

        return " ".join(keywords)
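
    # Illustrative example (hypothetical input, traced by hand from the logic above):
    # extract_keywords("Die aktuellen Trends der künstlichen Intelligenz verändern die Industrie.")
    # lowercases the text, strips punctuation, drops stopwords ("die", "der"), very short
    # words and digits; since every remaining word occurs once, they are returned in order
    # of first occurrence: "aktuellen trends künstlichen intelligenz verändern industrie".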

    async def search_web(self, query: str) -> List[str]:
        """
        Performs a web search with the given search terms.

        Args:
            query: Search terms

        Returns:
            List of found URLs
        """
        # Guard against an empty engine list (e.g. misconfigured SEARCH_ENGINES)
        if not self.search_engines:
            logger.warning("No search engines configured, skipping web search")
            return []

        # Choose a random search engine
        engine_name = random.choice(list(self.search_engines.keys()))
        search_url = self.search_engines[engine_name].format(query=urllib.parse.quote(query))

        logger.info(f"Searching with {engine_name}: {query}")

        try:
            # Add a slight delay to avoid being blocked
            time.sleep(random.uniform(self.min_delay, self.max_delay))

            response = requests.get(
                search_url,
                headers=self.headers,
                timeout=self.timeout
            )

            if response.status_code == 200:
                # Extract URLs from search results
                urls = self.extract_urls_from_search_results(response.text)
                logger.info(f"Found {len(urls)} URLs from search results")
                return urls
            else:
                logger.warning(f"Search request failed with status code: {response.status_code}")
                return []

        except Exception as e:
            logger.error(f"Error during web search: {e}")
            return []

    # Enhanced connector_aiweb_webscraping.py modifications
    # Focus on the scrape_web_data method to ensure consistent behavior
    async def scrape_web_data(self, prompt: str) -> str:
        """
        Enhanced web scraping function that ensures consistent behavior.
        Always performs scraping for prompts and returns structured results.

        Args:
            prompt: The user prompt

        Returns:
            Scraped web data as text
        """
        try:
            # Ensure prompt is a string
            if isinstance(prompt, list):
                prompt = " ".join(prompt) if all(isinstance(item, str) for item in prompt) else str(prompt)
            elif not isinstance(prompt, str):
                prompt = str(prompt)

            # Log the scraping attempt
            logger.info(f"Starting web scraping with prompt: {prompt[:400]}...")

            # First check for explicit URLs in the prompt
            explicit_urls = self.extract_urls(prompt)

            # Always perform a search, even if explicit URLs are found.
            # This ensures more comprehensive results.
            keywords = self.extract_keywords(prompt)
            logger.info(f"Using keywords for search: {keywords}")

            # Search for relevant URLs
            search_urls = await self.search_web(keywords)

            # Combine explicit URLs with search results, prioritizing explicit URLs
            urls = []

            # Add explicit URLs first
            for url in explicit_urls:
                if url not in urls:
                    urls.append(url)

            # Then add search results, avoiding duplicates
            for url in search_urls:
                if url not in urls:
                    urls.append(url)

            # If no URLs were found by either method, try a simplified search
            if not urls:
                simplified_query = " ".join(prompt.split()[:8])  # Use first 8 words
                simplified_urls = await self.search_web(simplified_query)
                for url in simplified_urls:
                    if url not in urls:
                        urls.append(url)

            # Scrape content from URLs
            results = []
            scraped_count = 0

            if urls:
                logger.info(f"Found {len(urls)} URLs to scrape")

                for url in urls[:self.max_urls]:
                    try:
                        # Add a delay between requests
                        time.sleep(random.uniform(self.min_delay, self.max_delay))

                        content = self.scrape_url(url)
                        if content and len(content) > 100:  # Ensure meaningful content
                            results.append(content)
                            scraped_count += 1
                            logger.info(f"Successfully scraped: {url}")
                        else:
                            logger.warning(f"Insufficient content from: {url}")
                    except Exception as e:
                        logger.error(f"Error scraping {url}: {e}")

            # Create the final result with improved structure
            if results:
                logger.info(f"Successfully scraped {scraped_count} pages")

                # Format the results in a structured way for better agent understanding
                structured_result = f"# Web Scraping Results\n\nScraped {scraped_count} web sources based on: \"{prompt}\"\n\n"

                for i, result in enumerate(results):
                    structured_result += f"## Source {i+1}\n\n{result}\n\n---\n\n"

                return structured_result.strip()
            else:
                # If no real content was scraped, provide simulated data with a clear indication
                logger.warning("No content scraped, using simulated data")

                simulated_data = f"""
# Simulated Web Research Results for: {prompt}

## Notice
The web scraping system was unable to retrieve real data from the web.
The following information is provided as a placeholder to continue the workflow.

## Market Trends and Developments
- Latest analyses show significant growth in digital transformation
- Experts continue to forecast positive development for cloud-based solutions
- Current technologies improve efficiency by an average of 23%

## Leading Companies in the Sector
1. TechInnovators GmbH - Market share 28%
2. FutureWave AG - Market share 22%
3. ProgressTech Ltd. - Market share 17%

## Innovations and New Products
- Smart integration solutions for existing systems
- AI-powered automation processes
- Improved sustainability standards through new materials

*Note: This is simulated data provided because no actual web scraping was possible.*
""".strip()

                return simulated_data

        except Exception as e:
            logger.error(f"Error during web scraping: {e}")
            error_message = f"Web scraping could not be performed: {str(e)}"
            return error_message.strip()

    async def close(self):
        """
        Closes all open resources.
        """
        # Currently no resources to close
        pass


# Additional helper function to ensure the scraper agent always triggers web scraping
async def ensure_scraper_agent_scraping(agent_type: str, moderator_text: str, prompt: str,
                                        aiweb_scraper) -> Tuple[bool, str]:
    """
    Helper function to ensure the scraper agent always triggers web scraping.
    To be called from the _run_moderator_cycle method when a scraper agent is selected.

    Args:
        agent_type: Type of the selected agent
        moderator_text: Text from the moderator
        prompt: The original prompt
        aiweb_scraper: Web scraper service instance

    Returns:
        Tuple of (was_scraping_performed, scraped_data)
    """
    if agent_type != "scraper":
        return False, ""

    try:
        # Log that web scraping is being performed for the scraper agent
        logger.info(f"Ensuring web scraping for scraper agent with prompt: {prompt[:100]}...")

        # Extract a search query from the moderator text if possible
        search_query = prompt
        if moderator_text:
            # Try to extract a more specific query from moderator instructions
            query_patterns = [
                r"search for [\"'](.+?)[\"']",
                r"find information about [\"'](.+?)[\"']",
                r"research [\"'](.+?)[\"']",
                r"look up [\"'](.+?)[\"']"
            ]

            for pattern in query_patterns:
                match = re.search(pattern, moderator_text, re.IGNORECASE)
                if match:
                    extracted_query = match.group(1)
                    if len(extracted_query) > 10:  # Ensure it's a meaningful query
                        search_query = extracted_query
                        logger.info(f"Extracted search query from moderator: {search_query}")
                        break

        # Always perform the web scraping
        scraped_data = await aiweb_scraper.scrape_web_data(search_query)

        # Mark that scraping was performed
        return True, scraped_data

    except Exception as e:
        logger.error(f"Error ensuring web scraping for scraper agent: {e}")
        return True, f"Web scraping failed: {str(e)}"
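
# Minimal usage sketch (not part of the original module): assumes a configload
# configuration with a [Connector_AiWebscraping] section is available and that this
# file can be run directly for a quick manual test.
if __name__ == "__main__":
    import asyncio

    async def _demo():
        scraper = WebScrapingService()
        try:
            # Hypothetical prompt; a prompt containing explicit URLs would be
            # handled the same way by scrape_web_data.
            result = await scraper.scrape_web_data("Current trends in industrial automation")
            print(result[:1000])
        finally:
            await scraper.close()

    asyncio.run(_demo())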