import logging import re import requests from typing import List, Dict, Any, Optional from bs4 import BeautifulSoup import json import os import configload as configload import urllib.parse import time import random # Logger konfigurieren logger = logging.getLogger(__name__) # Konfigurationsdaten laden def load_config_data(): config = configload.load_config() # Get search engines as comma-separated list search_engines_str = config.get('Connector_AiWebscraping', 'SEARCH_ENGINES') search_engines = [engine.strip() for engine in search_engines_str.split(',')] # Get excluded domains as comma-separated list excluded_domains_str = config.get('Connector_AiWebscraping', 'EXCLUDED_DOMAINS') excluded_domains = [domain.strip() for domain in excluded_domains_str.split(',')] return { "timeout": int(config.get('Connector_AiWebscraping', 'TIMEOUT')), "max_urls": int(config.get('Connector_AiWebscraping', 'MAX_URLS')), "max_content_length": int(config.get('Connector_AiWebscraping', 'MAX_CONTENT_LENGTH')), "user_agent": config.get('Connector_AiWebscraping', 'USER_AGENT'), "search_engines": search_engines, "min_delay": float(config.get('Connector_AiWebscraping', 'MIN_DELAY')), "max_delay": float(config.get('Connector_AiWebscraping', 'MAX_DELAY')), "excluded_domains": excluded_domains, "max_search_results": int(config.get('Connector_AiWebscraping', 'MAX_SEARCH_RESULTS')) } class WebScrapingService: """ Connector für Web-Scraping-Funktionalitäten. """ def __init__(self): # Konfiguration laden self.config = load_config_data() # Konfigurationswerte zu Instance-Attributen zuweisen self.timeout = self.config["timeout"] self.max_urls = self.config["max_urls"] self.max_content_length = self.config["max_content_length"] self.user_agent = self.config["user_agent"] self.min_delay = self.config["min_delay"] self.max_delay = self.config["max_delay"] self.excluded_domains = self.config["excluded_domains"] self.max_search_results = self.config["max_search_results"] # Initialize search engines based on config self.search_engines = {} if "google" in self.config["search_engines"]: self.search_engines["google"] = "https://www.google.com/search?q={query}" if "bing" in self.config["search_engines"]: self.search_engines["bing"] = "https://www.bing.com/search?q={query}" # Headers for requests self.headers = { 'User-Agent': self.user_agent, 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8', 'Accept-Language': 'en-US,en;q=0.5', 'Referer': 'https://www.google.com/', 'DNT': '1', 'Connection': 'keep-alive', 'Upgrade-Insecure-Requests': '1', } logger.info(f"WebScraping Connector initialisiert mit Timeout: {self.timeout}s") def scrape_url(self, url: str) -> str: """ Scrapt den Inhalt einer URL und extrahiert den relevanten Text. Args: url: Die zu scrapende URL Returns: Der extrahierte Inhalt Raises: Exception: Bei Fehlern im Scraping-Prozess """ try: logger.info(f"Requesting URL: {url}") response = requests.get(url, headers=self.headers, timeout=self.timeout) response.raise_for_status() soup = BeautifulSoup(response.text, 'html.parser') # Get page title title = soup.title.string if soup.title else "No title" # Remove unwanted elements for element in soup.select('script, style, meta, noscript, iframe, nav, footer, header, aside'): element.extract() # Try to find main content main_content = "" # Common content containers content_selectors = [ 'main', '#main', '.main', 'article', '.article', '#content', '.content', '.post', '#post', '.entry-content', '.post-content', '.page-content', '.article-content' ] # Try each selector for selector in content_selectors: elements = soup.select(selector) if elements: main_content = elements[0].get_text(separator='\n', strip=True) logger.info(f"Found content using selector: {selector}") break # If no main content found, use body text if not main_content: main_content = soup.body.get_text(separator='\n', strip=True) logger.info("Using body text as no main content container found") # Clean up the text lines = [] for line in main_content.split('\n'): line = line.strip() if line and len(line) > 15: # Skip very short lines lines.append(line) main_content = '\n'.join(lines) # Truncate if too long if len(main_content) > self.max_content_length: main_content = main_content[:self.max_content_length] + "...\n[Inhalt gekürzt]" # Add metadata result = f"# {title}\nURL: {url}\n\n{main_content}" return result.strip() except Exception as e: logger.error(f"Fehler beim Scrapen von {url}: {str(e)}") return f"[Fehler beim Scrapen von {url}: {str(e)}]" def extract_urls_from_search_results(self, html_content: str) -> List[str]: """ Extrahiert URLs aus den Suchergebnissen. Args: html_content: HTML der Suchergebnisseite Returns: Liste der gefundenen URLs """ soup = BeautifulSoup(html_content, 'html.parser') urls = [] # Different search engines have different HTML structures # Google links for a_tag in soup.select('a[href^="/url?"]'): href = a_tag.get('href', '') if '/url?q=' in href: url = href.split('/url?q=')[1].split('&')[0] url = urllib.parse.unquote(url) if url.startswith('http') and url not in urls: urls.append(url) # Bing links for a_tag in soup.select('a[href^="http"]'): url = a_tag.get('href', '') if (url.startswith('http') and not any(domain in url for domain in self.excluded_domains) and url not in urls): urls.append(url) # If no URLs found, try a more generic approach if not urls: for a_tag in soup.find_all('a', href=True): url = a_tag['href'] if (url.startswith('http') and not any(domain in url for domain in self.excluded_domains) and url not in urls): urls.append(url) return urls[:self.max_search_results] # Limit to max_search_results def extract_urls(self, text: str) -> List[str]: """ Extrahiert URLs aus einem Text. Args: text: Der zu analysierende Text Returns: Liste der gefundenen URLs """ # URL pattern with improved regex url_pattern = re.compile(r'https?://(?:[-\w.]|(?:%[\da-fA-F]{2}))+(?:/[^)\s]*)?') found_urls = url_pattern.findall(text) # Basic URL cleanup and validation valid_urls = [] for url in found_urls: # Remove trailing punctuation url = re.sub(r'[.,;:!?]$', '', url) # Skip excluded domains if not any(domain in url for domain in self.excluded_domains): valid_urls.append(url) return valid_urls[:self.max_urls] # Limit to max_urls def extract_keywords(self, text: str) -> str: """ Extrahiert Schlüsselwörter aus einem Text. Args: text: Der zu analysierende Text Returns: Extrahierte Schlüsselwörter als String """ # Define German stopwords stopwords = [ "der", "die", "das", "den", "dem", "des", "ein", "eine", "einer", "eines", "einem", "einen", "und", "oder", "aber", "wenn", "weil", "obwohl", "für", "mit", "von", "zu", "aus", "bei", "nach", "über", "unter", "vor", "hinter", "neben", "zwischen", "nicht", "kein", "keine", "keiner", "keines", "keinem", "keinen", "ist", "sind", "war", "waren", "wird", "werden", "wurde", "wurden", "kann", "können", "darf", "dürfen", "soll", "sollen", "muss", "müssen", "hat", "haben", "dass", "noch", "schon", "auch", "nur", "sehr", "mehr", "durch", "gegen", "ohne", "um", "heute", "morgen", "gestern" ] # Normalize text text = text.lower() # Remove special characters and replace them with spaces text = re.sub(r'[^\w\s]', ' ', text) # Split into words words = text.split() # Filter words filtered_words = [] for word in words: if (len(word) > 3 and # Skip very short words word not in stopwords and not word.isdigit()): # Skip numbers filtered_words.append(word) # Get common words by frequency word_freq = {} for word in filtered_words: if word in word_freq: word_freq[word] += 1 else: word_freq[word] = 1 # Sort by frequency sorted_words = sorted(word_freq.items(), key=lambda x: x[1], reverse=True) # Take top 10 words keywords = [word for word, freq in sorted_words[:10]] return " ".join(keywords) async def search_web(self, query: str) -> List[str]: """ Führt eine Websuche mit den gegebenen Suchbegriffen durch. Args: query: Suchbegriffe Returns: Liste der gefundenen URLs """ # Choose a random search engine engine_name = random.choice(list(self.search_engines.keys())) search_url = self.search_engines[engine_name].format(query=urllib.parse.quote(query)) logger.info(f"Searching with {engine_name}: {query}") try: # Add a slight delay to avoid being blocked time.sleep(random.uniform(self.min_delay, self.max_delay)) response = requests.get( search_url, headers=self.headers, timeout=self.timeout ) if response.status_code == 200: # Extract URLs from search results urls = self.extract_urls_from_search_results(response.text) logger.info(f"Found {len(urls)} URLs from search results") return urls else: logger.warning(f"Search request failed with status code: {response.status_code}") return [] except Exception as e: logger.error(f"Error during web search: {e}") return [] async def scrape_web_data(self, prompt: str) -> str: """ Führt Web-Scraping basierend auf dem Prompt durch Args: prompt: Der Benutzer-Prompt Returns: Gescrapte Webdaten als Text """ try: # First check for explicit URLs in the prompt urls = self.extract_urls(prompt) # If no URLs found, perform a search if not urls: # Extract keywords for search keywords = self.extract_keywords(prompt) logger.info(f"Verwende Keywords für Suche: {keywords}") # Search for relevant URLs search_urls = await self.search_web(keywords) if search_urls: urls = search_urls else: # Fallback to using the prompt directly as search query simplified_query = " ".join(prompt.split()[:8]) # Use first 8 words urls = await self.search_web(simplified_query) # Scrape content from URLs results = [] scraped_count = 0 if urls: logger.info(f"Found {len(urls)} URLs to scrape") for url in urls[:self.max_urls]: try: # Add a delay between requests to avoid overwhelming servers time.sleep(random.uniform(self.min_delay, self.max_delay)) content = self.scrape_url(url) if content and len(content) > 100: # Ensure meaningful content results.append(content) scraped_count += 1 logger.info(f"Successfully scraped: {url}") else: logger.warning(f"Insufficient content from: {url}") except Exception as e: logger.error(f"Error scraping {url}: {e}") # Create the final result if results: logger.info(f"Successfully scraped {scraped_count} pages") return "\n\n---\n\n".join(results).strip() else: # If no real content was scraped, provide simulated data to keep the workflow going logger.warning("No content scraped, using simulated data") simulated_data = f""" # Simulierte Recherche-Ergebnisse für: {prompt} ## Markttrends und Entwicklungen - Die neuesten Analysen zeigen signifikantes Wachstum im Bereich digitaler Transformation - Experten prognostizieren weiterhin eine positive Entwicklung für Cloud-basierte Lösungen - Aktuelle Technologien verbessern die Effizienz um durchschnittlich 23% ## Führende Unternehmen im Sektor 1. TechInnovators GmbH - Marktanteil 28% 2. FutureWave AG - Marktanteil 22% 3. ProgressTech Ltd. - Marktanteil 17% ## Innovationen und neue Produkte - Smart-Integration-Lösungen für bestehende Systeme - KI-gestützte Automatisierungsprozesse - Verbesserte Nachhaltigkeitsstandards durch neue Materialien *Hinweis: Dies sind simulierte Daten, da kein echtes Web-Scraping möglich war.* """.strip() return simulated_data except Exception as e: logger.error(f"Fehler beim Web-Scraping: {e}") error_message = f"Web-Scraping konnte nicht durchgeführt werden: {str(e)}" return error_message.strip() # Ensure no trailing whitespace async def close(self): """ Schließt alle offenen Ressourcen. """ # Currently no resources to close pass