# gateway/gwserver/connector_aiweb_webscraping.py

import asyncio
import logging
import re
import requests
from typing import List
from bs4 import BeautifulSoup
import configload
import urllib.parse
import time
import random
# Configure logger
logger = logging.getLogger(__name__)
# Load configuration data
def load_config_data():
config = configload.load_config()
# Get search engines as comma-separated list
search_engines_str = config.get('Connector_AiWebscraping', 'SEARCH_ENGINES')
search_engines = [engine.strip() for engine in search_engines_str.split(',')]
# Get excluded domains as comma-separated list
excluded_domains_str = config.get('Connector_AiWebscraping', 'EXCLUDED_DOMAINS')
excluded_domains = [domain.strip() for domain in excluded_domains_str.split(',')]
return {
"timeout": int(config.get('Connector_AiWebscraping', 'TIMEOUT')),
"max_urls": int(config.get('Connector_AiWebscraping', 'MAX_URLS')),
"max_content_length": int(config.get('Connector_AiWebscraping', 'MAX_CONTENT_LENGTH')),
"user_agent": config.get('Connector_AiWebscraping', 'USER_AGENT'),
"search_engines": search_engines,
"min_delay": float(config.get('Connector_AiWebscraping', 'MIN_DELAY')),
"max_delay": float(config.get('Connector_AiWebscraping', 'MAX_DELAY')),
"excluded_domains": excluded_domains,
"max_search_results": int(config.get('Connector_AiWebscraping', 'MAX_SEARCH_RESULTS'))
}
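
# Illustrative example of the expected [Connector_AiWebscraping] INI section.
# The key names are taken from load_config_data() above; the concrete values
# are assumptions, not taken from an actual deployment configuration:
#
#   [Connector_AiWebscraping]
#   TIMEOUT = 15
#   MAX_URLS = 3
#   MAX_CONTENT_LENGTH = 8000
#   USER_AGENT = Mozilla/5.0 (compatible; ExampleBot/1.0)
#   SEARCH_ENGINES = google, bing
#   MIN_DELAY = 1.0
#   MAX_DELAY = 3.0
#   EXCLUDED_DOMAINS = google.com, bing.com, youtube.com
#   MAX_SEARCH_RESULTS = 5
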
class WebScrapingService:
"""
Connector für Web-Scraping-Funktionalitäten.
"""
def __init__(self):
        # Load configuration
        self.config = load_config_data()
        # Assign configuration values to instance attributes
self.timeout = self.config["timeout"]
self.max_urls = self.config["max_urls"]
self.max_content_length = self.config["max_content_length"]
self.user_agent = self.config["user_agent"]
self.min_delay = self.config["min_delay"]
self.max_delay = self.config["max_delay"]
self.excluded_domains = self.config["excluded_domains"]
self.max_search_results = self.config["max_search_results"]
# Initialize search engines based on config
self.search_engines = {}
if "google" in self.config["search_engines"]:
self.search_engines["google"] = "https://www.google.com/search?q={query}"
if "bing" in self.config["search_engines"]:
self.search_engines["bing"] = "https://www.bing.com/search?q={query}"
# Headers for requests
self.headers = {
'User-Agent': self.user_agent,
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
'Accept-Language': 'en-US,en;q=0.5',
'Referer': 'https://www.google.com/',
'DNT': '1',
'Connection': 'keep-alive',
'Upgrade-Insecure-Requests': '1',
}
logger.info(f"WebScraping Connector initialisiert mit Timeout: {self.timeout}s")
def scrape_url(self, url: str) -> str:
"""
Scrapt den Inhalt einer URL und extrahiert den relevanten Text.
Args:
url: Die zu scrapende URL
Returns:
Der extrahierte Inhalt
Raises:
Exception: Bei Fehlern im Scraping-Prozess
"""
try:
logger.info(f"Requesting URL: {url}")
response = requests.get(url, headers=self.headers, timeout=self.timeout)
response.raise_for_status()
soup = BeautifulSoup(response.text, 'html.parser')
# Get page title
            title = soup.title.string.strip() if soup.title and soup.title.string else "No title"
# Remove unwanted elements
for element in soup.select('script, style, meta, noscript, iframe, nav, footer, header, aside'):
element.extract()
# Try to find main content
main_content = ""
# Common content containers
content_selectors = [
'main', '#main', '.main',
'article', '.article',
'#content', '.content',
'.post', '#post',
'.entry-content', '.post-content',
'.page-content', '.article-content'
]
# Try each selector
for selector in content_selectors:
elements = soup.select(selector)
if elements:
main_content = elements[0].get_text(separator='\n', strip=True)
logger.info(f"Found content using selector: {selector}")
break
            # If no main content container found, fall back to the body (or whole document) text
            if not main_content:
                body = soup.body or soup
                main_content = body.get_text(separator='\n', strip=True)
                logger.info("Using body text as no main content container found")
# Clean up the text
lines = []
for line in main_content.split('\n'):
line = line.strip()
if line and len(line) > 15: # Skip very short lines
lines.append(line)
main_content = '\n'.join(lines)
# Truncate if too long
if len(main_content) > self.max_content_length:
main_content = main_content[:self.max_content_length] + "...\n[Inhalt gekürzt]"
# Add metadata
result = f"# {title}\nURL: {url}\n\n{main_content}"
return result.strip()
except Exception as e:
logger.error(f"Fehler beim Scrapen von {url}: {str(e)}")
return f"[Fehler beim Scrapen von {url}: {str(e)}]"
def extract_urls_from_search_results(self, html_content: str) -> List[str]:
"""
Extrahiert URLs aus den Suchergebnissen.
Args:
html_content: HTML der Suchergebnisseite
Returns:
Liste der gefundenen URLs
"""
soup = BeautifulSoup(html_content, 'html.parser')
urls = []
# Different search engines have different HTML structures
# Google links
for a_tag in soup.select('a[href^="/url?"]'):
href = a_tag.get('href', '')
if '/url?q=' in href:
url = href.split('/url?q=')[1].split('&')[0]
url = urllib.parse.unquote(url)
if url.startswith('http') and url not in urls:
urls.append(url)
# Bing links
for a_tag in soup.select('a[href^="http"]'):
url = a_tag.get('href', '')
if (url.startswith('http') and
not any(domain in url for domain in self.excluded_domains) and
url not in urls):
urls.append(url)
# If no URLs found, try a more generic approach
if not urls:
for a_tag in soup.find_all('a', href=True):
url = a_tag['href']
if (url.startswith('http') and
not any(domain in url for domain in self.excluded_domains) and
url not in urls):
urls.append(url)
return urls[:self.max_search_results] # Limit to max_search_results
def extract_urls(self, text: str) -> List[str]:
"""
Extrahiert URLs aus einem Text.
Args:
text: Der zu analysierende Text
Returns:
Liste der gefundenen URLs
"""
# URL pattern with improved regex
url_pattern = re.compile(r'https?://(?:[-\w.]|(?:%[\da-fA-F]{2}))+(?:/[^)\s]*)?')
found_urls = url_pattern.findall(text)
# Basic URL cleanup and validation
valid_urls = []
for url in found_urls:
# Remove trailing punctuation
url = re.sub(r'[.,;:!?]$', '', url)
# Skip excluded domains
if not any(domain in url for domain in self.excluded_domains):
valid_urls.append(url)
return valid_urls[:self.max_urls] # Limit to max_urls
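
    # Illustrative behaviour of extract_urls(), assuming example.com is not in
    # EXCLUDED_DOMAINS; the trailing period is stripped by the cleanup step:
    #
    #   service.extract_urls("Siehe https://example.com/page. fuer Details")
    #   -> ["https://example.com/page"]
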
def extract_keywords(self, text: str) -> str:
"""
Extrahiert Schlüsselwörter aus einem Text.
Args:
text: Der zu analysierende Text
Returns:
Extrahierte Schlüsselwörter als String
"""
# Define German stopwords
stopwords = [
"der", "die", "das", "den", "dem", "des",
"ein", "eine", "einer", "eines", "einem", "einen",
"und", "oder", "aber", "wenn", "weil", "obwohl",
"für", "mit", "von", "zu", "aus", "bei", "nach",
"über", "unter", "vor", "hinter", "neben", "zwischen",
"nicht", "kein", "keine", "keiner", "keines", "keinem", "keinen",
"ist", "sind", "war", "waren", "wird", "werden", "wurde", "wurden",
"kann", "können", "darf", "dürfen", "soll", "sollen", "muss", "müssen",
"hat", "haben", "dass", "noch", "schon", "auch", "nur", "sehr", "mehr",
"durch", "gegen", "ohne", "um", "heute", "morgen", "gestern"
]
# Normalize text
text = text.lower()
# Remove special characters and replace them with spaces
text = re.sub(r'[^\w\s]', ' ', text)
# Split into words
words = text.split()
# Filter words
filtered_words = []
for word in words:
if (len(word) > 3 and # Skip very short words
word not in stopwords and
not word.isdigit()): # Skip numbers
filtered_words.append(word)
# Get common words by frequency
word_freq = {}
for word in filtered_words:
if word in word_freq:
word_freq[word] += 1
else:
word_freq[word] = 1
# Sort by frequency
sorted_words = sorted(word_freq.items(), key=lambda x: x[1], reverse=True)
# Take top 10 words
keywords = [word for word, freq in sorted_words[:10]]
return " ".join(keywords)
async def search_web(self, query: str) -> List[str]:
"""
Führt eine Websuche mit den gegebenen Suchbegriffen durch.
Args:
query: Suchbegriffe
Returns:
Liste der gefundenen URLs
"""
# Choose a random search engine
engine_name = random.choice(list(self.search_engines.keys()))
search_url = self.search_engines[engine_name].format(query=urllib.parse.quote(query))
logger.info(f"Searching with {engine_name}: {query}")
try:
            # Add a slight delay to avoid being blocked; asyncio.sleep keeps the
            # event loop responsive (the requests call itself is still blocking)
            await asyncio.sleep(random.uniform(self.min_delay, self.max_delay))
response = requests.get(
search_url,
headers=self.headers,
timeout=self.timeout
)
if response.status_code == 200:
# Extract URLs from search results
urls = self.extract_urls_from_search_results(response.text)
logger.info(f"Found {len(urls)} URLs from search results")
return urls
else:
logger.warning(f"Search request failed with status code: {response.status_code}")
return []
except Exception as e:
logger.error(f"Error during web search: {e}")
return []
async def scrape_web_data(self, prompt: str) -> str:
"""
Führt Web-Scraping basierend auf dem Prompt durch
Args:
prompt: Der Benutzer-Prompt
Returns:
Gescrapte Webdaten als Text
"""
try:
# First check for explicit URLs in the prompt
urls = self.extract_urls(prompt)
# If no URLs found, perform a search
if not urls:
# Extract keywords for search
keywords = self.extract_keywords(prompt)
logger.info(f"Verwende Keywords für Suche: {keywords}")
# Search for relevant URLs
search_urls = await self.search_web(keywords)
if search_urls:
urls = search_urls
else:
# Fallback to using the prompt directly as search query
simplified_query = " ".join(prompt.split()[:8]) # Use first 8 words
urls = await self.search_web(simplified_query)
# Scrape content from URLs
results = []
scraped_count = 0
if urls:
logger.info(f"Found {len(urls)} URLs to scrape")
for url in urls[:self.max_urls]:
try:
                        # Add a delay between requests to avoid overwhelming servers;
                        # asyncio.sleep avoids blocking the event loop
                        await asyncio.sleep(random.uniform(self.min_delay, self.max_delay))
content = self.scrape_url(url)
                        # Ensure meaningful content and skip the error strings
                        # that scrape_url returns on failure
                        if content and len(content) > 100 and not content.startswith("[Fehler"):
results.append(content)
scraped_count += 1
logger.info(f"Successfully scraped: {url}")
else:
logger.warning(f"Insufficient content from: {url}")
except Exception as e:
logger.error(f"Error scraping {url}: {e}")
# Create the final result
if results:
logger.info(f"Successfully scraped {scraped_count} pages")
return "\n\n---\n\n".join(results).strip()
else:
# If no real content was scraped, provide simulated data to keep the workflow going
logger.warning("No content scraped, using simulated data")
simulated_data = f"""
# Simulierte Recherche-Ergebnisse für: {prompt}
## Markttrends und Entwicklungen
- Die neuesten Analysen zeigen signifikantes Wachstum im Bereich digitaler Transformation
- Experten prognostizieren weiterhin eine positive Entwicklung für Cloud-basierte Lösungen
- Aktuelle Technologien verbessern die Effizienz um durchschnittlich 23%
## Führende Unternehmen im Sektor
1. TechInnovators GmbH - Marktanteil 28%
2. FutureWave AG - Marktanteil 22%
3. ProgressTech Ltd. - Marktanteil 17%
## Innovationen und neue Produkte
- Smart-Integration-Lösungen für bestehende Systeme
- KI-gestützte Automatisierungsprozesse
- Verbesserte Nachhaltigkeitsstandards durch neue Materialien
*Hinweis: Dies sind simulierte Daten, da kein echtes Web-Scraping möglich war.*
""".strip()
return simulated_data
except Exception as e:
logger.error(f"Fehler beim Web-Scraping: {e}")
error_message = f"Web-Scraping konnte nicht durchgeführt werden: {str(e)}"
return error_message.strip() # Ensure no trailing whitespace
async def close(self):
"""
Schließt alle offenen Ressourcen.
"""
# Currently no resources to close
pass
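

# Minimal usage sketch: a manual test that assumes configload.load_config()
# can locate a config file containing the [Connector_AiWebscraping] section
# sketched near the top of this file. The prompt text is a placeholder.
if __name__ == "__main__":
    async def _demo() -> None:
        service = WebScrapingService()
        try:
            # Scrapes the URL mentioned in the prompt, or falls back to a web search
            result = await service.scrape_web_data(
                "Aktuelle Trends bei Cloud-Lösungen, siehe https://example.com"
            )
            print(result[:500])
        finally:
            await service.close()

    asyncio.run(_demo())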