# gateway/gwserver/connector_aiweb_webscraping.py

import logging
import re
import requests
from typing import List, Dict, Any, Optional, Tuple
from bs4 import BeautifulSoup
import json
import os
import configload
import urllib.parse
import time
import random
import pandas as pd
# Configure logger
logger = logging.getLogger(__name__)
# Load configuration data
def load_config_data():
config = configload.load_config()
# Get search engines as comma-separated list
search_engines_str = config.get('Connector_AiWebscraping', 'SEARCH_ENGINES')
search_engines = [engine.strip() for engine in search_engines_str.split(',')]
# Get excluded domains as comma-separated list
excluded_domains_str = config.get('Connector_AiWebscraping', 'EXCLUDED_DOMAINS')
excluded_domains = [domain.strip() for domain in excluded_domains_str.split(',')]
return {
"timeout": int(config.get('Connector_AiWebscraping', 'TIMEOUT')),
"max_urls": int(config.get('Connector_AiWebscraping', 'MAX_URLS')),
"max_content_length": int(config.get('Connector_AiWebscraping', 'MAX_CONTENT_LENGTH')),
"user_agent": config.get('Connector_AiWebscraping', 'USER_AGENT'),
"search_engines": search_engines,
"min_delay": float(config.get('Connector_AiWebscraping', 'MIN_DELAY')),
"max_delay": float(config.get('Connector_AiWebscraping', 'MAX_DELAY')),
"excluded_domains": excluded_domains,
"max_search_results": int(config.get('Connector_AiWebscraping', 'MAX_SEARCH_RESULTS'))
}
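
# Illustrative sketch of the expected [Connector_AiWebscraping] INI section
# (the keys are taken from load_config_data above; the values are assumptions
# for demonstration, not the shipped configuration):
#
#   [Connector_AiWebscraping]
#   TIMEOUT = 10
#   MAX_URLS = 5
#   MAX_CONTENT_LENGTH = 8000
#   USER_AGENT = Mozilla/5.0 (Windows NT 10.0; Win64; x64)
#   SEARCH_ENGINES = google, bing
#   MIN_DELAY = 1.0
#   MAX_DELAY = 3.0
#   EXCLUDED_DOMAINS = google.com, bing.com, youtube.com
#   MAX_SEARCH_RESULTS = 10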
class WebScrapingService:
"""
    Connector for web scraping functionality.
"""
def __init__(self):
        # Load configuration
self.config = load_config_data()
        # Assign configuration values to instance attributes
self.timeout = self.config["timeout"]
self.max_urls = self.config["max_urls"]
self.max_content_length = self.config["max_content_length"]
self.user_agent = self.config["user_agent"]
self.min_delay = self.config["min_delay"]
self.max_delay = self.config["max_delay"]
self.excluded_domains = self.config["excluded_domains"]
self.max_search_results = self.config["max_search_results"]
# Initialize search engines based on config
self.search_engines = {}
if "google" in self.config["search_engines"]:
self.search_engines["google"] = "https://www.google.com/search?q={query}"
if "bing" in self.config["search_engines"]:
self.search_engines["bing"] = "https://www.bing.com/search?q={query}"
# Headers for requests
self.headers = {
'User-Agent': self.user_agent,
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
'Accept-Language': 'en-US,en;q=0.5',
'Referer': 'https://www.google.com/',
'DNT': '1',
'Connection': 'keep-alive',
'Upgrade-Insecure-Requests': '1',
}
logger.info(f"WebScraping Connector initialisiert mit Timeout: {self.timeout}s")
def scrape_url(self, url: str) -> str:
"""
Scrapt den Inhalt einer URL und extrahiert den relevanten Text.
Args:
url: Die zu scrapende URL
Returns:
Der extrahierte Inhalt
Raises:
Exception: Bei Fehlern im Scraping-Prozess
"""
try:
logger.info(f"Requesting URL: {url}")
response = requests.get(url, headers=self.headers, timeout=self.timeout)
response.raise_for_status()
soup = BeautifulSoup(response.text, 'html.parser')
# Get page title
            title = soup.title.get_text(strip=True) if soup.title else "No title"
# Remove unwanted elements
for element in soup.select('script, style, meta, noscript, iframe, nav, footer, header, aside'):
element.extract()
# Try to find main content
main_content = ""
# Common content containers
content_selectors = [
'main', '#main', '.main',
'article', '.article',
'#content', '.content',
'.post', '#post',
'.entry-content', '.post-content',
'.page-content', '.article-content'
]
# Try each selector
for selector in content_selectors:
elements = soup.select(selector)
if elements:
main_content = elements[0].get_text(separator='\n', strip=True)
logger.info(f"Found content using selector: {selector}")
break
# If no main content found, use body text
            if not main_content and soup.body:
main_content = soup.body.get_text(separator='\n', strip=True)
logger.info("Using body text as no main content container found")
# Clean up the text
lines = []
for line in main_content.split('\n'):
line = line.strip()
if line and len(line) > 15: # Skip very short lines
lines.append(line)
main_content = '\n'.join(lines)
# Truncate if too long
if len(main_content) > self.max_content_length:
                main_content = main_content[:self.max_content_length] + "...\n[Content truncated]"
# Add metadata
result = f"# {title}\nURL: {url}\n\n{main_content}"
return result.strip()
except Exception as e:
logger.error(f"Fehler beim Scrapen von {url}: {str(e)}")
return f"[Fehler beim Scrapen von {url}: {str(e)}]"
def extract_urls_from_search_results(self, html_content: str) -> List[str]:
"""
Extrahiert URLs aus den Suchergebnissen.
Args:
html_content: HTML der Suchergebnisseite
Returns:
Liste der gefundenen URLs
"""
soup = BeautifulSoup(html_content, 'html.parser')
urls = []
# Different search engines have different HTML structures
# Google links
for a_tag in soup.select('a[href^="/url?"]'):
href = a_tag.get('href', '')
if '/url?q=' in href:
url = href.split('/url?q=')[1].split('&')[0]
url = urllib.parse.unquote(url)
if url.startswith('http') and url not in urls:
urls.append(url)
# Bing links
for a_tag in soup.select('a[href^="http"]'):
url = a_tag.get('href', '')
if (url.startswith('http') and
not any(domain in url for domain in self.excluded_domains) and
url not in urls):
urls.append(url)
# If no URLs found, try a more generic approach
if not urls:
for a_tag in soup.find_all('a', href=True):
url = a_tag['href']
if (url.startswith('http') and
not any(domain in url for domain in self.excluded_domains) and
url not in urls):
urls.append(url)
return urls[:self.max_search_results] # Limit to max_search_results
def extract_urls(self, text: str) -> List[str]:
"""
Extrahiert URLs aus einem Text.
Args:
text: Der zu analysierende Text
Returns:
Liste der gefundenen URLs
"""
        # Match http(s) URLs: hosts of word characters, dots, hyphens, or percent-encodings, plus an optional path
url_pattern = re.compile(r'https?://(?:[-\w.]|(?:%[\da-fA-F]{2}))+(?:/[^)\s]*)?')
found_urls = url_pattern.findall(text)
# Basic URL cleanup and validation
valid_urls = []
for url in found_urls:
# Remove trailing punctuation
url = re.sub(r'[.,;:!?]$', '', url)
# Skip excluded domains
if not any(domain in url for domain in self.excluded_domains):
valid_urls.append(url)
return valid_urls[:self.max_urls] # Limit to max_urls
def extract_keywords(self, text: str) -> str:
"""
Extrahiert Schlüsselwörter aus einem Text.
Args:
text: Der zu analysierende Text
Returns:
Extrahierte Schlüsselwörter als String
"""
# Define German stopwords
stopwords = [
"der", "die", "das", "den", "dem", "des",
"ein", "eine", "einer", "eines", "einem", "einen",
"und", "oder", "aber", "wenn", "weil", "obwohl",
"für", "mit", "von", "zu", "aus", "bei", "nach",
"über", "unter", "vor", "hinter", "neben", "zwischen",
"nicht", "kein", "keine", "keiner", "keines", "keinem", "keinen",
"ist", "sind", "war", "waren", "wird", "werden", "wurde", "wurden",
"kann", "können", "darf", "dürfen", "soll", "sollen", "muss", "müssen",
"hat", "haben", "dass", "noch", "schon", "auch", "nur", "sehr", "mehr",
"durch", "gegen", "ohne", "um", "heute", "morgen", "gestern"
]
# Normalize text
text = text.lower()
# Remove special characters and replace them with spaces
text = re.sub(r'[^\w\s]', ' ', text)
# Split into words
words = text.split()
# Filter words
filtered_words = []
for word in words:
if (len(word) > 3 and # Skip very short words
word not in stopwords and
not word.isdigit()): # Skip numbers
filtered_words.append(word)
# Get common words by frequency
word_freq = {}
for word in filtered_words:
if word in word_freq:
word_freq[word] += 1
else:
word_freq[word] = 1
# Sort by frequency
sorted_words = sorted(word_freq.items(), key=lambda x: x[1], reverse=True)
# Take top 10 words
keywords = [word for word, freq in sorted_words[:10]]
return " ".join(keywords)
async def search_web(self, query: str) -> List[str]:
"""
Führt eine Websuche mit den gegebenen Suchbegriffen durch.
Args:
query: Suchbegriffe
Returns:
Liste der gefundenen URLs
"""
        # Choose a random search engine (guard against an empty configuration)
        if not self.search_engines:
            logger.warning("No search engines configured; skipping web search")
            return []
        engine_name = random.choice(list(self.search_engines.keys()))
search_url = self.search_engines[engine_name].format(query=urllib.parse.quote(query))
logger.info(f"Searching with {engine_name}: {query}")
try:
# Add a slight delay to avoid being blocked
time.sleep(random.uniform(self.min_delay, self.max_delay))
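            # Note: time.sleep and requests.get block the event loop even though
            # this method is async; asyncio.sleep and an async HTTP client would
            # be the non-blocking alternative.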
response = requests.get(
search_url,
headers=self.headers,
timeout=self.timeout
)
if response.status_code == 200:
# Extract URLs from search results
urls = self.extract_urls_from_search_results(response.text)
logger.info(f"Found {len(urls)} URLs from search results")
return urls
else:
logger.warning(f"Search request failed with status code: {response.status_code}")
return []
except Exception as e:
logger.error(f"Error during web search: {e}")
return []
async def scrape_web_data(self, prompt: str) -> str:
"""
Enhanced web scraping function that ensures consistent behavior.
Always performs scraping for prompts and returns structured results.
Args:
prompt: The user prompt
Returns:
Scraped web data as text
"""
try:
# Ensure prompt is a string
if isinstance(prompt, list):
prompt = " ".join(prompt) if all(isinstance(item, str) for item in prompt) else str(prompt)
elif not isinstance(prompt, str):
prompt = str(prompt)
# Log the scraping attempt
logger.info(f"Starting web scraping with prompt: {prompt[:400]}...")
# First check for explicit URLs in the prompt
explicit_urls = self.extract_urls(prompt)
# Always perform search, even if explicit URLs are found
# This ensures more comprehensive results
keywords = self.extract_keywords(prompt)
logger.info(f"Using keywords for search: {keywords}")
# Search for relevant URLs
search_urls = await self.search_web(keywords)
# Combine explicit URLs with search results, prioritizing explicit URLs
urls = []
# Add explicit URLs first
for url in explicit_urls:
if url not in urls:
urls.append(url)
# Then add search results, avoiding duplicates
for url in search_urls:
if url not in urls:
urls.append(url)
# If no URLs found after both methods, try a simplified search
if not urls:
simplified_query = " ".join(prompt.split()[:8]) # Use first 8 words
simplified_urls = await self.search_web(simplified_query)
for url in simplified_urls:
if url not in urls:
urls.append(url)
# Scrape content from URLs
results = []
scraped_count = 0
if urls:
logger.info(f"Found {len(urls)} URLs to scrape")
for url in urls[:self.max_urls]:
try:
# Add a delay between requests
time.sleep(random.uniform(self.min_delay, self.max_delay))
content = self.scrape_url(url)
if content and len(content) > 100: # Ensure meaningful content
results.append(content)
scraped_count += 1
logger.info(f"Successfully scraped: {url}")
else:
logger.warning(f"Insufficient content from: {url}")
except Exception as e:
logger.error(f"Error scraping {url}: {e}")
# Create the final result with improved structure
if results:
logger.info(f"Successfully scraped {scraped_count} pages")
# Format the results in a structured way for better agent understanding
structured_result = f"# Web Scraping Results\n\nScraped {scraped_count} web sources based on: \"{prompt}\"\n\n"
for i, result in enumerate(results):
structured_result += f"## Source {i+1}\n\n{result}\n\n---\n\n"
return structured_result.strip()
else:
# If no real content was scraped, provide simulated data with clear indication
logger.warning("No content scraped, using simulated data")
simulated_data = f"""
# Simulated Web Research Results for: {prompt}
## Notice
The web scraping system was unable to retrieve real data from the web.
The following information is provided as a placeholder to continue the workflow.
## Market Trends and Developments
- Latest analyses show significant growth in digital transformation
- Experts continue to forecast positive development for cloud-based solutions
- Current technologies improve efficiency by an average of 23%
## Leading Companies in the Sector
1. TechInnovators GmbH - Market share 28%
2. FutureWave AG - Market share 22%
3. ProgressTech Ltd. - Market share 17%
## Innovations and New Products
- Smart integration solutions for existing systems
- AI-powered automation processes
- Improved sustainability standards through new materials
*Note: This is simulated data provided because no actual web scraping was possible.*
""".strip()
return simulated_data
except Exception as e:
logger.error(f"Error during web scraping: {e}")
error_message = f"Web scraping could not be performed: {str(e)}"
return error_message.strip()
    async def close(self):
        """
        Closes any open resources.
        """
        # Currently no resources to close
        pass


# Additional module-level helper to ensure the scraper agent always triggers
# web scraping. It lives outside the class because it receives the scraper
# service instance as an explicit argument.
async def ensure_scraper_agent_scraping(agent_type: str, moderator_text: str, prompt: str, aiweb_scraper) -> Tuple[bool, str]:
    """
    Helper function to ensure the scraper agent always triggers web scraping.
    To be called from the _run_moderator_cycle method when a scraper agent is selected.

    Args:
        agent_type: Type of the selected agent
        moderator_text: Text from the moderator
        prompt: The original prompt
        aiweb_scraper: Web scraper service instance

    Returns:
        Tuple of (was_scraping_performed, scraped_data)
    """
    if agent_type != "scraper":
        return False, ""
    try:
        # Log that web scraping is being performed for the scraper agent
        logger.info(f"Ensuring web scraping for scraper agent with prompt: {prompt[:100]}...")
        # Extract a search query from the moderator text if possible
        search_query = prompt
        if moderator_text:
            # Try to extract a more specific query from moderator instructions
            query_patterns = [
                r"search for [\"'](.+?)[\"']",
                r"find information about [\"'](.+?)[\"']",
                r"research [\"'](.+?)[\"']",
                r"look up [\"'](.+?)[\"']"
            ]
            for pattern in query_patterns:
                match = re.search(pattern, moderator_text, re.IGNORECASE)
                if match:
                    extracted_query = match.group(1)
                    if len(extracted_query) > 10:  # Ensure it's a meaningful query
                        search_query = extracted_query
                        logger.info(f"Extracted search query from moderator: {search_query}")
                        break
        # Always perform the web scraping
        scraped_data = await aiweb_scraper.scrape_web_data(search_query)
        # Mark that scraping was performed
        return True, scraped_data
    except Exception as e:
        logger.error(f"Error ensuring web scraping for scraper agent: {e}")
        return True, f"Web scraping failed: {str(e)}"
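

# --- Usage sketch (illustrative addition, assuming a working configload setup
# with a [Connector_AiWebscraping] section; the prompt below is a placeholder) ---
if __name__ == "__main__":
    import asyncio

    async def _demo():
        service = WebScrapingService()
        # scrape_web_data merges explicit URLs from the prompt with search results
        result = await service.scrape_web_data(
            "Current trends in digital transformation https://example.com")
        print(result[:500])
        # The module-level helper is meant to be wired into a moderator cycle
        performed, data = await ensure_scraper_agent_scraping(
            "scraper", "", "Current trends in digital transformation", service)
        print(f"Scraping performed: {performed}, {len(data)} characters")
        await service.close()

    asyncio.run(_demo())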