# gateway/gwserver/connector_aiweb_webscraping.py

import logging
import re
import requests
from typing import List, Dict, Any, Optional, Tuple
from bs4 import BeautifulSoup
import json
import os
import configload
import urllib.parse
import time
import random
import pandas as pd
# Configure logger
logger = logging.getLogger(__name__)
# Load configuration data
def load_config_data():
config = configload.load_config()
# Get search engines as comma-separated list
search_engines_str = config.get('Connector_AiWebscraping', 'SEARCH_ENGINES')
search_engines = [engine.strip() for engine in search_engines_str.split(',')]
# Get excluded domains as comma-separated list
excluded_domains_str = config.get('Connector_AiWebscraping', 'EXCLUDED_DOMAINS')
excluded_domains = [domain.strip() for domain in excluded_domains_str.split(',')]
return {
"timeout": int(config.get('Connector_AiWebscraping', 'TIMEOUT')),
"max_urls": int(config.get('Connector_AiWebscraping', 'MAX_URLS')),
"max_content_length": int(config.get('Connector_AiWebscraping', 'MAX_CONTENT_LENGTH')),
"user_agent": config.get('Connector_AiWebscraping', 'USER_AGENT'),
"search_engines": search_engines,
"min_delay": float(config.get('Connector_AiWebscraping', 'MIN_DELAY')),
"max_delay": float(config.get('Connector_AiWebscraping', 'MAX_DELAY')),
"excluded_domains": excluded_domains,
"max_search_results": int(config.get('Connector_AiWebscraping', 'MAX_SEARCH_RESULTS'))
}
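
# Illustrative sketch of the expected [Connector_AiWebscraping] INI section
# (the keys are taken from load_config_data above; the values are assumptions
# for demonstration, not the shipped configuration):
#
#   [Connector_AiWebscraping]
#   TIMEOUT = 10
#   MAX_URLS = 5
#   MAX_CONTENT_LENGTH = 8000
#   USER_AGENT = Mozilla/5.0 (Windows NT 10.0; Win64; x64)
#   SEARCH_ENGINES = google, bing
#   MIN_DELAY = 1.0
#   MAX_DELAY = 3.0
#   EXCLUDED_DOMAINS = google.com, bing.com, youtube.com
#   MAX_SEARCH_RESULTS = 10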
class WebScrapingService:
"""
    Connector for web scraping functionality.
"""
def __init__(self):
        # Load configuration
self.config = load_config_data()
        # Assign configuration values to instance attributes
self.timeout = self.config["timeout"]
self.max_urls = self.config["max_urls"]
self.max_content_length = self.config["max_content_length"]
self.user_agent = self.config["user_agent"]
self.min_delay = self.config["min_delay"]
self.max_delay = self.config["max_delay"]
self.excluded_domains = self.config["excluded_domains"]
self.max_search_results = self.config["max_search_results"]
# Initialize search engines based on config
self.search_engines = {}
if "google" in self.config["search_engines"]:
self.search_engines["google"] = "https://www.google.com/search?q={query}"
if "bing" in self.config["search_engines"]:
self.search_engines["bing"] = "https://www.bing.com/search?q={query}"
# Headers for requests
self.headers = {
'User-Agent': self.user_agent,
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
'Accept-Language': 'en-US,en;q=0.5',
'Referer': 'https://www.google.com/',
'DNT': '1',
'Connection': 'keep-alive',
'Upgrade-Insecure-Requests': '1',
}
logger.info(f"WebScraping Connector initialisiert mit Timeout: {self.timeout}s")
def scrape_url(self, url: str) -> str:
"""
Scrapt den Inhalt einer URL und extrahiert den relevanten Text.
Args:
url: Die zu scrapende URL
Returns:
Der extrahierte Inhalt
Raises:
Exception: Bei Fehlern im Scraping-Prozess
"""
try:
logger.info(f"Requesting URL: {url}")
response = requests.get(url, headers=self.headers, timeout=self.timeout)
response.raise_for_status()
soup = BeautifulSoup(response.text, 'html.parser')
# Get page title
            title = soup.title.get_text(strip=True) if soup.title else "No title"
# Remove unwanted elements
for element in soup.select('script, style, meta, noscript, iframe, nav, footer, header, aside'):
element.extract()
# Try to find main content
main_content = ""
# Common content containers
content_selectors = [
'main', '#main', '.main',
'article', '.article',
'#content', '.content',
'.post', '#post',
'.entry-content', '.post-content',
'.page-content', '.article-content'
]
# Try each selector
for selector in content_selectors:
elements = soup.select(selector)
if elements:
main_content = elements[0].get_text(separator='\n', strip=True)
logger.info(f"Found content using selector: {selector}")
break
# If no main content found, use body text
            if not main_content and soup.body:
main_content = soup.body.get_text(separator='\n', strip=True)
logger.info("Using body text as no main content container found")
# Clean up the text
lines = []
for line in main_content.split('\n'):
line = line.strip()
if line and len(line) > 15: # Skip very short lines
lines.append(line)
main_content = '\n'.join(lines)
# Truncate if too long
if len(main_content) > self.max_content_length:
                main_content = main_content[:self.max_content_length] + "...\n[Content truncated]"
# Add metadata
result = f"# {title}\nURL: {url}\n\n{main_content}"
return result.strip()
except Exception as e:
logger.error(f"Fehler beim Scrapen von {url}: {str(e)}")
return f"[Fehler beim Scrapen von {url}: {str(e)}]"
def extract_urls_from_search_results(self, html_content: str) -> List[str]:
"""
Extrahiert URLs aus den Suchergebnissen.
Args:
html_content: HTML der Suchergebnisseite
Returns:
Liste der gefundenen URLs
"""
soup = BeautifulSoup(html_content, 'html.parser')
urls = []
# Different search engines have different HTML structures
# Google links
for a_tag in soup.select('a[href^="/url?"]'):
href = a_tag.get('href', '')
if '/url?q=' in href:
url = href.split('/url?q=')[1].split('&')[0]
url = urllib.parse.unquote(url)
if url.startswith('http') and url not in urls:
urls.append(url)
# Bing links
for a_tag in soup.select('a[href^="http"]'):
url = a_tag.get('href', '')
if (url.startswith('http') and
not any(domain in url for domain in self.excluded_domains) and
url not in urls):
urls.append(url)
# If no URLs found, try a more generic approach
if not urls:
for a_tag in soup.find_all('a', href=True):
url = a_tag['href']
if (url.startswith('http') and
not any(domain in url for domain in self.excluded_domains) and
url not in urls):
urls.append(url)
return urls[:self.max_search_results] # Limit to max_search_results
def extract_urls(self, text: str) -> List[str]:
"""
Extrahiert URLs aus einem Text.
Args:
text: Der zu analysierende Text
Returns:
Liste der gefundenen URLs
"""
        # Match http(s) URLs: hosts of word characters, dots, hyphens, or percent-encodings, plus an optional path
url_pattern = re.compile(r'https?://(?:[-\w.]|(?:%[\da-fA-F]{2}))+(?:/[^)\s]*)?')
found_urls = url_pattern.findall(text)
# Basic URL cleanup and validation
valid_urls = []
for url in found_urls:
# Remove trailing punctuation
url = re.sub(r'[.,;:!?]$', '', url)
# Skip excluded domains
if not any(domain in url for domain in self.excluded_domains):
valid_urls.append(url)
return valid_urls[:self.max_urls] # Limit to max_urls
def extract_keywords(self, text: str) -> str:
"""
Extrahiert Schlüsselwörter aus einem Text.
Args:
text: Der zu analysierende Text
Returns:
Extrahierte Schlüsselwörter als String
"""
# Define German stopwords
stopwords = [
"der", "die", "das", "den", "dem", "des",
"ein", "eine", "einer", "eines", "einem", "einen",
"und", "oder", "aber", "wenn", "weil", "obwohl",
"für", "mit", "von", "zu", "aus", "bei", "nach",
"über", "unter", "vor", "hinter", "neben", "zwischen",
"nicht", "kein", "keine", "keiner", "keines", "keinem", "keinen",
"ist", "sind", "war", "waren", "wird", "werden", "wurde", "wurden",
"kann", "können", "darf", "dürfen", "soll", "sollen", "muss", "müssen",
"hat", "haben", "dass", "noch", "schon", "auch", "nur", "sehr", "mehr",
"durch", "gegen", "ohne", "um", "heute", "morgen", "gestern"
]
# Normalize text
text = text.lower()
# Remove special characters and replace them with spaces
text = re.sub(r'[^\w\s]', ' ', text)
# Split into words
words = text.split()
# Filter words
filtered_words = []
for word in words:
if (len(word) > 3 and # Skip very short words
word not in stopwords and
not word.isdigit()): # Skip numbers
filtered_words.append(word)
# Get common words by frequency
word_freq = {}
for word in filtered_words:
if word in word_freq:
word_freq[word] += 1
else:
word_freq[word] = 1
# Sort by frequency
sorted_words = sorted(word_freq.items(), key=lambda x: x[1], reverse=True)
# Take top 10 words
keywords = [word for word, freq in sorted_words[:10]]
return " ".join(keywords)
async def search_web(self, query: str) -> List[str]:
"""
Führt eine Websuche mit den gegebenen Suchbegriffen durch.
Args:
query: Suchbegriffe
Returns:
Liste der gefundenen URLs
"""
        # Choose a random search engine (guard against an empty configuration)
        if not self.search_engines:
            logger.warning("No search engines configured; skipping web search")
            return []
        engine_name = random.choice(list(self.search_engines.keys()))
search_url = self.search_engines[engine_name].format(query=urllib.parse.quote(query))
logger.info(f"Searching with {engine_name}: {query}")
try:
# Add a slight delay to avoid being blocked
time.sleep(random.uniform(self.min_delay, self.max_delay))
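            # Note: time.sleep and requests.get block the event loop even though
            # this method is async; asyncio.sleep and an async HTTP client would
            # be the non-blocking alternative.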
response = requests.get(
search_url,
headers=self.headers,
timeout=self.timeout
)
if response.status_code == 200:
# Extract URLs from search results
urls = self.extract_urls_from_search_results(response.text)
logger.info(f"Found {len(urls)} URLs from search results")
return urls
else:
logger.warning(f"Search request failed with status code: {response.status_code}")
return []
except Exception as e:
logger.error(f"Error during web search: {e}")
return []
async def scrape_web_data(self, prompt: str) -> str:
"""
Enhanced web scraping function that ensures consistent behavior.
Always performs scraping for prompts and returns structured results.
Args:
prompt: The user prompt
Returns:
Scraped web data as text
"""
try:
# Ensure prompt is a string
if isinstance(prompt, list):
prompt = " ".join(prompt) if all(isinstance(item, str) for item in prompt) else str(prompt)
elif not isinstance(prompt, str):
prompt = str(prompt)
# Log the scraping attempt
logger.info(f"Starting web scraping with prompt: {prompt[:400]}...")
# First check for explicit URLs in the prompt
explicit_urls = self.extract_urls(prompt)
# Always perform search, even if explicit URLs are found
# This ensures more comprehensive results
keywords = self.extract_keywords(prompt)
logger.info(f"Using keywords for search: {keywords}")
# Search for relevant URLs
search_urls = await self.search_web(keywords)
# Combine explicit URLs with search results, prioritizing explicit URLs
urls = []
# Add explicit URLs first
for url in explicit_urls:
if url not in urls:
urls.append(url)
# Then add search results, avoiding duplicates
for url in search_urls:
if url not in urls:
urls.append(url)
# If no URLs found after both methods, try a simplified search
if not urls:
simplified_query = " ".join(prompt.split()[:8]) # Use first 8 words
simplified_urls = await self.search_web(simplified_query)
for url in simplified_urls:
if url not in urls:
urls.append(url)
# Scrape content from URLs
results = []
scraped_count = 0
if urls:
logger.info(f"Found {len(urls)} URLs to scrape")
for url in urls[:self.max_urls]:
try:
# Add a delay between requests
time.sleep(random.uniform(self.min_delay, self.max_delay))
content = self.scrape_url(url)
if content and len(content) > 100: # Ensure meaningful content
results.append(content)
scraped_count += 1
logger.info(f"Successfully scraped: {url}")
else:
logger.warning(f"Insufficient content from: {url}")
except Exception as e:
logger.error(f"Error scraping {url}: {e}")
# Create the final result with improved structure
if results:
logger.info(f"Successfully scraped {scraped_count} pages")
# Format the results in a structured way for better agent understanding
structured_result = f"# Web Scraping Results\n\nScraped {scraped_count} web sources based on: \"{prompt}\"\n\n"
for i, result in enumerate(results):
structured_result += f"## Source {i+1}\n\n{result}\n\n---\n\n"
return structured_result.strip()
else:
# If no real content was scraped, provide simulated data with clear indication
logger.warning("No content scraped, using simulated data")
simulated_data = f"""
# Simulated Web Research Results for: {prompt}
## Notice
The web scraping system was unable to retrieve real data from the web.
The following information is provided as a placeholder to continue the workflow.
## Market Trends and Developments
- Latest analyses show significant growth in digital transformation
- Experts continue to forecast positive development for cloud-based solutions
- Current technologies improve efficiency by an average of 23%
## Leading Companies in the Sector
1. TechInnovators GmbH - Market share 28%
2. FutureWave AG - Market share 22%
3. ProgressTech Ltd. - Market share 17%
## Innovations and New Products
- Smart integration solutions for existing systems
- AI-powered automation processes
- Improved sustainability standards through new materials
*Note: This is simulated data provided because no actual web scraping was possible.*
""".strip()
return simulated_data
except Exception as e:
logger.error(f"Error during web scraping: {e}")
error_message = f"Web scraping could not be performed: {str(e)}"
return error_message.strip()
    async def close(self):
        """
        Closes any open resources.
        """
        # Currently no resources to close
        pass


# Additional module-level helper to ensure the scraper agent always triggers
# web scraping. It lives outside the class because it receives the scraper
# service instance as an explicit argument.
async def ensure_scraper_agent_scraping(agent_type: str, moderator_text: str, prompt: str, aiweb_scraper) -> Tuple[bool, str]:
    """
    Helper function to ensure the scraper agent always triggers web scraping.
    To be called from the _run_moderator_cycle method when a scraper agent is selected.

    Args:
        agent_type: Type of the selected agent
        moderator_text: Text from the moderator
        prompt: The original prompt
        aiweb_scraper: Web scraper service instance

    Returns:
        Tuple of (was_scraping_performed, scraped_data)
    """
    if agent_type != "scraper":
        return False, ""
    try:
        # Log that web scraping is being performed for the scraper agent
        logger.info(f"Ensuring web scraping for scraper agent with prompt: {prompt[:100]}...")
        # Extract a search query from the moderator text if possible
        search_query = prompt
        if moderator_text:
            # Try to extract a more specific query from moderator instructions
            query_patterns = [
                r"search for [\"'](.+?)[\"']",
                r"find information about [\"'](.+?)[\"']",
                r"research [\"'](.+?)[\"']",
                r"look up [\"'](.+?)[\"']"
            ]
            for pattern in query_patterns:
                match = re.search(pattern, moderator_text, re.IGNORECASE)
                if match:
                    extracted_query = match.group(1)
                    if len(extracted_query) > 10:  # Ensure it's a meaningful query
                        search_query = extracted_query
                        logger.info(f"Extracted search query from moderator: {search_query}")
                        break
        # Always perform the web scraping
        scraped_data = await aiweb_scraper.scrape_web_data(search_query)
        # Mark that scraping was performed
        return True, scraped_data
    except Exception as e:
        logger.error(f"Error ensuring web scraping for scraper agent: {e}")
        return True, f"Web scraping failed: {str(e)}"
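

# --- Usage sketch (illustrative addition, assuming a working configload setup
# with a [Connector_AiWebscraping] section; the prompt below is a placeholder) ---
if __name__ == "__main__":
    import asyncio

    async def _demo():
        service = WebScrapingService()
        # scrape_web_data merges explicit URLs from the prompt with search results
        result = await service.scrape_web_data(
            "Current trends in digital transformation https://example.com")
        print(result[:500])
        # The module-level helper is meant to be wired into a moderator cycle
        performed, data = await ensure_scraper_agent_scraping(
            "scraper", "", "Current trends in digital transformation", service)
        print(f"Scraping performed: {performed}, {len(data)} characters")
        await service.close()

    asyncio.run(_demo())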