import logging
import re
import requests
from typing import List, Dict, Any, Optional, Tuple
from bs4 import BeautifulSoup
import json
import os
import configload as configload
import urllib.parse
import time
import random
import pandas as pd

# Configure logger
logger = logging.getLogger(__name__)

# Load configuration data
def load_config_data():
    config = configload.load_config()

    # Get search engines as a comma-separated list
    search_engines_str = config.get('Connector_AiWebscraping', 'SEARCH_ENGINES')
    search_engines = [engine.strip() for engine in search_engines_str.split(',')]

    # Get excluded domains as a comma-separated list
    excluded_domains_str = config.get('Connector_AiWebscraping', 'EXCLUDED_DOMAINS')
    excluded_domains = [domain.strip() for domain in excluded_domains_str.split(',')]

    return {
        "timeout": int(config.get('Connector_AiWebscraping', 'TIMEOUT')),
        "max_urls": int(config.get('Connector_AiWebscraping', 'MAX_URLS')),
        "max_content_length": int(config.get('Connector_AiWebscraping', 'MAX_CONTENT_LENGTH')),
        "user_agent": config.get('Connector_AiWebscraping', 'USER_AGENT'),
        "search_engines": search_engines,
        "min_delay": float(config.get('Connector_AiWebscraping', 'MIN_DELAY')),
        "max_delay": float(config.get('Connector_AiWebscraping', 'MAX_DELAY')),
        "excluded_domains": excluded_domains,
        "max_search_results": int(config.get('Connector_AiWebscraping', 'MAX_SEARCH_RESULTS'))
    }
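
# The keys read above imply a configuration section roughly like the one
# sketched below. This is only an illustrative example; the actual file name,
# format, and values come from configload.load_config(), and the values shown
# here are assumptions.
#
#   [Connector_AiWebscraping]
#   TIMEOUT = 15
#   MAX_URLS = 5
#   MAX_CONTENT_LENGTH = 8000
#   USER_AGENT = Mozilla/5.0 (compatible; ExampleScraper/1.0)
#   SEARCH_ENGINES = google, bing
#   MIN_DELAY = 1.0
#   MAX_DELAY = 3.0
#   EXCLUDED_DOMAINS = facebook.com, twitter.com, youtube.com
#   MAX_SEARCH_RESULTS = 10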


class WebScrapingService:
    """
    Connector for web scraping functionality.
    """

    def __init__(self):
        # Load configuration
        self.config = load_config_data()

        # Assign configuration values to instance attributes
        self.timeout = self.config["timeout"]
        self.max_urls = self.config["max_urls"]
        self.max_content_length = self.config["max_content_length"]
        self.user_agent = self.config["user_agent"]
        self.min_delay = self.config["min_delay"]
        self.max_delay = self.config["max_delay"]
        self.excluded_domains = self.config["excluded_domains"]
        self.max_search_results = self.config["max_search_results"]

        # Initialize search engines based on config
        self.search_engines = {}
        if "google" in self.config["search_engines"]:
            self.search_engines["google"] = "https://www.google.com/search?q={query}"
        if "bing" in self.config["search_engines"]:
            self.search_engines["bing"] = "https://www.bing.com/search?q={query}"

        # Headers for requests
        self.headers = {
            'User-Agent': self.user_agent,
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
            'Referer': 'https://www.google.com/',
            'DNT': '1',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
        }

        logger.info(f"WebScraping Connector initialisiert mit Timeout: {self.timeout}s")

    def scrape_url(self, url: str) -> str:
        """
        Scrapes the content of a URL and extracts the relevant text.

        Args:
            url: The URL to scrape

        Returns:
            The extracted content, or an error message string if scraping fails
        """
        try:
            logger.info(f"Requesting URL: {url}")
            response = requests.get(url, headers=self.headers, timeout=self.timeout)
            response.raise_for_status()

            soup = BeautifulSoup(response.text, 'html.parser')

            # Get page title
            title = soup.title.string if soup.title else "No title"

            # Remove unwanted elements
            for element in soup.select('script, style, meta, noscript, iframe, nav, footer, header, aside'):
                element.extract()

            # Try to find main content
            main_content = ""

            # Common content containers
            content_selectors = [
                'main', '#main', '.main',
                'article', '.article',
                '#content', '.content',
                '.post', '#post',
                '.entry-content', '.post-content',
                '.page-content', '.article-content'
            ]

            # Try each selector
            for selector in content_selectors:
                elements = soup.select(selector)
                if elements:
                    main_content = elements[0].get_text(separator='\n', strip=True)
                    logger.info(f"Found content using selector: {selector}")
                    break

            # If no main content found, use body text
            if not main_content:
                main_content = soup.body.get_text(separator='\n', strip=True)
                logger.info("Using body text as no main content container found")

            # Clean up the text
            lines = []
            for line in main_content.split('\n'):
                line = line.strip()
                if line and len(line) > 15:  # Skip very short lines
                    lines.append(line)

            main_content = '\n'.join(lines)

            # Truncate if too long
            if len(main_content) > self.max_content_length:
                main_content = main_content[:self.max_content_length] + "...\n[Inhalt gekürzt]"

            # Add metadata
            result = f"# {title}\nURL: {url}\n\n{main_content}"

            return result.strip()

        except Exception as e:
            logger.error(f"Fehler beim Scrapen von {url}: {str(e)}")
            return f"[Fehler beim Scrapen von {url}: {str(e)}]"

    def extract_urls_from_search_results(self, html_content: str) -> List[str]:
        """
        Extracts URLs from the search results.

        Args:
            html_content: HTML of the search results page

        Returns:
            List of URLs found
        """
        soup = BeautifulSoup(html_content, 'html.parser')
        urls = []

        # Different search engines have different HTML structures
        # Google links
        for a_tag in soup.select('a[href^="/url?"]'):
            href = a_tag.get('href', '')
            if '/url?q=' in href:
                url = href.split('/url?q=')[1].split('&')[0]
                url = urllib.parse.unquote(url)
                if url.startswith('http') and url not in urls:
                    urls.append(url)

        # Bing links
        for a_tag in soup.select('a[href^="http"]'):
            url = a_tag.get('href', '')
            if (url.startswith('http') and
                    not any(domain in url for domain in self.excluded_domains) and
                    url not in urls):
                urls.append(url)

        # If no URLs found, try a more generic approach
        if not urls:
            for a_tag in soup.find_all('a', href=True):
                url = a_tag['href']
                if (url.startswith('http') and
                        not any(domain in url for domain in self.excluded_domains) and
                        url not in urls):
                    urls.append(url)

        return urls[:self.max_search_results]  # Limit to max_search_results
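
    # Worked example of the Google branch above (the anchor is hypothetical):
    #   <a href="/url?q=https%3A%2F%2Fexample.org%2Fnews&sa=U">...</a>
    # is split on '/url?q=' and '&', then percent-decoded, yielding
    # 'https://example.org/news', which is appended if not already present.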

    def extract_urls(self, text: str) -> List[str]:
        """
        Extracts URLs from a text.

        Args:
            text: The text to analyze

        Returns:
            List of URLs found
        """
        # URL pattern with improved regex
        url_pattern = re.compile(r'https?://(?:[-\w.]|(?:%[\da-fA-F]{2}))+(?:/[^)\s]*)?')
        found_urls = url_pattern.findall(text)

        # Basic URL cleanup and validation
        valid_urls = []
        for url in found_urls:
            # Remove trailing punctuation
            url = re.sub(r'[.,;:!?]$', '', url)

            # Skip excluded domains
            if not any(domain in url for domain in self.excluded_domains):
                valid_urls.append(url)

        return valid_urls[:self.max_urls]  # Limit to max_urls
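
    # Example of the pattern and cleanup above: for the input text
    #   "See https://example.org/report."
    # the regex matches 'https://example.org/report.' and the trailing-punctuation
    # substitution strips the final dot, so the method returns
    #   ['https://example.org/report']
    # (assuming example.org is not in the excluded domains).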

    def extract_keywords(self, text: str) -> str:
        """
        Extracts keywords from a text.

        Args:
            text: The text to analyze

        Returns:
            Extracted keywords as a string
        """
        # Define German stopwords
        stopwords = [
            "der", "die", "das", "den", "dem", "des",
            "ein", "eine", "einer", "eines", "einem", "einen",
            "und", "oder", "aber", "wenn", "weil", "obwohl",
            "für", "mit", "von", "zu", "aus", "bei", "nach",
            "über", "unter", "vor", "hinter", "neben", "zwischen",
            "nicht", "kein", "keine", "keiner", "keines", "keinem", "keinen",
            "ist", "sind", "war", "waren", "wird", "werden", "wurde", "wurden",
            "kann", "können", "darf", "dürfen", "soll", "sollen", "muss", "müssen",
            "hat", "haben", "dass", "noch", "schon", "auch", "nur", "sehr", "mehr",
            "durch", "gegen", "ohne", "um", "heute", "morgen", "gestern"
        ]

        # Normalize text
        text = text.lower()

        # Replace special characters with spaces
        text = re.sub(r'[^\w\s]', ' ', text)

        # Split into words
        words = text.split()

        # Filter words
        filtered_words = []
        for word in words:
            if (len(word) > 3 and  # Skip very short words
                    word not in stopwords and
                    not word.isdigit()):  # Skip numbers
                filtered_words.append(word)

        # Count word frequencies
        word_freq = {}
        for word in filtered_words:
            if word in word_freq:
                word_freq[word] += 1
            else:
                word_freq[word] = 1

        # Sort by frequency
        sorted_words = sorted(word_freq.items(), key=lambda x: x[1], reverse=True)

        # Take the top 10 words
        keywords = [word for word, freq in sorted_words[:10]]

        return " ".join(keywords)

    async def search_web(self, query: str) -> List[str]:
        """
        Performs a web search with the given search terms.

        Args:
            query: Search terms

        Returns:
            List of URLs found
        """
        # Choose a random search engine
        engine_name = random.choice(list(self.search_engines.keys()))
        search_url = self.search_engines[engine_name].format(query=urllib.parse.quote(query))

        logger.info(f"Searching with {engine_name}: {query}")

        try:
            # Add a slight delay to avoid being blocked
            time.sleep(random.uniform(self.min_delay, self.max_delay))

            response = requests.get(
                search_url,
                headers=self.headers,
                timeout=self.timeout
            )

            if response.status_code == 200:
                # Extract URLs from search results
                urls = self.extract_urls_from_search_results(response.text)
                logger.info(f"Found {len(urls)} URLs from search results")
                return urls
            else:
                logger.warning(f"Search request failed with status code: {response.status_code}")
                return []
        except Exception as e:
            logger.error(f"Error during web search: {e}")
            return []
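
    # Usage sketch (hypothetical): given a WebScrapingService instance 'service',
    #   urls = await service.search_web("ai-powered automation trends")
    # returns up to max_search_results result URLs, or [] on failure.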

    # Enhanced scrape_web_data method, focused on ensuring consistent behavior.
    async def scrape_web_data(self, prompt: str) -> str:
        """
        Enhanced web scraping function that ensures consistent behavior.
        Always performs scraping for prompts and returns structured results.

        Args:
            prompt: The user prompt

        Returns:
            Scraped web data as text
        """
        try:
            # Ensure prompt is a string
            if isinstance(prompt, list):
                prompt = " ".join(prompt) if all(isinstance(item, str) for item in prompt) else str(prompt)
            elif not isinstance(prompt, str):
                prompt = str(prompt)

            # Log the scraping attempt
            logger.info(f"Starting web scraping with prompt: {prompt[:400]}...")

            # First check for explicit URLs in the prompt
            explicit_urls = self.extract_urls(prompt)

            # Always perform a search, even if explicit URLs are found.
            # This ensures more comprehensive results.
            keywords = self.extract_keywords(prompt)
            logger.info(f"Using keywords for search: {keywords}")

            # Search for relevant URLs
            search_urls = await self.search_web(keywords)

            # Combine explicit URLs with search results, prioritizing explicit URLs
            urls = []

            # Add explicit URLs first
            for url in explicit_urls:
                if url not in urls:
                    urls.append(url)

            # Then add search results, avoiding duplicates
            for url in search_urls:
                if url not in urls:
                    urls.append(url)

            # If no URLs were found by either method, try a simplified search
            if not urls:
                simplified_query = " ".join(prompt.split()[:8])  # Use first 8 words
                simplified_urls = await self.search_web(simplified_query)
                for url in simplified_urls:
                    if url not in urls:
                        urls.append(url)

            # Scrape content from URLs
            results = []
            scraped_count = 0

            if urls:
                logger.info(f"Found {len(urls)} URLs to scrape")

                for url in urls[:self.max_urls]:
                    try:
                        # Add a delay between requests
                        time.sleep(random.uniform(self.min_delay, self.max_delay))

                        content = self.scrape_url(url)
                        if content and len(content) > 100:  # Ensure meaningful content
                            results.append(content)
                            scraped_count += 1
                            logger.info(f"Successfully scraped: {url}")
                        else:
                            logger.warning(f"Insufficient content from: {url}")
                    except Exception as e:
                        logger.error(f"Error scraping {url}: {e}")

            # Create the final result with improved structure
            if results:
                logger.info(f"Successfully scraped {scraped_count} pages")

                # Format the results in a structured way for better agent understanding
                structured_result = f"# Web Scraping Results\n\nScraped {scraped_count} web sources based on: \"{prompt}\"\n\n"

                for i, result in enumerate(results):
                    structured_result += f"## Source {i+1}\n\n{result}\n\n---\n\n"

                return structured_result.strip()
            else:
                # If no real content was scraped, provide simulated data with a clear indication
                logger.warning("No content scraped, using simulated data")

                simulated_data = f"""
# Simulated Web Research Results for: {prompt}

## Notice
The web scraping system was unable to retrieve real data from the web.
The following information is provided as a placeholder to continue the workflow.

## Market Trends and Developments
- Latest analyses show significant growth in digital transformation
- Experts continue to forecast positive development for cloud-based solutions
- Current technologies improve efficiency by an average of 23%

## Leading Companies in the Sector
1. TechInnovators GmbH - Market share 28%
2. FutureWave AG - Market share 22%
3. ProgressTech Ltd. - Market share 17%

## Innovations and New Products
- Smart integration solutions for existing systems
- AI-powered automation processes
- Improved sustainability standards through new materials

*Note: This is simulated data provided because no actual web scraping was possible.*
""".strip()

                return simulated_data
        except Exception as e:
            logger.error(f"Error during web scraping: {e}")
            error_message = f"Web scraping could not be performed: {str(e)}"
            return error_message.strip()

    # Additional helper to ensure the scraper agent always triggers web scraping.
    @staticmethod
    async def ensure_scraper_agent_scraping(agent_type: str, moderator_text: str, prompt: str, aiweb_scraper) -> Tuple[bool, str]:
        """
        Helper to ensure the scraper agent always triggers web scraping.
        To be called from the _run_moderator_cycle method when a scraper agent is selected.

        Args:
            agent_type: Type of the selected agent
            moderator_text: Text from the moderator
            prompt: The original prompt
            aiweb_scraper: Web scraper service instance

        Returns:
            Tuple of (was_scraping_performed, scraped_data)
        """
        if agent_type != "scraper":
            return False, ""

        try:
            # Log that web scraping is being performed for the scraper agent
            logger.info(f"Ensuring web scraping for scraper agent with prompt: {prompt[:100]}...")

            # Extract a search query from the moderator text if possible
            search_query = prompt
            if moderator_text:
                # Try to extract a more specific query from the moderator instructions
                query_patterns = [
                    r"search for [\"'](.+?)[\"']",
                    r"find information about [\"'](.+?)[\"']",
                    r"research [\"'](.+?)[\"']",
                    r"look up [\"'](.+?)[\"']"
                ]

                for pattern in query_patterns:
                    match = re.search(pattern, moderator_text, re.IGNORECASE)
                    if match:
                        extracted_query = match.group(1)
                        if len(extracted_query) > 10:  # Ensure it's a meaningful query
                            search_query = extracted_query
                            logger.info(f"Extracted search query from moderator: {search_query}")
                            break

            # Always perform the web scraping
            scraped_data = await aiweb_scraper.scrape_web_data(search_query)

            # Mark that scraping was performed
            return True, scraped_data

        except Exception as e:
            logger.error(f"Error ensuring web scraping for scraper agent: {e}")
            return True, f"Web scraping failed: {str(e)}"

    async def close(self):
        """
        Closes all open resources.
        """
        # Currently no resources to close
        pass
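

# Minimal usage sketch. It assumes configload.load_config() can resolve a valid
# Connector_AiWebscraping section and that outbound HTTP requests are allowed;
# the prompt and URL below are purely illustrative.
if __name__ == "__main__":
    import asyncio

    async def _demo() -> None:
        service = WebScrapingService()
        # scrape_web_data combines explicit URLs from the prompt with search results
        # and falls back to simulated data if nothing could be scraped.
        report = await service.scrape_web_data(
            "Current trends in AI-powered automation https://example.org/report"
        )
        print(report[:500])
        await service.close()

    asyncio.run(_demo())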