gateway/modules/methods/methodWeb.py

from typing import Dict, Any, Optional
import logging
import aiohttp
import asyncio
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
import re
from datetime import datetime, UTC
import requests
import time
from modules.methods.methodBase import MethodBase, AuthSource, MethodResult
from modules.shared.configuration import APP_CONFIG

logger = logging.getLogger(__name__)


class MethodWeb(MethodBase):
"""Web method implementation for web operations"""
def __init__(self):
super().__init__()
self.name = "web"
self.description = "Handle web operations like search, crawl, and content extraction"
self.auth_source = AuthSource.LOCAL # Web operations typically don't need auth
# Web crawling configuration from agentWebcrawler
self.srcApikey = APP_CONFIG.get("Agent_Webcrawler_SERPAPI_APIKEY", "")
self.srcEngine = APP_CONFIG.get("Agent_Webcrawler_SERPAPI_ENGINE", "google")
self.srcCountry = APP_CONFIG.get("Agent_Webcrawler_SERPAPI_COUNTRY", "auto")
self.maxResults = int(APP_CONFIG.get("Agent_Webcrawler_SERPAPI_MAX_SEARCH_RESULTS", "5"))
self.timeout = int(APP_CONFIG.get("Agent_Webcrawler_SERPAPI_TIMEOUT", "30"))
self.userAgent = APP_CONFIG.get("Agent_Webcrawler_SERPAPI_USER_AGENT", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36")
if not self.srcApikey:
logger.error("SerpAPI key not configured")

    @property
    def actions(self) -> Dict[str, Dict[str, Any]]:
        """Available actions and their parameters"""
        return {
            "search": {
                "description": "Search web content",
                "retryMax": 3,
                "timeout": 30,
                "parameters": {
                    "query": {"type": "string", "required": True},
                    "maxResults": {"type": "number", "required": False},
                    "filters": {"type": "object", "required": False},
                    "searchEngine": {"type": "string", "required": False}
                }
            },
            "crawl": {
                "description": "Crawl web pages",
                "retryMax": 2,
                "timeout": 60,
                "parameters": {
                    "url": {"type": "string", "required": True},
                    "depth": {"type": "number", "required": False},
                    "followLinks": {"type": "boolean", "required": False},
                    "includeImages": {"type": "boolean", "required": False},
                    "respectRobots": {"type": "boolean", "required": False}
                }
            },
            "extract": {
                "description": "Extract content from web page",
                "retryMax": 2,
                "timeout": 30,
                "parameters": {
                    "url": {"type": "string", "required": True},
                    "selectors": {"type": "array", "items": "string", "required": False},
                    "format": {"type": "string", "required": False},
                    "includeMetadata": {"type": "boolean", "required": False}
                }
            }
        }
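
    # Example parameter payloads for the actions above (illustrative only;
    # the query, URL and selectors are made-up values):
    #   search:  {"query": "python aiohttp", "maxResults": 5}
    #   crawl:   {"url": "https://example.com", "followLinks": True, "respectRobots": True}
    #   extract: {"url": "https://example.com", "selectors": ["h1", "p"], "includeMetadata": True}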

    async def execute(self, action: str, parameters: Dict[str, Any], auth_data: Optional[Dict[str, Any]] = None) -> MethodResult:
        """Execute web method"""
        try:
            # Validate parameters
            if not await self.validate_parameters(action, parameters):
                return self._create_result(
                    success=False,
                    data={"error": f"Invalid parameters for {action}"}
                )
            # Execute action
            if action == "search":
                return await self._search_web(parameters)
            elif action == "crawl":
                return await self._crawl_page(parameters)
            elif action == "extract":
                return await self._extract_content(parameters)
            else:
                return self._create_result(
                    success=False,
                    data={"error": f"Unknown action: {action}"}
                )
        except Exception as e:
            logger.error(f"Error executing web {action}: {e}")
            return self._create_result(
                success=False,
                data={"error": str(e)}
            )

    async def _search_web(self, parameters: Dict[str, Any]) -> MethodResult:
        """Search web content"""
        try:
            query = parameters["query"]
            max_results = parameters.get("maxResults", 10)
            filters = parameters.get("filters", {})
            search_engine = parameters.get("searchEngine", "google")
            # Implement search using different engines
            if search_engine.lower() == "google":
                # Use Google Custom Search API
                # TODO: Implement Google Custom Search API integration
                results = await self._google_search(query, max_results, filters)
            elif search_engine.lower() == "bing":
                # Use Bing Web Search API
                # TODO: Implement Bing Web Search API integration
                results = await self._bing_search(query, max_results, filters)
            else:
                return self._create_result(
                    success=False,
                    data={"error": f"Unsupported search engine: {search_engine}"}
                )
            return self._create_result(
                success=True,
                data={
                    "query": query,
                    "engine": search_engine,
                    "results": results
                }
            )
        except Exception as e:
            logger.error(f"Error searching web: {e}")
            return self._create_result(
                success=False,
                data={"error": f"Search failed: {str(e)}"}
            )

    async def _google_search(self, query: str, max_results: int, filters: Dict[str, Any]) -> list:
        """Search using Google Custom Search API"""
        # TODO: Implement Google Custom Search API
        # This is a placeholder implementation; see the hedged SerpAPI sketch
        # after _bing_search for one possible backend.
        return [
            {
                "title": "Example Result",
                "url": "https://example.com",
                "snippet": "Example search result snippet",
                "source": "google"
            }
        ]

    async def _bing_search(self, query: str, max_results: int, filters: Dict[str, Any]) -> list:
        """Search using Bing Web Search API"""
        # TODO: Implement Bing Web Search API
        # This is a placeholder implementation; see the hedged SerpAPI sketch below.
        return [
            {
                "title": "Example Result",
                "url": "https://example.com",
                "snippet": "Example search result snippet",
                "source": "bing"
            }
        ]
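
    # NOTE: hedged sketch, not part of the original file. Since __init__ already
    # reads a SerpAPI key/engine from APP_CONFIG, both placeholder methods above
    # could be backed by a helper along these lines. The endpoint, the "engine",
    # "q", "api_key" and "num" parameters, and the "organic_results" response
    # field follow the public SerpAPI documentation; verify them against the
    # account in use before relying on this.
    async def _serpapi_search(self, query: str, max_results: int) -> list:
        """Hypothetical SerpAPI-backed search (sketch)"""
        params = {
            "engine": self.srcEngine,
            "q": query,
            "api_key": self.srcApikey,
            "num": max_results,
        }
        async with aiohttp.ClientSession() as session:
            async with session.get(
                "https://serpapi.com/search.json",
                params=params,
                timeout=aiohttp.ClientTimeout(total=self.timeout),
            ) as response:
                response.raise_for_status()
                payload = await response.json()
        # Normalize to the same result shape the placeholders return
        return [
            {
                "title": item.get("title"),
                "url": item.get("link"),
                "snippet": item.get("snippet"),
                "source": self.srcEngine,
            }
            for item in payload.get("organic_results", [])[:max_results]
        ]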

    async def _crawl_page(self, parameters: Dict[str, Any]) -> MethodResult:
        """Crawl web pages"""
        try:
            url = parameters["url"]
            depth = parameters.get("depth", 1)
            follow_links = parameters.get("followLinks", False)
            include_images = parameters.get("includeImages", False)
            respect_robots = parameters.get("respectRobots", True)
            # NOTE: depth is accepted but only the given page is fetched here;
            # followLinks collects outgoing links without visiting them.
            # Check robots.txt if required
            if respect_robots:
                if not await self._check_robots_txt(url):
                    return self._create_result(
                        success=False,
                        data={"error": "Crawling not allowed by robots.txt"}
                    )
            # Crawl the page
            async with aiohttp.ClientSession() as session:
                async with session.get(url) as response:
                    if response.status == 200:
                        html = await response.text()
                        soup = BeautifulSoup(html, 'html.parser')
                        # Extract basic information
                        result = {
                            "url": url,
                            "title": soup.title.string if soup.title else None,
                            "description": self._get_meta_description(soup),
                            "links": [],
                            "images": [] if include_images else None,
                            "text": soup.get_text(strip=True),
                            "crawled": datetime.now(UTC).isoformat()
                        }
                        # Extract links if followLinks is True
                        if follow_links:
                            base_url = url
                            for link in soup.find_all('a'):
                                href = link.get('href')
                                if href:
                                    absolute_url = urljoin(base_url, href)
                                    if self._is_valid_url(absolute_url):
                                        result["links"].append({
                                            "url": absolute_url,
                                            "text": link.get_text(strip=True)
                                        })
                        # Extract images if includeImages is True
                        if include_images:
                            for img in soup.find_all('img'):
                                src = img.get('src')
                                if src:
                                    absolute_src = urljoin(url, src)
                                    result["images"].append({
                                        "url": absolute_src,
                                        "alt": img.get('alt', ''),
                                        "title": img.get('title', '')
                                    })
                        return self._create_result(
                            success=True,
                            data=result
                        )
                    else:
                        return self._create_result(
                            success=False,
                            data={"error": f"Failed to fetch URL: {response.status}"}
                        )
        except Exception as e:
            logger.error(f"Error crawling page: {e}")
            return self._create_result(
                success=False,
                data={"error": f"Crawl failed: {str(e)}"}
            )

    async def _extract_content(self, parameters: Dict[str, Any]) -> MethodResult:
        """Extract content from web page"""
        try:
            url = parameters["url"]
            selectors = parameters.get("selectors")
            output_format = parameters.get("format", "text")  # accepted but not applied yet
            include_metadata = parameters.get("includeMetadata", False)
            async with aiohttp.ClientSession() as session:
                async with session.get(url) as response:
                    if response.status == 200:
                        html = await response.text()
                        soup = BeautifulSoup(html, 'html.parser')
                        # Extract content based on selectors
                        content = {}
                        if selectors:
                            for selector in selectors:
                                elements = soup.select(selector)
                                content[selector] = [elem.get_text() for elem in elements]
                        else:
                            # Default extraction
                            content = {
                                "title": soup.title.string if soup.title else None,
                                "text": soup.get_text(strip=True),
                                "links": [a.get('href') for a in soup.find_all('a')]
                            }
                        # Add metadata if requested (guard against selector-only
                        # extractions that have no "text"/"links" keys)
                        if include_metadata:
                            content["metadata"] = {
                                "url": url,
                                "crawled": datetime.now(UTC).isoformat(),
                                "language": self._detect_language(soup),
                                "wordCount": len(content.get("text", "").split()),
                                "linksCount": len(content.get("links", []))
                            }
                        return self._create_result(
                            success=True,
                            data={
                                "url": url,
                                "content": content
                            }
                        )
                    else:
                        return self._create_result(
                            success=False,
                            data={"error": f"Failed to fetch URL: {response.status}"}
                        )
        except Exception as e:
            logger.error(f"Error extracting content: {e}")
            return self._create_result(
                success=False,
                data={"error": f"Extraction failed: {str(e)}"}
            )

    def _get_meta_description(self, soup: BeautifulSoup) -> Optional[str]:
        """Extract meta description from HTML"""
        meta_desc = soup.find('meta', attrs={'name': 'description'})
        if meta_desc:
            return meta_desc.get('content')
        return None

    def _is_valid_url(self, url: str) -> bool:
        """Check if URL is valid"""
        try:
            result = urlparse(url)
            return all([result.scheme, result.netloc])
        except Exception:
            return False

    async def _check_robots_txt(self, url: str) -> bool:
        """Check if URL is allowed by robots.txt"""
        try:
            parsed_url = urlparse(url)
            robots_url = f"{parsed_url.scheme}://{parsed_url.netloc}/robots.txt"
            async with aiohttp.ClientSession() as session:
                async with session.get(
                    robots_url,
                    headers={"User-Agent": self.userAgent},
                    timeout=aiohttp.ClientTimeout(total=self.timeout)
                ) as response:
                    if response.status != 200:
                        # If robots.txt doesn't exist, assume crawling is allowed
                        return True
                    robots_content = await response.text()
            # Parse robots.txt content (simplified: Disallow rules only,
            # paths kept case-sensitive)
            user_agent = "*"  # Default to all user agents
            disallow_paths = []
            for line in robots_content.splitlines():
                stripped = line.strip()
                lowered = stripped.lower()
                if lowered.startswith("user-agent:"):
                    user_agent = stripped.split(":", 1)[1].strip().lower()
                elif lowered.startswith("disallow:") and user_agent in ("*", self.userAgent.lower()):
                    path = stripped.split(":", 1)[1].strip()
                    if path:
                        disallow_paths.append(path)
            # Check if URL path is disallowed
            url_path = parsed_url.path
            for disallow_path in disallow_paths:
                if url_path.startswith(disallow_path):
                    return False
            return True
        except Exception as e:
            logger.warning(f"Error checking robots.txt for {url}: {str(e)}")
            # If there's an error, assume crawling is allowed
            return True
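
    # NOTE: hedged alternative, not part of the original file. The stdlib
    # urllib.robotparser can evaluate the same fetched robots.txt text and also
    # understands Allow lines and rule groups; the agent token is assumed to be
    # a short name such as "*" rather than the full browser user-agent string
    # configured above.
    def _robots_allows(self, robots_content: str, url: str, agent: str = "*") -> bool:
        """Evaluate robots.txt content with urllib.robotparser (sketch)"""
        from urllib.robotparser import RobotFileParser
        parser = RobotFileParser()
        parser.parse(robots_content.splitlines())
        return parser.can_fetch(agent, url)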

    def _detect_language(self, soup: BeautifulSoup) -> str:
        """Detect page language"""
        try:
            # Try to get language from HTML lang attribute
            if soup.html and soup.html.get('lang'):
                return soup.html.get('lang')
            # Try to get language from meta tag
            meta_lang = soup.find('meta', attrs={'http-equiv': 'content-language'})
            if meta_lang:
                return meta_lang.get('content', 'en')
            # Try to get language from meta charset
            meta_charset = soup.find('meta', attrs={'charset': True})
            if meta_charset:
                charset = meta_charset.get('charset', '').lower()
                if 'utf-8' in charset:
                    return 'en'  # Default to English for UTF-8
            # Try to detect language from content
            # This is a simple heuristic based on common words
            text = soup.get_text().lower()
            common_words = {
                'en': ['the', 'and', 'of', 'to', 'in', 'is', 'that', 'for', 'it', 'with'],
                'es': ['el', 'la', 'los', 'las', 'de', 'y', 'en', 'que', 'por', 'con'],
                'fr': ['le', 'la', 'les', 'de', 'et', 'en', 'que', 'pour', 'avec', 'dans'],
                'de': ['der', 'die', 'das', 'und', 'in', 'den', 'von', 'zu', 'für', 'mit']
            }
            word_counts = {lang: sum(1 for word in words if f' {word} ' in f' {text} ')
                           for lang, words in common_words.items()}
            if word_counts:
                return max(word_counts.items(), key=lambda x: x[1])[0]
            return 'en'  # Default to English if no language detected
        except Exception as e:
            logger.warning(f"Error detecting language: {str(e)}")
            return 'en'  # Default to English on error
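
# NOTE: hedged usage sketch, not part of the original file. Shows how the
# method could be exercised standalone, assuming MethodBase/MethodResult work
# as they are used above; the target URL is only an example.
if __name__ == "__main__":
    async def _demo():
        method = MethodWeb()
        result = await method.execute(
            "extract",
            {"url": "https://example.com", "includeMetadata": True},
        )
        print(result)

    asyncio.run(_demo())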