import asyncio
import logging
from datetime import datetime, UTC
from typing import Dict, Any, Optional
from urllib.parse import urljoin, urlparse

import aiohttp
from bs4 import BeautifulSoup

from modules.methods.methodBase import MethodBase, AuthSource, MethodResult
from modules.shared.configuration import APP_CONFIG

logger = logging.getLogger(__name__)


class MethodWeb(MethodBase):
    """Web method implementation for web operations"""

    def __init__(self):
        super().__init__()
        self.name = "web"
        self.description = "Handle web operations like search, crawl, and content extraction"
        self.auth_source = AuthSource.LOCAL  # Web operations typically don't need auth

        # Web crawling configuration from agentWebcrawler
        self.srcApikey = APP_CONFIG.get("Agent_Webcrawler_SERPAPI_APIKEY", "")
        self.srcEngine = APP_CONFIG.get("Agent_Webcrawler_SERPAPI_ENGINE", "google")
        self.srcCountry = APP_CONFIG.get("Agent_Webcrawler_SERPAPI_COUNTRY", "auto")
        self.maxResults = int(APP_CONFIG.get("Agent_Webcrawler_SERPAPI_MAX_SEARCH_RESULTS", "5"))
        self.timeout = int(APP_CONFIG.get("Agent_Webcrawler_SERPAPI_TIMEOUT", "30"))
        self.userAgent = APP_CONFIG.get("Agent_Webcrawler_SERPAPI_USER_AGENT", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36")

        if not self.srcApikey:
            logger.error("SerpAPI key not configured")

    @property
    def actions(self) -> Dict[str, Dict[str, Any]]:
        """Available actions and their parameters"""
        return {
            "search": {
                "description": "Search web content",
                "retryMax": 3,
                "timeout": 30,
                "parameters": {
                    "query": {"type": "string", "required": True},
                    "maxResults": {"type": "number", "required": False},
                    "filters": {"type": "object", "required": False},
                    "searchEngine": {"type": "string", "required": False}
                }
            },
            "crawl": {
                "description": "Crawl web pages",
                "retryMax": 2,
                "timeout": 60,
                "parameters": {
                    "url": {"type": "string", "required": True},
                    "depth": {"type": "number", "required": False},
                    "followLinks": {"type": "boolean", "required": False},
                    "includeImages": {"type": "boolean", "required": False},
                    "respectRobots": {"type": "boolean", "required": False}
                }
            },
            "extract": {
                "description": "Extract content from web page",
                "retryMax": 2,
                "timeout": 30,
                "parameters": {
                    "url": {"type": "string", "required": True},
                    "selectors": {"type": "array", "items": "string", "required": False},
                    "format": {"type": "string", "required": False},
                    "includeMetadata": {"type": "boolean", "required": False}
                }
            }
        }

    async def execute(self, action: str, parameters: Dict[str, Any], auth_data: Optional[Dict[str, Any]] = None) -> MethodResult:
        """Execute web method"""
        try:
            # Validate parameters
            if not await self.validate_parameters(action, parameters):
                return self._create_result(
                    success=False,
                    data={"error": f"Invalid parameters for {action}"}
                )

            # Execute action
            if action == "search":
                return await self._search_web(parameters)
            elif action == "crawl":
                return await self._crawl_page(parameters)
            elif action == "extract":
                return await self._extract_content(parameters)
            else:
                return self._create_result(
                    success=False,
                    data={"error": f"Unknown action: {action}"}
                )

        except Exception as e:
            logger.error(f"Error executing web {action}: {e}")
            return self._create_result(
                success=False,
                data={"error": str(e)}
            )

    async def _search_web(self, parameters: Dict[str, Any]) -> MethodResult:
        """Search web content"""
        try:
            query = parameters["query"]
            max_results = parameters.get("maxResults", self.maxResults)
            filters = parameters.get("filters", {})
            search_engine = parameters.get("searchEngine", "google")

            # Implement search using different engines
            if search_engine.lower() == "google":
                # TODO: Implement Google Custom Search API integration
                results = await self._google_search(query, max_results, filters)
            elif search_engine.lower() == "bing":
                # TODO: Implement Bing Web Search API integration
                results = await self._bing_search(query, max_results, filters)
            else:
                return self._create_result(
                    success=False,
                    data={"error": f"Unsupported search engine: {search_engine}"}
                )

            return self._create_result(
                success=True,
                data={
                    "query": query,
                    "engine": search_engine,
                    "results": results
                }
            )
        except Exception as e:
            logger.error(f"Error searching web: {e}")
            return self._create_result(
                success=False,
                data={"error": f"Search failed: {str(e)}"}
            )

    async def _google_search(self, query: str, max_results: int, filters: Dict[str, Any]) -> list:
        """Search using Google Custom Search API"""
        # TODO: Implement Google Custom Search API
        # This is a placeholder implementation
        return [
            {
                "title": "Example Result",
                "url": "https://example.com",
                "snippet": "Example search result snippet",
                "source": "google"
            }
        ]

    async def _bing_search(self, query: str, max_results: int, filters: Dict[str, Any]) -> list:
        """Search using Bing Web Search API"""
        # TODO: Implement Bing Web Search API
        # This is a placeholder implementation
        return [
            {
                "title": "Example Result",
                "url": "https://example.com",
                "snippet": "Example search result snippet",
                "source": "bing"
            }
        ]
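
    # --- Hedged sketch, not part of the original implementation ---
    # The constructor loads SerpAPI settings (srcApikey, srcEngine, srcCountry,
    # maxResults, timeout) that the placeholder searches above never use. One way
    # to wire them in is a helper like the one below, which queries SerpAPI's
    # public JSON endpoint and maps its "organic_results" entries onto the same
    # result shape the placeholders return. The endpoint and parameter names
    # follow SerpAPI's documented API; treat the method as an illustrative
    # assumption rather than this project's confirmed integration.
    async def _serpapi_search(self, query: str, max_results: int) -> list:
        """Illustrative SerpAPI-backed search (sketch; not called by execute())."""
        params = {
            "api_key": self.srcApikey,
            "engine": self.srcEngine,
            "q": query,
            "num": max_results,
        }
        if self.srcCountry and self.srcCountry != "auto":
            params["gl"] = self.srcCountry  # country code for localized results
        timeout = aiohttp.ClientTimeout(total=self.timeout)
        async with aiohttp.ClientSession(timeout=timeout) as session:
            async with session.get("https://serpapi.com/search.json", params=params) as response:
                response.raise_for_status()
                payload = await response.json()
        # Keep the same keys the placeholder results use
        return [
            {
                "title": item.get("title"),
                "url": item.get("link"),
                "snippet": item.get("snippet", ""),
                "source": self.srcEngine,
            }
            for item in payload.get("organic_results", [])[:max_results]
        ]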

    async def _crawl_page(self, parameters: Dict[str, Any]) -> MethodResult:
        """Crawl web pages"""
        try:
            url = parameters["url"]
            depth = parameters.get("depth", 1)  # accepted but multi-level crawling is not implemented yet
            follow_links = parameters.get("followLinks", False)
            include_images = parameters.get("includeImages", False)
            respect_robots = parameters.get("respectRobots", True)

            # Check robots.txt if required
            if respect_robots:
                if not await self._check_robots_txt(url):
                    return self._create_result(
                        success=False,
                        data={"error": "Crawling not allowed by robots.txt"}
                    )

            # Crawl the page
            async with aiohttp.ClientSession() as session:
                async with session.get(url, headers={"User-Agent": self.userAgent}, timeout=aiohttp.ClientTimeout(total=self.timeout)) as response:
                    if response.status == 200:
                        html = await response.text()
                        soup = BeautifulSoup(html, 'html.parser')

                        # Extract basic information
                        result = {
                            "url": url,
                            "title": soup.title.string if soup.title else None,
                            "description": self._get_meta_description(soup),
                            "links": [],
                            "images": [] if include_images else None,
                            "text": soup.get_text(strip=True),
                            "crawled": datetime.now(UTC).isoformat()
                        }

                        # Extract links if followLinks is True
                        if follow_links:
                            base_url = url
                            for link in soup.find_all('a'):
                                href = link.get('href')
                                if href:
                                    absolute_url = urljoin(base_url, href)
                                    if self._is_valid_url(absolute_url):
                                        result["links"].append({
                                            "url": absolute_url,
                                            "text": link.get_text(strip=True)
                                        })

                        # Extract images if includeImages is True
                        if include_images:
                            for img in soup.find_all('img'):
                                src = img.get('src')
                                if src:
                                    absolute_src = urljoin(url, src)
                                    result["images"].append({
                                        "url": absolute_src,
                                        "alt": img.get('alt', ''),
                                        "title": img.get('title', '')
                                    })

                        return self._create_result(
                            success=True,
                            data=result
                        )
                    else:
                        return self._create_result(
                            success=False,
                            data={"error": f"Failed to fetch URL: {response.status}"}
                        )
        except Exception as e:
            logger.error(f"Error crawling page: {e}")
            return self._create_result(
                success=False,
                data={"error": f"Crawl failed: {str(e)}"}
            )

    async def _extract_content(self, parameters: Dict[str, Any]) -> MethodResult:
        """Extract content from web page"""
        try:
            url = parameters["url"]
            selectors = parameters.get("selectors")
            output_format = parameters.get("format", "text")  # accepted but only text extraction is implemented
            include_metadata = parameters.get("includeMetadata", False)

            async with aiohttp.ClientSession() as session:
                async with session.get(url, headers={"User-Agent": self.userAgent}, timeout=aiohttp.ClientTimeout(total=self.timeout)) as response:
                    if response.status == 200:
                        html = await response.text()
                        soup = BeautifulSoup(html, 'html.parser')

                        # Extract content based on selectors
                        content = {}
                        if selectors:
                            for selector in selectors:
                                elements = soup.select(selector)
                                content[selector] = [elem.get_text() for elem in elements]
                        else:
                            # Default extraction
                            content = {
                                "title": soup.title.string if soup.title else None,
                                "text": soup.get_text(strip=True),
                                "links": [a.get('href') for a in soup.find_all('a')]
                            }

                        # Add metadata if requested
                        if include_metadata:
                            # Fall back to the full page text/links so this also works
                            # when selector-based extraction was used above
                            text_value = content.get("text") or soup.get_text(strip=True)
                            content["metadata"] = {
                                "url": url,
                                "crawled": datetime.now(UTC).isoformat(),
                                "language": self._detect_language(soup),
                                "wordCount": len(text_value.split()),
                                "linksCount": len(content.get("links", []))
                            }

                        return self._create_result(
                            success=True,
                            data={
                                "url": url,
                                "content": content
                            }
                        )
                    else:
                        return self._create_result(
                            success=False,
                            data={"error": f"Failed to fetch URL: {response.status}"}
                        )
        except Exception as e:
            logger.error(f"Error extracting content: {e}")
            return self._create_result(
                success=False,
                data={"error": f"Extraction failed: {str(e)}"}
            )

    def _get_meta_description(self, soup: BeautifulSoup) -> Optional[str]:
        """Extract meta description from HTML"""
        meta_desc = soup.find('meta', attrs={'name': 'description'})
        if meta_desc:
            return meta_desc.get('content')
        return None

    def _is_valid_url(self, url: str) -> bool:
        """Check if URL is valid"""
        try:
            result = urlparse(url)
            return all([result.scheme, result.netloc])
        except (TypeError, ValueError):
            return False

    async def _check_robots_txt(self, url: str) -> bool:
        """Check if URL is allowed by robots.txt"""
        try:
            parsed_url = urlparse(url)
            robots_url = f"{parsed_url.scheme}://{parsed_url.netloc}/robots.txt"

            async with aiohttp.ClientSession() as session:
                async with session.get(robots_url, headers={"User-Agent": self.userAgent}, timeout=aiohttp.ClientTimeout(total=self.timeout)) as response:
                    if response.status == 200:
                        robots_content = await response.text()

                        # Parse robots.txt content (naive prefix matching)
                        user_agent = "*"  # Default to all user agents
                        disallow_paths = []

                        for line in robots_content.splitlines():
                            line = line.strip().lower()
                            if line.startswith("user-agent:"):
                                user_agent = line[11:].strip()
                            elif line.startswith("disallow:") and user_agent in ["*", self.userAgent.lower()]:
                                path = line[9:].strip()
                                if path:
                                    disallow_paths.append(path)

                        # Check if URL path is disallowed
                        url_path = parsed_url.path
                        for disallow_path in disallow_paths:
                            if url_path.startswith(disallow_path):
                                return False

                        return True
                    else:
                        # If robots.txt doesn't exist, assume crawling is allowed
                        return True

        except Exception as e:
            logger.warning(f"Error checking robots.txt for {url}: {str(e)}")
            # If there's an error, assume crawling is allowed
            return True
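
    # --- Hedged sketch (alternative, not used by the code above) ---
    # The hand-rolled parser above only honours the most recent "User-agent"
    # group and does simple prefix matching against the full User-Agent string.
    # The standard library's urllib.robotparser implements the complete matching
    # rules; a synchronous variant could look like this (illustrative only, and
    # the method name is made up for this sketch):
    def _check_robots_txt_stdlib(self, url: str) -> bool:
        """Illustrative robots.txt check using urllib.robotparser (sketch)."""
        from urllib.robotparser import RobotFileParser

        parsed_url = urlparse(url)
        parser = RobotFileParser(f"{parsed_url.scheme}://{parsed_url.netloc}/robots.txt")
        try:
            parser.read()  # blocking fetch; acceptable for a sketch
        except Exception:
            return True  # mirror the permissive fallback used above
        return parser.can_fetch(self.userAgent, url)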

    def _detect_language(self, soup: BeautifulSoup) -> str:
        """Detect page language"""
        try:
            # Try to get language from HTML lang attribute
            if soup.html and soup.html.get('lang'):
                return soup.html.get('lang')

            # Try to get language from meta tag
            meta_lang = soup.find('meta', attrs={'http-equiv': 'content-language'})
            if meta_lang:
                return meta_lang.get('content', 'en')

            # Try to get language from meta charset
            meta_charset = soup.find('meta', attrs={'charset': True})
            if meta_charset:
                charset = meta_charset.get('charset', '').lower()
                if 'utf-8' in charset:
                    return 'en'  # Default to English for UTF-8

            # Try to detect language from content
            # This is a simple heuristic based on common words
            text = soup.get_text().lower()
            common_words = {
                'en': ['the', 'and', 'of', 'to', 'in', 'is', 'that', 'for', 'it', 'with'],
                'es': ['el', 'la', 'los', 'las', 'de', 'y', 'en', 'que', 'por', 'con'],
                'fr': ['le', 'la', 'les', 'de', 'et', 'en', 'que', 'pour', 'avec', 'dans'],
                'de': ['der', 'die', 'das', 'und', 'in', 'den', 'von', 'zu', 'für', 'mit']
            }

            word_counts = {lang: sum(1 for word in words if f' {word} ' in f' {text} ')
                           for lang, words in common_words.items()}

            if word_counts:
                return max(word_counts.items(), key=lambda x: x[1])[0]

            return 'en'  # Default to English if no language detected

        except Exception as e:
            logger.warning(f"Error detecting language: {str(e)}")
            return 'en'  # Default to English on error
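

# --- Hedged usage sketch (illustration only, not part of the original module) ---
# A minimal driver showing how the action/parameter contract declared in
# `actions` maps onto `execute()`. The target URL is a placeholder; running this
# requires the surrounding project (MethodBase, APP_CONFIG) on the import path.
if __name__ == "__main__":
    async def _demo() -> None:
        method = MethodWeb()
        result = await method.execute(
            "extract",
            {"url": "https://example.com", "includeMetadata": True},
        )
        print(result)

    asyncio.run(_demo())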