import asyncio
import logging
from datetime import datetime, UTC
from typing import Dict, Any, Optional
from urllib.parse import urljoin, urlparse

import aiohttp
from bs4 import BeautifulSoup

from modules.methods.methodBase import MethodBase, AuthSource, MethodResult
from modules.shared.configuration import APP_CONFIG

logger = logging.getLogger(__name__)


class MethodWeb(MethodBase):
    """Web method implementation for web operations"""

    def __init__(self):
        super().__init__()
        self.name = "web"
        self.description = "Handle web operations like search, crawl, and content extraction"
        self.auth_source = AuthSource.LOCAL  # Web operations typically don't need auth

        # Web crawling configuration from agentWebcrawler
        self.srcApikey = APP_CONFIG.get("Agent_Webcrawler_SERPAPI_APIKEY", "")
        self.srcEngine = APP_CONFIG.get("Agent_Webcrawler_SERPAPI_ENGINE", "google")
        self.srcCountry = APP_CONFIG.get("Agent_Webcrawler_SERPAPI_COUNTRY", "auto")
        self.maxResults = int(APP_CONFIG.get("Agent_Webcrawler_SERPAPI_MAX_SEARCH_RESULTS", "5"))
        self.timeout = int(APP_CONFIG.get("Agent_Webcrawler_SERPAPI_TIMEOUT", "30"))
        self.userAgent = APP_CONFIG.get("Agent_Webcrawler_SERPAPI_USER_AGENT", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36")

        if not self.srcApikey:
            logger.error("SerpAPI key not configured")

    @property
    def actions(self) -> Dict[str, Dict[str, Any]]:
        """Available actions and their parameters"""
        return {
            "search": {
                "description": "Search web content",
                "retryMax": 3,
                "timeout": 30,
                "parameters": {
                    "query": {"type": "string", "required": True},
                    "maxResults": {"type": "number", "required": False},
                    "filters": {"type": "object", "required": False},
                    "searchEngine": {"type": "string", "required": False}
                }
            },
            "crawl": {
                "description": "Crawl web pages",
                "retryMax": 2,
                "timeout": 60,
                "parameters": {
                    "url": {"type": "string", "required": True},
                    "depth": {"type": "number", "required": False},
                    "followLinks": {"type": "boolean", "required": False},
                    "includeImages": {"type": "boolean", "required": False},
                    "respectRobots": {"type": "boolean", "required": False}
                }
            },
            "extract": {
                "description": "Extract content from web page",
                "retryMax": 2,
                "timeout": 30,
                "parameters": {
                    "url": {"type": "string", "required": True},
                    "selectors": {"type": "array", "items": "string", "required": False},
                    "format": {"type": "string", "required": False},
                    "includeMetadata": {"type": "boolean", "required": False}
                }
            }
        }

    async def execute(self, action: str, parameters: Dict[str, Any], auth_data: Optional[Dict[str, Any]] = None) -> MethodResult:
        """Execute web method"""
        try:
            # Validate parameters
            if not await self.validate_parameters(action, parameters):
                return self._create_result(
                    success=False,
                    data={"error": f"Invalid parameters for {action}"}
                )

            # Execute action
            if action == "search":
                return await self._search_web(parameters)
            elif action == "crawl":
                return await self._crawl_page(parameters)
            elif action == "extract":
                return await self._extract_content(parameters)
            else:
                return self._create_result(
                    success=False,
                    data={"error": f"Unknown action: {action}"}
                )

        except Exception as e:
            logger.error(f"Error executing web {action}: {e}")
            return self._create_result(
                success=False,
                data={"error": str(e)}
            )

    async def _search_web(self, parameters: Dict[str, Any]) -> MethodResult:
        """Search web content"""
        try:
            query = parameters["query"]
            max_results = parameters.get("maxResults", self.maxResults)
            filters = parameters.get("filters", {})
            search_engine = parameters.get("searchEngine", "google")

            # Implement search using different engines
            if search_engine.lower() == "google":
                # TODO: Implement Google Custom Search API integration
                results = await self._google_search(query, max_results, filters)
            elif search_engine.lower() == "bing":
                # TODO: Implement Bing Web Search API integration
                results = await self._bing_search(query, max_results, filters)
            else:
                return self._create_result(
                    success=False,
                    data={"error": f"Unsupported search engine: {search_engine}"}
                )

            return self._create_result(
                success=True,
                data={
                    "query": query,
                    "engine": search_engine,
                    "results": results
                }
            )
        except Exception as e:
            logger.error(f"Error searching web: {e}")
            return self._create_result(
                success=False,
                data={"error": f"Search failed: {str(e)}"}
            )

    async def _google_search(self, query: str, max_results: int, filters: Dict[str, Any]) -> list:
        """Search using Google Custom Search API"""
        # TODO: Implement Google Custom Search API
        # This is a placeholder implementation
        return [
            {
                "title": "Example Result",
                "url": "https://example.com",
                "snippet": "Example search result snippet",
                "source": "google"
            }
        ]

    async def _bing_search(self, query: str, max_results: int, filters: Dict[str, Any]) -> list:
        """Search using Bing Web Search API"""
        # TODO: Implement Bing Web Search API
        # This is a placeholder implementation
        return [
            {
                "title": "Example Result",
                "url": "https://example.com",
                "snippet": "Example search result snippet",
                "source": "bing"
            }
        ]
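
    # --- Hedged sketch, not part of the original implementation ---
    # The constructor loads SerpAPI settings (srcApikey, srcEngine, srcCountry,
    # maxResults, timeout) that the placeholder searches above never use. One way
    # to wire them in is a helper like the one below, which queries SerpAPI's
    # public JSON endpoint and maps its "organic_results" entries onto the same
    # result shape the placeholders return. The endpoint and parameter names
    # follow SerpAPI's documented API; treat the method as an illustrative
    # assumption rather than this project's confirmed integration.
    async def _serpapi_search(self, query: str, max_results: int) -> list:
        """Illustrative SerpAPI-backed search (sketch; not called by execute())."""
        params = {
            "api_key": self.srcApikey,
            "engine": self.srcEngine,
            "q": query,
            "num": max_results,
        }
        if self.srcCountry and self.srcCountry != "auto":
            params["gl"] = self.srcCountry  # country code for localized results
        timeout = aiohttp.ClientTimeout(total=self.timeout)
        async with aiohttp.ClientSession(timeout=timeout) as session:
            async with session.get("https://serpapi.com/search.json", params=params) as response:
                response.raise_for_status()
                payload = await response.json()
        # Keep the same keys the placeholder results use
        return [
            {
                "title": item.get("title"),
                "url": item.get("link"),
                "snippet": item.get("snippet", ""),
                "source": self.srcEngine,
            }
            for item in payload.get("organic_results", [])[:max_results]
        ]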

    async def _crawl_page(self, parameters: Dict[str, Any]) -> MethodResult:
        """Crawl web pages"""
        try:
            url = parameters["url"]
            depth = parameters.get("depth", 1)  # accepted but multi-level crawling is not implemented yet
            follow_links = parameters.get("followLinks", False)
            include_images = parameters.get("includeImages", False)
            respect_robots = parameters.get("respectRobots", True)

            # Check robots.txt if required
            if respect_robots:
                if not await self._check_robots_txt(url):
                    return self._create_result(
                        success=False,
                        data={"error": "Crawling not allowed by robots.txt"}
                    )

            # Crawl the page
            async with aiohttp.ClientSession() as session:
                async with session.get(url, headers={"User-Agent": self.userAgent}, timeout=aiohttp.ClientTimeout(total=self.timeout)) as response:
                    if response.status == 200:
                        html = await response.text()
                        soup = BeautifulSoup(html, 'html.parser')

                        # Extract basic information
                        result = {
                            "url": url,
                            "title": soup.title.string if soup.title else None,
                            "description": self._get_meta_description(soup),
                            "links": [],
                            "images": [] if include_images else None,
                            "text": soup.get_text(strip=True),
                            "crawled": datetime.now(UTC).isoformat()
                        }

                        # Extract links if followLinks is True
                        if follow_links:
                            base_url = url
                            for link in soup.find_all('a'):
                                href = link.get('href')
                                if href:
                                    absolute_url = urljoin(base_url, href)
                                    if self._is_valid_url(absolute_url):
                                        result["links"].append({
                                            "url": absolute_url,
                                            "text": link.get_text(strip=True)
                                        })

                        # Extract images if includeImages is True
                        if include_images:
                            for img in soup.find_all('img'):
                                src = img.get('src')
                                if src:
                                    absolute_src = urljoin(url, src)
                                    result["images"].append({
                                        "url": absolute_src,
                                        "alt": img.get('alt', ''),
                                        "title": img.get('title', '')
                                    })

                        return self._create_result(
                            success=True,
                            data=result
                        )
                    else:
                        return self._create_result(
                            success=False,
                            data={"error": f"Failed to fetch URL: {response.status}"}
                        )
        except Exception as e:
            logger.error(f"Error crawling page: {e}")
            return self._create_result(
                success=False,
                data={"error": f"Crawl failed: {str(e)}"}
            )

    async def _extract_content(self, parameters: Dict[str, Any]) -> MethodResult:
        """Extract content from web page"""
        try:
            url = parameters["url"]
            selectors = parameters.get("selectors")
            output_format = parameters.get("format", "text")  # accepted but only text extraction is implemented
            include_metadata = parameters.get("includeMetadata", False)

            async with aiohttp.ClientSession() as session:
                async with session.get(url, headers={"User-Agent": self.userAgent}, timeout=aiohttp.ClientTimeout(total=self.timeout)) as response:
                    if response.status == 200:
                        html = await response.text()
                        soup = BeautifulSoup(html, 'html.parser')

                        # Extract content based on selectors
                        content = {}
                        if selectors:
                            for selector in selectors:
                                elements = soup.select(selector)
                                content[selector] = [elem.get_text() for elem in elements]
                        else:
                            # Default extraction
                            content = {
                                "title": soup.title.string if soup.title else None,
                                "text": soup.get_text(strip=True),
                                "links": [a.get('href') for a in soup.find_all('a')]
                            }

                        # Add metadata if requested
                        if include_metadata:
                            # Fall back to the full page text/links so this also works
                            # when selector-based extraction was used above
                            text_value = content.get("text") or soup.get_text(strip=True)
                            content["metadata"] = {
                                "url": url,
                                "crawled": datetime.now(UTC).isoformat(),
                                "language": self._detect_language(soup),
                                "wordCount": len(text_value.split()),
                                "linksCount": len(content.get("links", []))
                            }

                        return self._create_result(
                            success=True,
                            data={
                                "url": url,
                                "content": content
                            }
                        )
                    else:
                        return self._create_result(
                            success=False,
                            data={"error": f"Failed to fetch URL: {response.status}"}
                        )
        except Exception as e:
            logger.error(f"Error extracting content: {e}")
            return self._create_result(
                success=False,
                data={"error": f"Extraction failed: {str(e)}"}
            )

    def _get_meta_description(self, soup: BeautifulSoup) -> Optional[str]:
        """Extract meta description from HTML"""
        meta_desc = soup.find('meta', attrs={'name': 'description'})
        if meta_desc:
            return meta_desc.get('content')
        return None

    def _is_valid_url(self, url: str) -> bool:
        """Check if URL is valid"""
        try:
            result = urlparse(url)
            return all([result.scheme, result.netloc])
        except (TypeError, ValueError):
            return False

    async def _check_robots_txt(self, url: str) -> bool:
        """Check if URL is allowed by robots.txt"""
        try:
            parsed_url = urlparse(url)
            robots_url = f"{parsed_url.scheme}://{parsed_url.netloc}/robots.txt"

            async with aiohttp.ClientSession() as session:
                async with session.get(robots_url, headers={"User-Agent": self.userAgent}, timeout=aiohttp.ClientTimeout(total=self.timeout)) as response:
                    if response.status == 200:
                        robots_content = await response.text()

                        # Parse robots.txt content (naive prefix matching)
                        user_agent = "*"  # Default to all user agents
                        disallow_paths = []

                        for line in robots_content.splitlines():
                            line = line.strip().lower()
                            if line.startswith("user-agent:"):
                                user_agent = line[11:].strip()
                            elif line.startswith("disallow:") and user_agent in ["*", self.userAgent.lower()]:
                                path = line[9:].strip()
                                if path:
                                    disallow_paths.append(path)

                        # Check if URL path is disallowed
                        url_path = parsed_url.path
                        for disallow_path in disallow_paths:
                            if url_path.startswith(disallow_path):
                                return False

                        return True
                    else:
                        # If robots.txt doesn't exist, assume crawling is allowed
                        return True

        except Exception as e:
            logger.warning(f"Error checking robots.txt for {url}: {str(e)}")
            # If there's an error, assume crawling is allowed
            return True
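
    # --- Hedged sketch (alternative, not used by the code above) ---
    # The hand-rolled parser above only honours the most recent "User-agent"
    # group and does simple prefix matching against the full User-Agent string.
    # The standard library's urllib.robotparser implements the complete matching
    # rules; a synchronous variant could look like this (illustrative only, and
    # the method name is made up for this sketch):
    def _check_robots_txt_stdlib(self, url: str) -> bool:
        """Illustrative robots.txt check using urllib.robotparser (sketch)."""
        from urllib.robotparser import RobotFileParser

        parsed_url = urlparse(url)
        parser = RobotFileParser(f"{parsed_url.scheme}://{parsed_url.netloc}/robots.txt")
        try:
            parser.read()  # blocking fetch; acceptable for a sketch
        except Exception:
            return True  # mirror the permissive fallback used above
        return parser.can_fetch(self.userAgent, url)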

    def _detect_language(self, soup: BeautifulSoup) -> str:
        """Detect page language"""
        try:
            # Try to get language from HTML lang attribute
            if soup.html and soup.html.get('lang'):
                return soup.html.get('lang')

            # Try to get language from meta tag
            meta_lang = soup.find('meta', attrs={'http-equiv': 'content-language'})
            if meta_lang:
                return meta_lang.get('content', 'en')

            # Try to get language from meta charset
            meta_charset = soup.find('meta', attrs={'charset': True})
            if meta_charset:
                charset = meta_charset.get('charset', '').lower()
                if 'utf-8' in charset:
                    return 'en'  # Default to English for UTF-8

            # Try to detect language from content
            # This is a simple heuristic based on common words
            text = soup.get_text().lower()
            common_words = {
                'en': ['the', 'and', 'of', 'to', 'in', 'is', 'that', 'for', 'it', 'with'],
                'es': ['el', 'la', 'los', 'las', 'de', 'y', 'en', 'que', 'por', 'con'],
                'fr': ['le', 'la', 'les', 'de', 'et', 'en', 'que', 'pour', 'avec', 'dans'],
                'de': ['der', 'die', 'das', 'und', 'in', 'den', 'von', 'zu', 'für', 'mit']
            }

            word_counts = {lang: sum(1 for word in words if f' {word} ' in f' {text} ')
                           for lang, words in common_words.items()}

            if word_counts:
                return max(word_counts.items(), key=lambda x: x[1])[0]

            return 'en'  # Default to English if no language detected

        except Exception as e:
            logger.warning(f"Error detecting language: {str(e)}")
            return 'en'  # Default to English on error
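

# --- Hedged usage sketch (illustration only, not part of the original module) ---
# A minimal driver showing how the action/parameter contract declared in
# `actions` maps onto `execute()`. The target URL is a placeholder; running this
# requires the surrounding project (MethodBase, APP_CONFIG) on the import path.
if __name__ == "__main__":
    async def _demo() -> None:
        method = MethodWeb()
        result = await method.execute(
            "extract",
            {"url": "https://example.com", "includeMetadata": True},
        )
        print(result)

    asyncio.run(_demo())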