"""Web method implementation: search, crawl, and content extraction."""
from typing import Dict, Any, Optional
|
|
import logging
|
|
import aiohttp
|
|
import asyncio
|
|
from bs4 import BeautifulSoup
|
|
from urllib.parse import urljoin, urlparse
|
|
import re
|
|
from datetime import datetime, UTC
|
|
import requests
|
|
import time
|
|
import json
|
|
|
|
from modules.methods.methodBase import MethodBase, MethodResult
|
|
from modules.shared.configuration import APP_CONFIG
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
class MethodWeb(MethodBase):
    """Web method implementation for web operations"""

    def __init__(self):
        super().__init__()
        self.name = "web"
        self.description = "Handle web operations like search, crawl, and content extraction"

        # Crawler settings come from the shared application config, using
        # the same keys as the agentWebcrawler component.
        cfg = APP_CONFIG.get
        self.srcApikey = cfg("Agent_Webcrawler_SERPAPI_APIKEY", "")
        self.srcEngine = cfg("Agent_Webcrawler_SERPAPI_ENGINE", "google")
        self.srcCountry = cfg("Agent_Webcrawler_SERPAPI_COUNTRY", "auto")
        # Numeric settings are stored as strings in config; coerce once here.
        self.maxResults = int(cfg("Agent_Webcrawler_SERPAPI_MAX_SEARCH_RESULTS", "5"))
        self.timeout = int(cfg("Agent_Webcrawler_SERPAPI_TIMEOUT", "30"))
        self.userAgent = cfg("Agent_Webcrawler_SERPAPI_USER_AGENT", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36")

        # Missing API key is logged but not fatal: non-search actions
        # (crawl/extract/fetch) still work without SerpAPI.
        if not self.srcApikey:
            logger.error("SerpAPI key not configured")
|
|
|
|
@property
|
|
def actions(self) -> Dict[str, Dict[str, Any]]:
|
|
"""Available actions and their parameters"""
|
|
return {
|
|
"search": {
|
|
"description": "Search web content",
|
|
"retryMax": 3,
|
|
"timeout": 30,
|
|
"parameters": {
|
|
"query": {"type": "string", "required": True},
|
|
"maxResults": {"type": "number", "required": False},
|
|
"filters": {"type": "object", "required": False},
|
|
"searchEngine": {"type": "string", "required": False}
|
|
}
|
|
},
|
|
"crawl": {
|
|
"description": "Crawl web pages",
|
|
"retryMax": 2,
|
|
"timeout": 60,
|
|
"parameters": {
|
|
"url": {"type": "string", "required": True},
|
|
"depth": {"type": "number", "required": False},
|
|
"followLinks": {"type": "boolean", "required": False},
|
|
"includeImages": {"type": "boolean", "required": False},
|
|
"respectRobots": {"type": "boolean", "required": False}
|
|
}
|
|
},
|
|
"extract": {
|
|
"description": "Extract content from web page",
|
|
"retryMax": 2,
|
|
"timeout": 30,
|
|
"parameters": {
|
|
"url": {"type": "string", "required": True},
|
|
"selectors": {"type": "array", "items": "string", "required": False},
|
|
"format": {"type": "string", "required": False},
|
|
"includeMetadata": {"type": "boolean", "required": False}
|
|
}
|
|
}
|
|
}
|
|
|
|
async def execute(self, action: str, parameters: Dict[str, Any], authData: Optional[Dict[str, Any]] = None) -> MethodResult:
|
|
"""Execute web method"""
|
|
try:
|
|
# Validate parameters
|
|
if not await self.validateParameters(action, parameters):
|
|
return self._createResult(
|
|
success=False,
|
|
data={"error": f"Invalid parameters for {action}"}
|
|
)
|
|
|
|
# Execute action
|
|
if action == "fetchUrl":
|
|
return await self._fetchUrl(parameters)
|
|
elif action == "parseContent":
|
|
return await self._parseContent(parameters)
|
|
elif action == "extractData":
|
|
return await self._extractData(parameters)
|
|
else:
|
|
return self._createResult(
|
|
success=False,
|
|
data={"error": f"Unknown action: {action}"}
|
|
)
|
|
|
|
except Exception as e:
|
|
logger.error(f"Error executing web {action}: {e}")
|
|
return self._createResult(
|
|
success=False,
|
|
data={"error": str(e)}
|
|
)
|
|
|
|
async def _fetchUrl(self, parameters: Dict[str, Any]) -> MethodResult:
|
|
"""Fetch content from URL"""
|
|
try:
|
|
url = parameters["url"]
|
|
method = parameters.get("method", "GET")
|
|
headers = parameters.get("headers", {})
|
|
data = parameters.get("data")
|
|
timeout = parameters.get("timeout", 30)
|
|
|
|
async with aiohttp.ClientSession() as session:
|
|
async with session.request(
|
|
method=method,
|
|
url=url,
|
|
headers=headers,
|
|
data=data,
|
|
timeout=timeout
|
|
) as response:
|
|
content = await response.text()
|
|
return self._createResult(
|
|
success=True,
|
|
data={
|
|
"url": url,
|
|
"status": response.status,
|
|
"headers": dict(response.headers),
|
|
"content": content
|
|
}
|
|
)
|
|
except Exception as e:
|
|
logger.error(f"Error fetching URL: {e}")
|
|
return self._createResult(
|
|
success=False,
|
|
data={"error": f"Fetch failed: {str(e)}"}
|
|
)
|
|
|
|
async def _parseContent(self, parameters: Dict[str, Any]) -> MethodResult:
|
|
"""Parse web content"""
|
|
try:
|
|
content = parameters["content"]
|
|
contentType = parameters.get("contentType", "html")
|
|
|
|
if contentType == "html":
|
|
soup = BeautifulSoup(content, "html.parser")
|
|
return self._createResult(
|
|
success=True,
|
|
data={
|
|
"type": "html",
|
|
"title": soup.title.string if soup.title else None,
|
|
"text": soup.get_text(),
|
|
"links": [a.get("href") for a in soup.find_all("a", href=True)],
|
|
"images": [img.get("src") for img in soup.find_all("img", src=True)]
|
|
}
|
|
)
|
|
elif contentType == "json":
|
|
data = json.loads(content)
|
|
return self._createResult(
|
|
success=True,
|
|
data={
|
|
"type": "json",
|
|
"data": data
|
|
}
|
|
)
|
|
else:
|
|
raise ValueError(f"Unsupported content type: {contentType}")
|
|
except Exception as e:
|
|
logger.error(f"Error parsing content: {e}")
|
|
return self._createResult(
|
|
success=False,
|
|
data={"error": f"Parse failed: {str(e)}"}
|
|
)
|
|
|
|
async def _extractData(self, parameters: Dict[str, Any]) -> MethodResult:
|
|
"""Extract data from web content"""
|
|
try:
|
|
content = parameters["content"]
|
|
contentType = parameters.get("contentType", "html")
|
|
selectors = parameters["selectors"]
|
|
|
|
if contentType == "html":
|
|
soup = BeautifulSoup(content, "html.parser")
|
|
results = {}
|
|
|
|
for key, selector in selectors.items():
|
|
elements = soup.select(selector)
|
|
if len(elements) == 1:
|
|
results[key] = elements[0].get_text().strip()
|
|
else:
|
|
results[key] = [el.get_text().strip() for el in elements]
|
|
|
|
return self._createResult(
|
|
success=True,
|
|
data={
|
|
"type": "html",
|
|
"results": results
|
|
}
|
|
)
|
|
elif contentType == "json":
|
|
data = json.loads(content)
|
|
results = {}
|
|
|
|
for key, path in selectors.items():
|
|
value = data
|
|
for part in path.split("."):
|
|
if isinstance(value, dict):
|
|
value = value.get(part)
|
|
elif isinstance(value, list) and part.isdigit():
|
|
value = value[int(part)]
|
|
else:
|
|
value = None
|
|
break
|
|
results[key] = value
|
|
|
|
return self._createResult(
|
|
success=True,
|
|
data={
|
|
"type": "json",
|
|
"results": results
|
|
}
|
|
)
|
|
else:
|
|
raise ValueError(f"Unsupported content type: {contentType}")
|
|
except Exception as e:
|
|
logger.error(f"Error extracting data: {e}")
|
|
return self._createResult(
|
|
success=False,
|
|
data={"error": f"Extract failed: {str(e)}"}
|
|
)
|
|
|
|
async def _search_web(self, parameters: Dict[str, Any]) -> MethodResult:
|
|
"""Search web content"""
|
|
try:
|
|
query = parameters["query"]
|
|
maxResults = parameters.get("maxResults", 10)
|
|
filters = parameters.get("filters", {})
|
|
searchEngine = parameters.get("searchEngine", "google")
|
|
|
|
# Implement search using different engines
|
|
if searchEngine.lower() == "google":
|
|
# Use Google Custom Search API
|
|
# TODO: Implement Google Custom Search API integration
|
|
results = await self._google_search(query, maxResults, filters)
|
|
elif searchEngine.lower() == "bing":
|
|
# Use Bing Web Search API
|
|
# TODO: Implement Bing Web Search API integration
|
|
results = await self._bing_search(query, maxResults, filters)
|
|
else:
|
|
return self._createResult(
|
|
success=False,
|
|
data={"error": f"Unsupported search engine: {searchEngine}"}
|
|
)
|
|
|
|
return self._createResult(
|
|
success=True,
|
|
data={
|
|
"query": query,
|
|
"engine": searchEngine,
|
|
"results": results
|
|
}
|
|
)
|
|
except Exception as e:
|
|
logger.error(f"Error searching web: {e}")
|
|
return self._createResult(
|
|
success=False,
|
|
data={"error": f"Search failed: {str(e)}"}
|
|
)
|
|
|
|
async def _google_search(self, query: str, max_results: int, filters: Dict[str, Any]) -> list:
|
|
"""Search using Google Custom Search API"""
|
|
# TODO: Implement Google Custom Search API
|
|
# This is a placeholder implementation
|
|
return [
|
|
{
|
|
"title": "Example Result",
|
|
"url": "https://example.com",
|
|
"snippet": "Example search result snippet",
|
|
"source": "google"
|
|
}
|
|
]
|
|
|
|
async def _bing_search(self, query: str, max_results: int, filters: Dict[str, Any]) -> list:
|
|
"""Search using Bing Web Search API"""
|
|
# TODO: Implement Bing Web Search API
|
|
# This is a placeholder implementation
|
|
return [
|
|
{
|
|
"title": "Example Result",
|
|
"url": "https://example.com",
|
|
"snippet": "Example search result snippet",
|
|
"source": "bing"
|
|
}
|
|
]
|
|
|
|
async def _crawl_page(self, parameters: Dict[str, Any]) -> MethodResult:
|
|
"""Crawl web pages"""
|
|
try:
|
|
url = parameters["url"]
|
|
depth = parameters.get("depth", 1)
|
|
followLinks = parameters.get("followLinks", False)
|
|
includeImages = parameters.get("includeImages", False)
|
|
respectRobots = parameters.get("respectRobots", True)
|
|
|
|
# Check robots.txt if required
|
|
if respectRobots:
|
|
if not await self._check_robots_txt(url):
|
|
return self._createResult(
|
|
success=False,
|
|
data={"error": "Crawling not allowed by robots.txt"}
|
|
)
|
|
|
|
# Crawl the page
|
|
async with aiohttp.ClientSession() as session:
|
|
async with session.get(url) as response:
|
|
if response.status == 200:
|
|
html = await response.text()
|
|
soup = BeautifulSoup(html, 'html.parser')
|
|
|
|
# Extract basic information
|
|
result = {
|
|
"url": url,
|
|
"title": soup.title.string if soup.title else None,
|
|
"description": self._get_meta_description(soup),
|
|
"links": [],
|
|
"images": [] if includeImages else None,
|
|
"text": soup.get_text(strip=True),
|
|
"crawled": datetime.now(UTC).isoformat()
|
|
}
|
|
|
|
# Extract links if followLinks is True
|
|
if followLinks:
|
|
baseUrl = url
|
|
for link in soup.find_all('a'):
|
|
href = link.get('href')
|
|
if href:
|
|
absoluteUrl = urljoin(baseUrl, href)
|
|
if self._is_valid_url(absoluteUrl):
|
|
result["links"].append({
|
|
"url": absoluteUrl,
|
|
"text": link.get_text(strip=True)
|
|
})
|
|
|
|
# Extract images if includeImages is True
|
|
if includeImages:
|
|
for img in soup.find_all('img'):
|
|
src = img.get('src')
|
|
if src:
|
|
absoluteSrc = urljoin(url, src)
|
|
result["images"].append({
|
|
"url": absoluteSrc,
|
|
"alt": img.get('alt', ''),
|
|
"title": img.get('title', '')
|
|
})
|
|
|
|
return self._createResult(
|
|
success=True,
|
|
data=result
|
|
)
|
|
else:
|
|
return self._createResult(
|
|
success=False,
|
|
data={"error": f"Failed to fetch URL: {response.status}"}
|
|
)
|
|
except Exception as e:
|
|
logger.error(f"Error crawling page: {e}")
|
|
return self._createResult(
|
|
success=False,
|
|
data={"error": f"Crawl failed: {str(e)}"}
|
|
)
|
|
|
|
def _get_meta_description(self, soup: BeautifulSoup) -> Optional[str]:
|
|
"""Extract meta description from HTML"""
|
|
metaDesc = soup.find('meta', attrs={'name': 'description'})
|
|
if metaDesc:
|
|
return metaDesc.get('content')
|
|
return None
|
|
|
|
def _is_valid_url(self, url: str) -> bool:
|
|
"""Check if URL is valid"""
|
|
try:
|
|
result = urlparse(url)
|
|
return all([result.scheme, result.netloc])
|
|
except:
|
|
return False
|
|
|
|
async def _check_robots_txt(self, url: str) -> bool:
|
|
"""Check if URL is allowed by robots.txt"""
|
|
try:
|
|
parsedUrl = urlparse(url)
|
|
robotsUrl = f"{parsedUrl.scheme}://{parsedUrl.netloc}/robots.txt"
|
|
|
|
async with aiohttp.ClientSession() as session:
|
|
async with session.get(robotsUrl, headers={"User-Agent": self.userAgent}, timeout=self.timeout) as response:
|
|
if response.status == 200:
|
|
robotsContent = await response.text()
|
|
|
|
# Parse robots.txt content
|
|
userAgent = "*" # Default to all user agents
|
|
disallowPaths = []
|
|
|
|
for line in robotsContent.splitlines():
|
|
line = line.strip().lower()
|
|
if line.startswith("user-agent:"):
|
|
userAgent = line[11:].strip()
|
|
elif line.startswith("disallow:") and userAgent in ["*", self.userAgent.lower()]:
|
|
path = line[9:].strip()
|
|
if path:
|
|
disallowPaths.append(path)
|
|
|
|
# Check if URL path is disallowed
|
|
urlPath = parsedUrl.path
|
|
for disallowPath in disallowPaths:
|
|
if urlPath.startswith(disallowPath):
|
|
return False
|
|
|
|
return True
|
|
else:
|
|
# If robots.txt doesn't exist, assume crawling is allowed
|
|
return True
|
|
|
|
except Exception as e:
|
|
logger.warning(f"Error checking robots.txt for {url}: {str(e)}")
|
|
# If there's an error, assume crawling is allowed
|
|
return True
|
|
|
|
def _detect_language(self, soup: BeautifulSoup) -> str:
|
|
"""Detect page language"""
|
|
try:
|
|
# Try to get language from HTML lang attribute
|
|
if soup.html and soup.html.get('lang'):
|
|
return soup.html.get('lang')
|
|
|
|
# Try to get language from meta tag
|
|
metaLang = soup.find('meta', attrs={'http-equiv': 'content-language'})
|
|
if metaLang:
|
|
return metaLang.get('content', 'en')
|
|
|
|
# Try to get language from meta charset
|
|
metaCharset = soup.find('meta', attrs={'charset': True})
|
|
if metaCharset:
|
|
charset = metaCharset.get('charset', '').lower()
|
|
if 'utf-8' in charset:
|
|
return 'en' # Default to English for UTF-8
|
|
|
|
# Try to detect language from content
|
|
# This is a simple heuristic based on common words
|
|
text = soup.get_text().lower()
|
|
commonWords = {
|
|
'en': ['the', 'and', 'of', 'to', 'in', 'is', 'that', 'for', 'it', 'with'],
|
|
'es': ['el', 'la', 'los', 'las', 'de', 'y', 'en', 'que', 'por', 'con'],
|
|
'fr': ['le', 'la', 'les', 'de', 'et', 'en', 'que', 'pour', 'avec', 'dans'],
|
|
'de': ['der', 'die', 'das', 'und', 'in', 'den', 'von', 'zu', 'für', 'mit']
|
|
}
|
|
|
|
wordCounts = {lang: sum(1 for word in words if f' {word} ' in f' {text} ')
|
|
for lang, words in commonWords.items()}
|
|
|
|
if wordCounts:
|
|
return max(wordCounts.items(), key=lambda x: x[1])[0]
|
|
|
|
return 'en' # Default to English if no language detected
|
|
|
|
except Exception as e:
|
|
logger.warning(f"Error detecting language: {str(e)}")
|
|
return 'en' # Default to English on error |