gateway/modules/methods/methodWeb.py
2025-07-12 16:26:20 +02:00

1285 lines
54 KiB
Python

"""
Web method module.
Handles web operations using the web service.
"""
import logging
from typing import Dict, Any, List, Optional
from datetime import datetime, UTC
import requests
from bs4 import BeautifulSoup
import time
import uuid
import json # Added for JSON parsing
from modules.chat.methodBase import MethodBase, ActionResult, action
from modules.shared.configuration import APP_CONFIG
logger = logging.getLogger(__name__)
class MethodWeb(MethodBase):
"""Web method implementation for web operations"""
def __init__(self, serviceCenter: Any):
    """Initialize the web method and load SerpAPI search configuration."""
    super().__init__(serviceCenter)
    self.name = "web"
    self.description = "Handle web operations like crawling and scraping"
    # SerpAPI search settings shared with agentWebcrawler's configuration keys.
    cfg = APP_CONFIG.get
    self.srcApikey = cfg("Agent_Webcrawler_SERPAPI_APIKEY", "")
    self.srcEngine = cfg("Agent_Webcrawler_SERPAPI_ENGINE", "google")
    self.srcCountry = cfg("Agent_Webcrawler_SERPAPI_COUNTRY", "auto")
    self.maxResults = int(cfg("Agent_Webcrawler_SERPAPI_MAX_SEARCH_RESULTS", "5"))
    if not self.srcApikey:
        logger.warning("SerpAPI key not configured for web search")
    # Defaults for direct HTTP fetches done by this module.
    self.user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
    self.timeout = 30
def _readUrl(self, url: str) -> Optional[BeautifulSoup]:
    """Fetch a URL and return a BeautifulSoup parser for its HTML content.

    Retries transparently on common transient statuses:
    202 (accepted-but-pending, increasing backoff), 403 (one retry with an
    alternate user agent) and 429 (one retry after a 30 second wait).

    Args:
        url: Absolute http(s) URL to fetch.

    Returns:
        A BeautifulSoup document on success, or None for invalid URLs
        and on any unrecoverable HTTP/network error (errors are logged,
        never raised, so callers can degrade gracefully).
    """
    if not url or not url.startswith(('http://', 'https://')):
        logger.error(f"Invalid URL: {url}")
        return None
    # Enhanced headers to mimic a real browser and avoid trivial bot blocks.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.9,de;q=0.8',
        'Accept-Encoding': 'gzip, deflate, br',
        'DNT': '1',
        'Connection': 'keep-alive',
        'Upgrade-Insecure-Requests': '1',
        'Sec-Fetch-Dest': 'document',
        'Sec-Fetch-Mode': 'navigate',
        'Sec-Fetch-Site': 'none',
        'Cache-Control': 'max-age=0'
    }
    try:
        # Session gives connection pooling; the context manager guarantees
        # the pooled connections are released (previously the session was
        # never closed, leaking sockets on every call).
        with requests.Session() as session:
            session.headers.update(headers)
            # Initial request with allow_redirects
            response = session.get(url, timeout=self.timeout, allow_redirects=True)
            if response.status_code == 200:
                # Success - parse content
                logger.debug(f"Successfully read URL: {url}")
                return BeautifulSoup(response.text, 'html.parser')
            elif response.status_code == 202:
                # Accepted but not ready yet - retry with increasing waits.
                logger.info(f"Status 202 for {url}, retrying with backoff...")
                for wait_time in (1.0, 2.0, 5.0, 10.0):
                    time.sleep(wait_time)
                    retry_response = session.get(url, timeout=self.timeout, allow_redirects=True)
                    if retry_response.status_code == 200:
                        logger.debug(f"Successfully read URL after retry: {url}")
                        return BeautifulSoup(retry_response.text, 'html.parser')
                    elif retry_response.status_code != 202:
                        # Status changed to something else - stop retrying.
                        break
                logger.warning(f"Failed to read URL after retries: {url}")
                return None
            elif response.status_code in (301, 302, 307, 308):
                # allow_redirects=True should already have followed these.
                logger.warning(f"Unexpected redirect status {response.status_code} for {url}")
                return None
            elif response.status_code == 403:
                # Forbidden - some sites whitelist specific browser UAs only.
                logger.warning(f"403 Forbidden for {url}, trying with different user agent...")
                headers['User-Agent'] = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
                session.headers.update(headers)
                retry_response = session.get(url, timeout=self.timeout, allow_redirects=True)
                if retry_response.status_code == 200:
                    logger.debug(f"Successfully read URL with different user agent: {url}")
                    return BeautifulSoup(retry_response.text, 'html.parser')
                logger.error(f"Still getting {retry_response.status_code} for {url}")
                return None
            elif response.status_code == 429:
                # Rate limited - honor it with a flat wait, then retry once.
                logger.warning(f"Rate limited for {url}, waiting 30 seconds...")
                time.sleep(30)
                retry_response = session.get(url, timeout=self.timeout, allow_redirects=True)
                if retry_response.status_code == 200:
                    logger.debug(f"Successfully read URL after rate limit: {url}")
                    return BeautifulSoup(retry_response.text, 'html.parser')
                logger.error(f"Still getting {retry_response.status_code} after rate limit wait for {url}")
                return None
            else:
                # Any other status is treated as an unrecoverable error.
                logger.error(f"HTTP {response.status_code} for {url}")
                return None
    except requests.exceptions.Timeout:
        logger.error(f"Timeout reading URL: {url}")
        return None
    except requests.exceptions.ConnectionError:
        logger.error(f"Connection error reading URL: {url}")
        return None
    except requests.exceptions.RequestException as e:
        logger.error(f"Request error reading URL {url}: {str(e)}")
        return None
    except Exception as e:
        logger.error(f"Unexpected error reading URL {url}: {str(e)}")
        return None
def _extractTitle(self, soup: BeautifulSoup, url: str) -> str:
    """Return the page title, preferring <title> and falling back to <h1>."""
    if not soup:
        return f"Error with {url}"
    title = "No title"
    # Prefer the document's <title> element when present.
    title_tag = soup.find('title')
    if title_tag:
        title = title_tag.text.strip()
    # Fall back to the first heading when no usable title was found.
    if title == "No title":
        h1_tag = soup.find('h1')
        if h1_tag:
            title = h1_tag.text.strip()
    return title
def _extractMainContent(self, soup: BeautifulSoup, max_chars: int = 50000) -> str:
    """Extract the main readable text from a parsed HTML page.

    Strategy: locate a likely main-content container, strip boilerplate
    (scripts, nav, ads, ...), collapse the remaining text to meaningful
    lines, fall back to paragraph-level extraction when the result looks
    too thin, and finally cap the output at max_chars, preferring a
    sentence boundary for the cut.

    Args:
        soup: Parsed document (may be None/falsy -> returns "").
        max_chars: Upper bound on the returned text length.

    Returns:
        Cleaned plain-text content, possibly empty.
    """
    if not soup:
        return ""
    # Candidate containers in priority order; first match wins.
    main_content = None
    content_selectors = [
        'main',
        'article',
        '#content',
        '.content',
        '#main',
        '.main',
        '.post-content',
        '.entry-content',
        '.article-content',
        '.page-content',
        '[role="main"]',
        '.container',
        '.wrapper'
    ]
    for selector in content_selectors:
        content = soup.select_one(selector)
        if content:
            main_content = content
            logger.debug(f"Found main content using selector: {selector}")
            break
    # If no main content found, use the body (or the whole document).
    if not main_content:
        main_content = soup.find('body') or soup
        logger.debug("Using body as main content")
    # Work on a copy so the caller's soup is not mutated by extract().
    content_copy = main_content.copy()
    # Remove elements that don't contribute to main content (less aggressive).
    elements_to_remove = [
        'script', 'style', 'noscript',
        'nav', 'footer', 'header', 'aside',
        '.sidebar', '#sidebar', '.comments', '#comments',
        '.advertisement', '.ads', '.ad', '.banner',
        'iframe', '.social-share', '.share-buttons',
        '.breadcrumb', '.breadcrumbs', '.pagination',
        '.related-posts', '.related-articles',
        '.newsletter', '.subscribe', '.signup',
        '.cookie-notice', '.privacy-notice',
        '.popup', '.modal', '.overlay'
    ]
    for selector in elements_to_remove:
        for element in content_copy.select(selector):
            element.extract()
    # Extract text content with line-per-element formatting.
    text_content = content_copy.get_text(separator='\n', strip=True)
    # Keep only lines long enough to be meaningful prose (> 10 chars).
    cleaned_lines = []
    for line in text_content.split('\n'):
        line = line.strip()
        if line and len(line) > 10:
            cleaned_lines.append(line)
    cleaned_content = '\n\n'.join(cleaned_lines)
    # If content is too short, try paragraph-level extraction instead.
    if len(cleaned_content) < 500:
        logger.debug("Content too short, trying alternative extraction...")
        paragraphs = soup.find_all(['p', 'div', 'section'])
        alt_content = []
        for p in paragraphs:
            text = p.get_text(strip=True)
            if text and len(text) > 20:  # Only meaningful paragraphs
                alt_content.append(text)
        if alt_content:
            cleaned_content = '\n\n'.join(alt_content[:20])  # Limit to first 20 paragraphs
    # Limit to max_chars, trying to cut at a sentence boundary.
    if len(cleaned_content) > max_chars:
        truncated_content = ""
        for sentence in cleaned_content.split('. '):
            if len(truncated_content + sentence) < max_chars:
                truncated_content += sentence + ". "
            else:
                break
        # BUGFIX: if even the first sentence exceeded max_chars, the loop
        # collected nothing and the old code returned "" - hard-truncate
        # to max_chars instead of losing the content entirely.
        cleaned_content = truncated_content.strip() or cleaned_content[:max_chars]
    logger.debug(f"Extracted {len(cleaned_content)} characters of content")
    return cleaned_content
def _checkAccessibility(self, soup: BeautifulSoup) -> Dict[str, Any]:
    """Run basic accessibility checks: image alt text, headings, form labels.

    Returns:
        Dict with "status" ("pass"/"warning"), hard "issues" and softer
        "warnings" lists.
    """
    issues = []
    warnings = []
    # Images need alt text.  BUGFIX: find_all('img', alt='') only matched
    # images whose alt attribute was present but *empty*; images with no
    # alt attribute at all were silently skipped.  Treat both as missing.
    images_without_alt = [img for img in soup.find_all('img') if not img.get('alt')]
    if images_without_alt:
        issues.append(f"Found {len(images_without_alt)} images without alt text")
    # Check for proper heading structure
    headings = soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6'])
    if not headings:
        warnings.append("No headings found - poor document structure")
    # Check for form labels
    forms = soup.find_all('form')
    for form in forms:
        inputs = form.find_all('input')
        for input_elem in inputs:
            # Buttons and hidden fields don't need a visible label.
            if input_elem.get('type') not in ['submit', 'button', 'hidden']:
                if not input_elem.get('id') or not soup.find('label', attrs={'for': input_elem.get('id')}):
                    warnings.append("Form input without proper label")
    return {
        # NOTE: status reflects warnings only; hard issues alone don't flip
        # it - kept as-is to preserve the established result contract.
        "status": "warning" if warnings else "pass",
        "issues": issues,
        "warnings": warnings
    }
def _checkSEO(self, soup: BeautifulSoup) -> Dict[str, Any]:
    """Run lightweight SEO checks: title length, meta description, H1 usage."""
    issues: List[str] = []
    warnings: List[str] = []
    # Title tag should exist and be roughly 10-60 characters long.
    title = soup.find('title')
    if not title:
        issues.append("Missing title tag")
    else:
        title_length = len(title.get_text())
        if title_length < 10:
            warnings.append("Title tag is too short")
        elif title_length > 60:
            warnings.append("Title tag is too long")
    # Meta description should exist and be roughly 50-160 characters long.
    meta_desc = soup.find('meta', attrs={'name': 'description'})
    if not meta_desc:
        warnings.append("Missing meta description")
    elif meta_desc.get('content'):
        description_length = len(meta_desc.get('content'))
        if description_length < 50:
            warnings.append("Meta description is too short")
        elif description_length > 160:
            warnings.append("Meta description is too long")
    # Exactly one H1 heading is the recommended document structure.
    h1_count = len(soup.find_all('h1'))
    if h1_count == 0:
        warnings.append("No H1 tag found")
    elif h1_count > 1:
        warnings.append("Multiple H1 tags found")
    return {
        "status": "warning" if warnings else "pass",
        "issues": issues,
        "warnings": warnings
    }
def _checkPerformance(self, soup: BeautifulSoup, url: str) -> Dict[str, Any]:
    """Gather rough performance indicators from the page markup."""
    warnings: List[str] = []
    # Count the heavier resource references in the document.
    image_count = len(soup.find_all('img'))
    script_count = len(soup.find_all('script', src=True))
    style_count = len(soup.find_all('link', rel='stylesheet'))
    # Flag resource counts that commonly slow down page loads.
    if image_count > 20:
        warnings.append(f"Many images found ({image_count}) - may impact loading speed")
    if script_count > 10:
        warnings.append(f"Many external scripts ({script_count}) - may impact loading speed")
    if style_count > 5:
        warnings.append(f"Many external stylesheets ({style_count}) - may impact loading speed")
    return {
        "status": "warning" if warnings else "pass",
        "warnings": warnings,
        "metrics": {
            "images": image_count,
            "external_scripts": script_count,
            "external_styles": style_count
        }
    }
def _detectJavaScriptRendering(self, soup: BeautifulSoup) -> bool:
    """Heuristically detect pages whose content is rendered client-side."""
    if not soup:
        return False
    # Mount-point ids/attributes used by Angular, React, Vue and generic SPAs.
    framework_probes = (
        ('div', {'ng-app': True}),
        ('div', {'id': 'root'}),
        ('div', {'id': 'app'}),
        ('div', {'id': 'react-root'}),
        ('div', {'id': 'spa-root'}),
        ('div', {'class': 'spa-container'}),
        ('div', {'data-reactroot': True}),
        ('div', {'data-ng-controller': True}),
    )
    for tag_name, attrs in framework_probes:
        if soup.find(tag_name, attrs):
            return True
    # Nearly empty visible text alongside several scripts also suggests
    # the real content is injected by JavaScript at load time.
    return len(soup.get_text(strip=True)) < 100 and len(soup.find_all('script')) > 2
def _extractMetaInformation(self, soup: BeautifulSoup, url: str) -> Dict[str, Any]:
    """Collect common meta information (description, OG tags, charset, canonical).

    All values default to "" so callers can rely on every key existing.

    Args:
        soup: Parsed document.
        url: The page URL, echoed back in the result.

    Returns:
        Dict with url, title, description, keywords, author, language,
        robots, viewport, charset and canonical entries.
    """
    meta_info = {
        "url": url,
        "title": self._extractTitle(soup, url),
        "description": "",
        "keywords": "",
        "author": "",
        "language": "",
        "robots": "",
        "viewport": "",
        "charset": "",
        "canonical": ""
    }
    # Walk every <meta> tag once.  OpenGraph properties can override values
    # derived from plain tags (e.g. og:title beats the <title> element).
    meta_tags = soup.find_all('meta')
    for meta in meta_tags:
        name = meta.get('name', '').lower()
        # Renamed from 'property' so the builtin property() is not shadowed.
        prop = meta.get('property', '').lower()
        content = meta.get('content', '')
        if name == 'description' or prop == 'og:description':
            meta_info['description'] = content
        elif name == 'keywords':
            meta_info['keywords'] = content
        elif name == 'author':
            meta_info['author'] = content
        elif name == 'language':
            meta_info['language'] = content
        elif name == 'robots':
            meta_info['robots'] = content
        elif name == 'viewport':
            meta_info['viewport'] = content
        elif prop == 'og:title':
            meta_info['title'] = content
        elif prop == 'og:url':
            meta_info['canonical'] = content
    # Extract charset from <meta charset="...">.
    charset_meta = soup.find('meta', charset=True)
    if charset_meta:
        meta_info['charset'] = charset_meta.get('charset', '')
    # <link rel="canonical"> wins over og:url when both are present.
    canonical_link = soup.find('link', rel='canonical')
    if canonical_link:
        meta_info['canonical'] = canonical_link.get('href', '')
    return meta_info
def _getAlternativeApproaches(self, url: str, requires_js: bool, content_length: int) -> List[str]:
    """Suggest fallback strategies for sites that are difficult to crawl."""
    approaches: List[str] = []
    # JavaScript-heavy sites need rendering or alternate data channels.
    if requires_js:
        approaches += [
            "Site requires JavaScript rendering - consider using a headless browser",
            "Try accessing the site's API endpoints directly",
            "Look for RSS feeds or sitemaps",
            "Check if the site has a mobile version that's easier to parse",
        ]
    # A near-empty response usually means blocking or authentication.
    if content_length < 100:
        approaches += [
            "Site may have anti-bot protection - try with different user agents",
            "Check if the site requires authentication",
            "Look for alternative URLs (www vs non-www, http vs https)",
            "Try accessing the site's robots.txt for crawling guidelines",
        ]
    # Always append the generic alternatives.
    approaches += [
        "Use the web.search action to find alternative sources",
        "Try the web.scrape action with specific CSS selectors",
        "Check if the site has a public API or data export",
    ]
    return approaches
async def _tryAdvancedAIWebResearch(self, action_type: str, parameters: Dict[str, Any]) -> Optional[Dict[str, Any]]:
    """
    Try to get web research results using advanced AI first

    Args:
        action_type: Type of action ('crawl', 'scrape', or 'search')
        parameters: Action parameters

    Returns:
        Dict with AI results if successful, None if AI call fails
    """
    try:
        # Map each supported action type to its prompt builder.
        prompt_builders = {
            "crawl": self._createCrawlAIPrompt,
            "scrape": self._createScrapeAIPrompt,
            "search": self._createSearchAIPrompt,
        }
        builder = prompt_builders.get(action_type)
        if builder is None:
            logger.warning(f"Unknown action type for AI research: {action_type}")
            return None
        prompt = builder(parameters)
        # The advanced model is optional on the service; bail out quietly.
        if not hasattr(self.service, 'callAiTextAdvanced'):
            logger.warning("Service does not have callAiTextAdvanced method")
            return None
        logger.info(f"Attempting advanced AI web research for {action_type}")
        response = await self.service.callAiTextAdvanced(prompt)
        # The response must parse into the expected JSON structure.
        parsed_result = self._parseAIWebResponse(response, action_type)
        if not parsed_result:
            logger.warning(f"Failed to parse AI response for {action_type}")
            return None
        logger.info(f"Advanced AI web research successful for {action_type}")
        return parsed_result
    except Exception as e:
        # Any failure here means the caller falls back to real crawling.
        logger.warning(f"Advanced AI web research failed for {action_type}: {str(e)}")
        return None
def _createCrawlAIPrompt(self, parameters: Dict[str, Any]) -> str:
    """Create AI prompt for web crawling.

    Builds a knowledge-based research prompt asking the model to describe
    the given URLs from its training data and to answer in the exact JSON
    shape that crawl() produces for a real crawl, so both code paths yield
    interchangeable result documents.
    """
    # Echo the crawl() parameters so the model repeats them in its answer.
    urls = parameters.get("urls", [])
    maxDepth = parameters.get("maxDepth", 2)
    includeImages = parameters.get("includeImages", False)
    followLinks = parameters.get("followLinks", True)
    # The f-string below is runtime data sent to the model.  Double braces
    # ({{ }}) render as literal braces of the requested JSON template; the
    # example timestamps are placeholders the model is expected to replace.
    prompt = f"""
You are an advanced AI research assistant with comprehensive knowledge about websites, companies, and online content. Please provide detailed information about the following URLs based on your extensive training data and knowledge.
URLs to research: {urls}
Max depth: {maxDepth}
Include images: {includeImages}
Follow links: {followLinks}
For each URL, please provide comprehensive information including:
1. Company/organization information and background
2. Main business activities and services
3. Key personnel and leadership
4. Contact information and locations
5. Recent news and developments
6. Industry analysis and market position
7. Related companies and partnerships
8. Website structure and key pages
9. Business model and revenue streams
10. Regulatory compliance and certifications
For each URL, provide:
- url: The original URL
- title: Company/organization name
- content: Comprehensive description and analysis
- content_length: Number of characters in content
- meta_info: Business information object
- links: Related companies and important connections
- images: Company logos or key visuals if known
- requires_javascript: Boolean (usually false for static info)
- alternative_approaches: Additional research suggestions
- timestamp: Current timestamp
Return the results in this exact JSON format:
{{
"urls": {urls},
"maxDepth": {maxDepth},
"includeImages": {includeImages},
"followLinks": {followLinks},
"crawlResults": [
{{
"url": "url_here",
"depth": {maxDepth},
"followLinks": {followLinks},
"extractContent": true,
"title": "company_name",
"content": "comprehensive_company_analysis",
"content_length": 1234,
"meta_info": {{
"url": "url_here",
"title": "company_name",
"description": "business_description",
"keywords": "industry_keywords",
"author": "company_info",
"language": "language_code",
"robots": "robots_info",
"viewport": "viewport_info",
"charset": "charset_info",
"canonical": "canonical_url"
}},
"links": [
{{
"url": "related_company_url",
"text": "company_name"
}}
],
"images": [
{{
"src": "logo_url",
"alt": "company_logo",
"title": "company_name",
"width": "width_value",
"height": "height_value"
}}
],
"requires_javascript": false,
"alternative_approaches": ["approach1", "approach2"],
"timestamp": "2024-01-01T00:00:00Z"
}}
],
"summary": {{
"total_urls": {len(urls)},
"successful_crawls": 0,
"failed_crawls": 0,
"total_content_chars": 0
}},
"timestamp": "2024-01-01T00:00:00Z"
}}
Please provide accurate, comprehensive information about each company/organization based on your knowledge. If you don't have specific information about a URL, provide general industry analysis and suggest alternative research approaches.
"""
    return prompt
def _createScrapeAIPrompt(self, parameters: Dict[str, Any]) -> str:
    """Create AI prompt for web scraping.

    Asks the model to answer in the exact JSON shape that scrape()
    produces for a real scrape, keyed by the same url/selectors/format
    parameters, so the AI path is a drop-in substitute.
    """
    url = parameters.get("url")
    selectors = parameters.get("selectors", {})
    # NOTE(review): 'format' shadows the builtin; kept unchanged because it
    # is interpolated into the prompt template below.
    format = parameters.get("format", "json")
    # Runtime data sent to the model; double braces are literal JSON braces.
    prompt = f"""
You are an advanced AI research assistant with comprehensive knowledge about websites, companies, and online content. Please provide detailed information about the following URL and the specific data requested based on your extensive training data and knowledge.
URL to research: {url}
Data selectors: {selectors}
Output format: {format}
Please provide comprehensive information including:
1. Company/organization background and history
2. Business activities and services offered
3. Key personnel and leadership information
4. Financial information and performance data
5. Market position and competitive analysis
6. Recent news and developments
7. Contact information and locations
8. Industry trends and insights
9. Related companies and partnerships
10. Regulatory and compliance information
For each data selector requested, provide relevant information in the specified format (text, html, or json).
Return the results in this exact JSON format:
{{
"url": "{url}",
"selectors": {selectors},
"format": "{format}",
"scrapedData": {{
"url": "{url}",
"selectors": {selectors},
"format": "{format}",
"content": {{
"company_info": ["comprehensive_company_analysis"],
"business_activities": ["detailed_business_description"],
"leadership": ["key_personnel_information"],
"financial_data": ["financial_performance_analysis"],
"market_position": ["competitive_analysis"],
"recent_news": ["latest_developments"],
"contact_info": ["contact_details"],
"industry_insights": ["market_trends"],
"partnerships": ["related_companies"],
"compliance": ["regulatory_information"]
}},
"timestamp": "2024-01-01T00:00:00Z"
}},
"timestamp": "2024-01-01T00:00:00Z"
}}
Please provide accurate, comprehensive information about the company/organization based on your knowledge. If you don't have specific information about the URL, provide general industry analysis and suggest alternative research approaches.
"""
    return prompt
def _createSearchAIPrompt(self, parameters: Dict[str, Any]) -> str:
    """Create AI prompt for web search.

    Asks the model to answer in the exact JSON shape that search()
    produces for a real SerpAPI search (query/engine/maxResults echoed
    back plus a searchResults object), so the AI path is a drop-in
    substitute for the live search.
    """
    query = parameters.get("query")
    engine = parameters.get("engine", "google")
    maxResults = parameters.get("maxResults", 10)
    # NOTE(review): 'filter' shadows the builtin and, when absent, renders
    # as the string "None" inside the template below; kept unchanged.
    filter = parameters.get("filter")
    # Runtime data sent to the model; double braces are literal JSON braces.
    prompt = f"""
You are an advanced AI research assistant with comprehensive knowledge about companies, industries, and business information. Please provide detailed information about the following search query based on your extensive training data and knowledge.
Search query: {query}
Search engine: {engine}
Max results: {maxResults}
Filter: {filter}
Please provide comprehensive research results including:
1. Relevant company/organization information
2. Industry analysis and market insights
3. Key personnel and leadership details
4. Business activities and services
5. Financial performance and metrics
6. Recent news and developments
7. Competitive landscape analysis
8. Market trends and opportunities
9. Regulatory and compliance information
10. Related companies and partnerships
For each search result, provide:
- title: Company/organization name
- url: Official website or primary source
- snippet: Brief description and key highlights
- content: Comprehensive analysis and insights
Return the results in this exact JSON format:
{{
"query": "{query}",
"engine": "{engine}",
"maxResults": {maxResults},
"filter": "{filter}",
"searchResults": {{
"query": "{query}",
"maxResults": {maxResults},
"results": [
{{
"title": "company_name",
"url": "official_website",
"snippet": "brief_description",
"content": "comprehensive_analysis"
}}
],
"totalFound": 0,
"timestamp": "2024-01-01T00:00:00Z"
}},
"timestamp": "2024-01-01T00:00:00Z"
}}
Please provide accurate, comprehensive information about the search query based on your knowledge. If you don't have specific information about the query, provide general industry analysis and suggest alternative research approaches.
"""
    return prompt
def _parseAIWebResponse(self, response: str, action_type: str) -> Optional[Dict[str, Any]]:
    """Extract and validate the JSON payload embedded in an AI response."""
    try:
        # Locate the outermost JSON object within the free-form reply.
        json_start = response.find('{')
        json_end = response.rfind('}') + 1
        if json_start == -1 or json_end == 0:
            logger.warning(f"No JSON found in AI response: {response}")
            return None
        parsed_data = json.loads(response[json_start:json_end])
        # Each action type must carry its dedicated top-level result key.
        if action_type == "crawl" and "crawlResults" not in parsed_data:
            logger.warning("Invalid crawl response structure")
            return None
        if action_type == "scrape" and "scrapedData" not in parsed_data:
            logger.warning("Invalid scrape response structure")
            return None
        if action_type == "search" and "searchResults" not in parsed_data:
            logger.warning("Invalid search response structure")
            return None
        return parsed_data
    except json.JSONDecodeError as e:
        logger.warning(f"Failed to parse AI response JSON: {str(e)}")
        return None
    except Exception as e:
        logger.warning(f"Error parsing AI response: {str(e)}")
        return None
@action
async def crawl(self, parameters: Dict[str, Any]) -> ActionResult:
    """
    Crawl web pages and extract content with enhanced error handling and content detection

    Parameters:
        urls (List[str]): List of URLs to crawl
        maxDepth (int, optional): Maximum crawl depth (default: 2)
        includeImages (bool, optional): Whether to include images (default: False)
        followLinks (bool, optional): Whether to follow links (default: True)
        expectedDocumentFormats (list, optional): Expected document formats with extension, mimeType, description

    Returns:
        ActionResult whose data contains a single "document" with the crawl
        results per URL and a summary (success/failure counts, total chars).
        Per-URL failures are recorded inside the result; only a top-level
        exception makes the whole action fail.
    """
    try:
        urls = parameters.get("urls")
        maxDepth = parameters.get("maxDepth", 2)
        includeImages = parameters.get("includeImages", False)
        followLinks = parameters.get("followLinks", True)
        expectedDocumentFormats = parameters.get("expectedDocumentFormats", [])
        if not urls:
            return self._createResult(
                success=False,
                data={},
                error="URLs are required"
            )
        # Try advanced AI research first; fall back to real crawling below.
        ai_result = await self._tryAdvancedAIWebResearch("crawl", parameters)
        if ai_result:
            logger.info("Using advanced AI web research for crawl")
            # Reconstruct the result data from the AI response
            result_data = {
                "urls": ai_result.get("urls", []),
                "maxDepth": ai_result.get("maxDepth", 2),
                "includeImages": ai_result.get("includeImages", False),
                "followLinks": ai_result.get("followLinks", True),
                "crawlResults": ai_result.get("crawlResults", []),
                "summary": ai_result.get("summary", {}),
                "timestamp": ai_result.get("timestamp", datetime.now(UTC).isoformat())
            }
            # NOTE(review): the AI path always emits .json and ignores
            # expectedDocumentFormats; only the fallback path honors them.
            return self._createResult(
                success=True,
                data={
                    "documents": [
                        {
                            "documentName": f"web_crawl_{datetime.now(UTC).strftime('%Y%m%d_%H%M%S')}.json",
                            "documentData": result_data,
                            "mimeType": "application/json"
                        }
                    ]
                }
            )
        else:
            logger.info("Advanced AI web research failed, falling back to regular web crawling")
        # Crawl each URL; one failing URL does not abort the others.
        crawl_results = []
        for url in urls:
            try:
                logger.info(f"Crawling URL: {url}")
                # Read the URL with enhanced error handling
                soup = self._readUrl(url)
                if not soup:
                    logger.error(f"Failed to read URL: {url}")
                    crawl_results.append({
                        "error": "Failed to read URL - check if the site is accessible and not blocking crawlers",
                        "url": url,
                        "suggestions": [
                            "Try accessing the URL directly in a browser",
                            "Check if the site requires JavaScript",
                            "Verify the URL is correct and accessible"
                        ]
                    })
                    continue
                # Extract comprehensive information
                title = self._extractTitle(soup, url)
                content = self._extractMainContent(soup)
                meta_info = self._extractMetaInformation(soup, url)
                # Check if content is meaningful
                content_length = len(content)
                if content_length < 100:
                    # Thin pages are recorded with a "warning" key but no
                    # "error" key, so the summary still counts them as
                    # successful crawls.
                    logger.warning(f"Very little content extracted from {url} ({content_length} chars)")
                    crawl_results.append({
                        "url": url,
                        "title": title,
                        "content": content,
                        "content_length": content_length,
                        "warning": "Very little content extracted - site may require JavaScript or have anti-bot protection",
                        "meta_info": meta_info,
                        "timestamp": datetime.now(UTC).isoformat()
                    })
                    continue
                # Extract links if requested (absolute http(s) links only).
                links = []
                if followLinks:
                    for link in soup.find_all('a', href=True):
                        href = link.get('href')
                        if href and href.startswith(('http://', 'https://')):
                            link_text = link.get_text(strip=True)
                            if link_text:  # Only include links with text
                                links.append({
                                    'url': href,
                                    'text': link_text[:100]
                                })
                # Extract images if requested
                images = []
                if includeImages:
                    for img in soup.find_all('img', src=True):
                        src = img.get('src')
                        if src:
                            images.append({
                                'src': src,
                                'alt': img.get('alt', ''),
                                'title': img.get('title', ''),
                                'width': img.get('width', ''),
                                'height': img.get('height', '')
                            })
                # Check for JavaScript rendering requirements
                requires_js = self._detectJavaScriptRendering(soup)
                # Get alternative approaches if needed
                alternative_approaches = self._getAlternativeApproaches(url, requires_js, content_length)
                crawl_results.append({
                    "url": url,
                    "depth": maxDepth,
                    "followLinks": followLinks,
                    "extractContent": True,
                    "title": title,
                    "content": content,
                    "content_length": content_length,
                    "meta_info": meta_info,
                    "links": links[:20],  # Limit to first 20 links
                    "images": images[:20],  # Limit to first 20 images
                    "requires_javascript": requires_js,
                    "alternative_approaches": alternative_approaches,
                    "timestamp": datetime.now(UTC).isoformat()
                })
                logger.info(f"Successfully crawled {url} - extracted {content_length} characters")
            except Exception as e:
                logger.error(f"Error crawling web page {url}: {str(e)}")
                crawl_results.append({
                    "error": str(e),
                    "url": url,
                    "suggestions": [
                        "Check if the URL is accessible",
                        "Try with a different user agent",
                        "Verify the site doesn't block automated access"
                    ]
                })
        # Create result data; presence of an "error" key distinguishes
        # failed from successful per-URL results in the summary.
        result_data = {
            "urls": urls,
            "maxDepth": maxDepth,
            "includeImages": includeImages,
            "followLinks": followLinks,
            "crawlResults": crawl_results,
            "summary": {
                "total_urls": len(urls),
                "successful_crawls": len([r for r in crawl_results if "error" not in r]),
                "failed_crawls": len([r for r in crawl_results if "error" in r]),
                "total_content_chars": sum([r.get("content_length", 0) for r in crawl_results if "content_length" in r])
            },
            "timestamp": datetime.now(UTC).isoformat()
        }
        # Determine output format based on expected formats
        output_extension = ".json"  # Default
        output_mime_type = "application/json"  # Default
        if expectedDocumentFormats and len(expectedDocumentFormats) > 0:
            # Use the first expected format
            expected_format = expectedDocumentFormats[0]
            output_extension = expected_format.get("extension", ".json")
            output_mime_type = expected_format.get("mimeType", "application/json")
            logger.info(f"Using expected format: {output_extension} ({output_mime_type})")
        else:
            logger.info("No expected format specified, using default .json format")
        return self._createResult(
            success=True,
            data={
                "documents": [
                    {
                        "documentName": f"web_crawl_{datetime.now(UTC).strftime('%Y%m%d_%H%M%S')}{output_extension}",
                        "documentData": result_data,
                        "mimeType": output_mime_type
                    }
                ]
            }
        )
    except Exception as e:
        logger.error(f"Error crawling web pages: {str(e)}")
        return self._createResult(
            success=False,
            data={},
            error=str(e)
        )
@action
async def scrape(self, parameters: Dict[str, Any]) -> ActionResult:
    """
    Scrape specific data from web pages

    Parameters:
        url (str): URL to scrape
        selectors (Dict[str, str]): CSS selectors for data extraction
        format (str, optional): Output format (default: "json")
        expectedDocumentFormats (list, optional): Expected document formats with extension, mimeType, description

    Returns:
        ActionResult whose data contains a single "document" with the
        scraped content keyed by selector name.
    """
    try:
        url = parameters.get("url")
        selectors = parameters.get("selectors")
        # NOTE(review): 'format' shadows the builtin; kept for API stability.
        format = parameters.get("format", "json")
        expectedDocumentFormats = parameters.get("expectedDocumentFormats", [])
        if not url or not selectors:
            return self._createResult(
                success=False,
                data={},
                error="URL and selectors are required"
            )
        # Try advanced AI research first; fall back to real scraping below.
        ai_result = await self._tryAdvancedAIWebResearch("scrape", parameters)
        if ai_result:
            logger.info("Using advanced AI web research for scrape")
            # Reconstruct the result data from the AI response
            result_data = {
                "url": ai_result.get("url"),
                "selectors": ai_result.get("selectors"),
                "format": ai_result.get("format"),
                "scrapedData": ai_result.get("scrapedData"),
                "timestamp": ai_result.get("timestamp", datetime.now(UTC).isoformat())
            }
            # NOTE(review): the AI path always emits .json and ignores
            # expectedDocumentFormats; only the fallback path honors them.
            return self._createResult(
                success=True,
                data={
                    "documents": [
                        {
                            "documentName": f"web_scrape_{datetime.now(UTC).strftime('%Y%m%d_%H%M%S')}.json",
                            "documentData": result_data,
                            "mimeType": "application/json"
                        }
                    ]
                }
            )
        else:
            logger.info("Advanced AI web research failed, falling back to regular web scraping")
        # Read the URL
        soup = self._readUrl(url)
        if not soup:
            return self._createResult(
                success=False,
                data={},
                error="Failed to read URL"
            )
        extracted_content = {}
        if selectors:
            # Extract content using provided selectors; every selector key
            # gets an entry, empty list when nothing matched.
            for selector_name, selector in selectors.items():
                elements = soup.select(selector)
                if elements:
                    if format == "text":
                        extracted_content[selector_name] = [elem.get_text(strip=True) for elem in elements]
                    elif format == "html":
                        extracted_content[selector_name] = [str(elem) for elem in elements]
                    else:
                        # Any other format (including "json") falls back to text.
                        extracted_content[selector_name] = [elem.get_text(strip=True) for elem in elements]
                else:
                    extracted_content[selector_name] = []
        else:
            # NOTE(review): unreachable with the current guard above --
            # "not selectors" already returned an error. Kept as a safety
            # net in case the guard is relaxed later.
            # Auto-extract common elements
            extracted_content = {
                "title": self._extractTitle(soup, url),
                "main_content": self._extractMainContent(soup),
                "headings": [h.get_text(strip=True) for h in soup.find_all(['h1', 'h2', 'h3'])],
                "links": [a.get('href') for a in soup.find_all('a', href=True) if a.get('href').startswith(('http://', 'https://'))],
                "images": [img.get('src') for img in soup.find_all('img', src=True)]
            }
        scrape_result = {
            "url": url,
            "selectors": selectors,
            "format": format,
            "content": extracted_content,
            "timestamp": datetime.now(UTC).isoformat()
        }
        # Create result data (echoes url/selectors/format at both levels to
        # mirror the AI-path response shape).
        result_data = {
            "url": url,
            "selectors": selectors,
            "format": format,
            "scrapedData": scrape_result,
            "timestamp": datetime.now(UTC).isoformat()
        }
        # Determine output format based on expected formats
        output_extension = f".{format}"  # Default to format parameter
        output_mime_type = "application/json"  # Default
        if expectedDocumentFormats and len(expectedDocumentFormats) > 0:
            # Use the first expected format
            expected_format = expectedDocumentFormats[0]
            output_extension = expected_format.get("extension", f".{format}")
            output_mime_type = expected_format.get("mimeType", "application/json")
            logger.info(f"Using expected format: {output_extension} ({output_mime_type})")
        else:
            logger.info(f"No expected format specified, using format parameter: {format}")
        return self._createResult(
            success=True,
            data={
                "documents": [
                    {
                        "documentName": f"web_scrape_{datetime.now(UTC).strftime('%Y%m%d_%H%M%S')}{output_extension}",
                        "documentData": result_data,
                        "mimeType": output_mime_type
                    }
                ]
            }
        )
    except Exception as e:
        logger.error(f"Error scraping web page: {str(e)}")
        return self._createResult(
            success=False,
            data={},
            error=str(e)
        )
@action
async def search(self, parameters: Dict[str, Any]) -> ActionResult:
"""
Search web content
Parameters:
query (str): Search query
engine (str, optional): Search engine to use (default: "google")
maxResults (int, optional): Maximum number of results (default: 10)
filter (str, optional): Additional search filters
expectedDocumentFormats (list, optional): Expected document formats with extension, mimeType, description
"""
try:
query = parameters.get("query")
engine = parameters.get("engine", "google")
maxResults = parameters.get("maxResults", 10)
filter = parameters.get("filter")
expectedDocumentFormats = parameters.get("expectedDocumentFormats", [])
if not query:
return self._createResult(
success=False,
data={},
error="Search query is required"
)
# Try advanced AI research first
ai_result = await self._tryAdvancedAIWebResearch("search", parameters)
if ai_result:
logger.info("Using advanced AI web research for search")
# Reconstruct the result data from the AI response
result_data = {
"query": ai_result.get("query"),
"engine": ai_result.get("engine"),
"maxResults": ai_result.get("maxResults"),
"filter": ai_result.get("filter"),
"searchResults": ai_result.get("searchResults"),
"timestamp": ai_result.get("timestamp", datetime.now(UTC).isoformat())
}
return self._createResult(
success=True,
data={
"documents": [
{
"documentName": f"web_search_{datetime.now(UTC).strftime('%Y%m%d_%H%M%S')}.json",
"documentData": result_data,
"mimeType": "application/json"
}
]
}
)
else:
logger.info("Advanced AI web research failed, falling back to regular web search")
# Search web content using Google search via SerpAPI
try:
if not self.srcApikey:
search_result = {
"error": "SerpAPI key not configured",
"query": query
}
else:
# Get user language from service center if available
userLanguage = "en" # Default language
if hasattr(self.service, 'user') and hasattr(self.service.user, 'language'):
userLanguage = self.service.user.language
# Format the search request for SerpAPI
params = {
"engine": self.srcEngine,
"q": query,
"api_key": self.srcApikey,
"num": min(maxResults, self.maxResults), # Number of results to return
"hl": userLanguage # User language
}
# Make the API request
response = requests.get("https://serpapi.com/search", params=params, timeout=self.timeout)
response.raise_for_status()
# Parse JSON response
search_results = response.json()
# Extract organic results
results = []
if "organic_results" in search_results:
for result in search_results["organic_results"][:maxResults]:
# Extract title
title = result.get("title", "No title")
# Extract URL
url = result.get("link", "No URL")
# Extract snippet
snippet = result.get("snippet", "No description")
# Get actual page content
try:
targetPageSoup = self._readUrl(url)
content = self._extractMainContent(targetPageSoup)
except Exception as e:
logger.warning(f"Error extracting content from {url}: {str(e)}")
content = f"Error extracting content: {str(e)}"
results.append({
'title': title,
'url': url,
'snippet': snippet,
'content': content
})
# Limit number of results
if len(results) >= maxResults:
break
else:
logger.warning(f"No organic results found in SerpAPI response for: {query}")
search_result = {
"query": query,
"maxResults": maxResults,
"results": results,
"totalFound": len(results),
"timestamp": datetime.now(UTC).isoformat()
}
except Exception as e:
logger.error(f"Error searching web: {str(e)}")
search_result = {
"error": str(e),
"query": query
}
# Create result data
result_data = {
"query": query,
"engine": engine,
"maxResults": maxResults,
"filter": filter,
"searchResults": search_result,
"timestamp": datetime.now(UTC).isoformat()
}
# Determine output format based on expected formats
output_extension = ".json" # Default
output_mime_type = "application/json" # Default
if expectedDocumentFormats and len(expectedDocumentFormats) > 0:
# Use the first expected format
expected_format = expectedDocumentFormats[0]
output_extension = expected_format.get("extension", ".json")
output_mime_type = expected_format.get("mimeType", "application/json")
logger.info(f"Using expected format: {output_extension} ({output_mime_type})")
else:
logger.info("No expected format specified, using default .json format")
return self._createResult(
success=True,
data={
"documents": [
{
"documentName": f"web_search_{datetime.now(UTC).strftime('%Y%m%d_%H%M%S')}{output_extension}",
"documentData": result_data,
"mimeType": output_mime_type
}
]
}
)
except Exception as e:
logger.error(f"Error searching web: {str(e)}")
return self._createResult(
success=False,
data={},
error=str(e)
)