# gateway/modules/agentservice_agent_webcrawler.py
"""
WebCrawler-Agent for research and retrieval of information from the web.
Adapted for the refactored Core-Module with language-agnostic detection.
"""
import json
import logging
import time
import traceback
from typing import List, Dict, Any, Optional
from urllib.parse import quote_plus, unquote
from bs4 import BeautifulSoup
import requests
from modules.agentservice_base import BaseAgent
from modules.agentservice_utils import MessageUtils, LoggingUtils
from modules.agentservice_protocol import AgentCommunicationProtocol
from modules.utility import APP_CONFIG
logger = logging.getLogger(__name__)
class WebcrawlerAgent(BaseAgent):
"""Agent for Web Research and Information Retrieval"""
def __init__(self):
"""Initialize the WebCrawler Agent"""
super().__init__()
self.id = "webcrawler"
self.name = "Webscraper"
self.type = "scraper"
self.description = "Researches information on the web"
self.capabilities = "web_search,information_retrieval,data_collection,source_verification,content_integration"
self.result_format = "SearchResults"
# Add enhanced document capabilities
self.supports_documents = True
self.document_capabilities = ["read", "create"]
self.required_context = ["workflow_id"]
self.document_handler = None
# Initialize AI service
self.ai_service = None
# Initialize protocol
self.protocol = AgentCommunicationProtocol()
# Initialize utility classes
self.message_utils = MessageUtils()
# Web-Crawling configuration
self.max_url = int(APP_CONFIG.get("Connector_AiWebscraping_MAX_URLS"))
self.max_key = int(APP_CONFIG.get("Connector_AiWebscraping_MAX_SEARCH_KEYWORDS"))
self.max_result = int(APP_CONFIG.get("Connector_AiWebscraping_MAX_SEARCH_RESULTS"))
self.timeout = int(APP_CONFIG.get("Connector_AiWebscraping_TIMEOUT"))
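# Illustrative values for the APP_CONFIG keys read above (assumed defaults, not taken from the source):
#   Connector_AiWebscraping_MAX_URLS            e.g. "3"
#   Connector_AiWebscraping_MAX_SEARCH_KEYWORDS e.g. "3"
#   Connector_AiWebscraping_MAX_SEARCH_RESULTS  e.g. "5"
#   Connector_AiWebscraping_TIMEOUT             e.g. "10"  (seconds)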
def get_agent_info(self) -> Dict[str, Any]:
"""Get agent information for agent registry"""
info = super().get_agent_info()
info.update({
"metadata": {
"max_url": self.max_url,
"max_result": self.max_result,
"timeout": self.timeout
}
})
return info
def set_document_handler(self, document_handler):
"""Set the document handler for file operations"""
self.document_handler = document_handler
async def process_message(self, message: Dict[str, Any], context: Dict[str, Any] = None) -> Dict[str, Any]:
"""
Process a message and conduct web research if appropriate.
Args:
message: The message to process
context: Additional context
Returns:
The generated response or rejection if not a web research request
"""
# Extract workflow_id from context or message
workflow_id = context.get("workflow_id") if context else message.get("workflow_id", "unknown")
# Get or create logging_utils
log_func = context.get("log_func") if context else None
logging_utils = LoggingUtils(workflow_id, log_func)
# Create response structure
response = {
"role": "assistant",
"content": "",
"agent_id": self.id,
"agent_type": self.type,
"agent_name": self.name,
"result_format": self.result_format,
"workflow_id": workflow_id
}
try:
# Get the query from the message
prompt = await self.get_prompt(message)
# Check if this is explicitly a web research request using AI
is_web_research = await self._is_web_research_request_ai(prompt)
if not is_web_research:
# Reject non-web research requests
logging_utils.info("Request rejected: not a web research task", "agents")
response["content"] = "This request doesn't appear to require web research. Redirecting to a more appropriate agent."
response["status"] = "rejected"
response["rejection_reason"] = "not_web_research"
return response
# Continue with web research process
logging_utils.info(f"Web research for: {prompt[:50]}...", "agents")
# Send status update using protocol
if log_func:
status_message = self.protocol.create_status_update_message(
status_description="Starting web research",
sender_id=self.id,
status="in_progress",
progress=0.0,
context_id=workflow_id
)
log_func(workflow_id, status_message.content, "info", self.id, self.name)
# Update progress using protocol - 10% for starting the query analysis
if log_func:
status_message = self.protocol.create_status_update_message(
status_description=f"Analyzing search strategy for: {prompt[:30]}...",
sender_id=self.id,
status="in_progress",
progress=0.1,
context_id=workflow_id
)
log_func(workflow_id, status_message.content, "info", self.id, self.name)
# Prepare the web query strategy
try:
# Log progress - 20% for query strategy preparation
if log_func:
status_message = self.protocol.create_status_update_message(
status_description="Creating search strategy",
sender_id=self.id,
status="in_progress",
progress=0.2,
context_id=workflow_id
)
log_func(workflow_id, status_message.content, "info", self.id, self.name)
# Get the query strategy
content_text = await self.ai_service.call_api(
messages=[
{
"role": "system",
"content": "You are a web research expert who develops precise search strategies."
},
{
"role": "user",
"content": f"""Create a comprehensive web research strategy for the task = '{prompt.replace("'","")}'. Return the results as a Python dictionary with these specific keys. If specific url are provided and the task requires analysis only on the provided url, then leave 'skey' open.
'url': A list of maximum {self.max_url} specific URLs extracted from the task string.
'skey': A list of maximum {self.max_key} key sentences to search for on the web. These should be precise, diverse, and targeted to get the most relevant information.
Format your response as a valid json object with these two keys. Do not include any explanatory text or markdown outside of the object definition.
"""
}
]
)
# Try to parse the JSON result
if content_text.startswith("```json"):
# Find the closing fence of the JSON block (search after the opening ```json marker)
end_marker = "```"
end_index = content_text.rfind(end_marker, 7)
if end_index != -1:
# Extract the JSON content without the markdown markers
content_text = content_text[7:end_index].strip()
try:
logger.info(f"Valid json received: {str(content_text)}")
pjson = json.loads(content_text)
# Log parsed search strategy
search_keys = pjson.get("skey", [])
search_urls = pjson.get("url", [])
if search_keys:
logging_utils.info(f"Searching for {len(search_keys)} key terms: {', '.join(search_keys[:2])}...", "agents")
if search_urls:
logging_utils.info(f"Searching in {len(search_urls)} direct URLs: {', '.join(search_urls[:2])}...", "agents")
# Log progress - 30% for starting the search
if log_func:
status_message = self.protocol.create_status_update_message(
status_description="Starting web search",
sender_id=self.id,
status="in_progress",
progress=0.3,
context_id=workflow_id
)
log_func(workflow_id, status_message.content, "info", self.id, self.name)
# Execute the search
results = []
total_tasks = len(search_keys) + len(search_urls)
tasks_completed = 0
# Process search keywords
for keyword in search_keys:
logging_utils.info(f"Searching web for: '{keyword}'", "agents")
# Log specific keyword search progress
if log_func:
progress_pct = 0.3 + (0.5 * (tasks_completed / total_tasks))
status_message = self.protocol.create_status_update_message(
status_description=f"Searching for: '{keyword}'",
sender_id=self.id,
status="in_progress",
progress=progress_pct,
context_id=workflow_id
)
log_func(workflow_id, status_message.content, "info", self.id, self.name)
keyword_results = self.search_web(keyword)
results.extend(keyword_results)
logging_utils.info(f"Found: {len(keyword_results)} results for '{keyword}'", "agents")
tasks_completed += 1
# Process direct URLs
for url in search_urls:
logging_utils.info(f"Extracting content from: {url}", "agents")
# Log specific URL extraction progress
if log_func:
progress_pct = 0.3 + (0.5 * (tasks_completed / total_tasks))
status_message = self.protocol.create_status_update_message(
status_description=f"Reading URL: {url}",
sender_id=self.id,
status="in_progress",
progress=progress_pct,
context_id=workflow_id
)
log_func(workflow_id, status_message.content, "info", self.id, self.name)
soup = self.read_url(url)
# Extract title from the page if it exists
if isinstance(soup, BeautifulSoup):
title_tag = soup.find('title')
title = title_tag.text.strip() if title_tag else "No title"
# Alternative: You could also look for h1 tags if the title tag is missing
if title == "No title":
h1_tag = soup.find('h1')
if h1_tag:
title = h1_tag.text.strip()
else:
# Handle the case where soup is an error message string
title = "Error fetching page"
result = self.parse_result(soup, title, url)
results.append(result)
logging_utils.info(f"Extracted: '{title}' from {url}", "agents")
tasks_completed += 1
# Log progress - 80% for processing results
if log_func:
status_message = self.protocol.create_status_update_message(
status_description=f"Analyzing {len(results)} search results",
sender_id=self.id,
status="in_progress",
progress=0.8,
context_id=workflow_id
)
log_func(workflow_id, status_message.content, "info", self.id, self.name)
# Process results for the final output
logging_utils.info(f"Analyzing {len(results)} web results", "agents")
# Generate summaries for each result
processed_results = []
for i, result in enumerate(results):
result_data_limited = self.limit_text_for_api(result['data'], max_tokens=int(APP_CONFIG.get("Connector_AiWebscraping_MAX_TOKENS")))
# Log individual result processing
logging_utils.info(f"Analyzing result {i+1}/{len(results)}: {result['title'][:30]}...", "agents")
web_answer_instructions = f"""
Summarize this search result according to the original request in approximately 2000 characters. Original request = '{prompt.replace("'","")}'
Focus on the most important insights and connect them to the original request. You can skip any introduction.
Extract only relevant and high-quality information related to the request, and present it in a clear format. Provide a balanced view of the researched information.
Here is the search result:
{result_data_limited}
"""
content_summary = await self.ai_service.call_api(
messages=[
{
"role": "system",
"content": "You are an information analyst who precisely and relevantly summarizes web content."
},
{
"role": "user",
"content": web_answer_instructions
}
]
)
# Limit summary to ~2000 characters
content_summary = content_summary[:2000]
processed_result = {
"title": result['title'],
"url": result['url'],
"snippet": result['snippet'],
"summary": content_summary
}
processed_results.append(processed_result)
# Log progress - 90% for creating final summary
if log_func:
status_message = self.protocol.create_status_update_message(
status_description="Creating overall summary",
sender_id=self.id,
status="in_progress",
progress=0.9,
context_id=workflow_id
)
log_func(workflow_id, status_message.content, "info", self.id, self.name)
# Create the final combined summary
all_summaries = "\n\n".join([r["summary"] for r in processed_results])
all_summaries_limited = self.limit_text_for_api(all_summaries, max_tokens=int(APP_CONFIG.get("Connector_AiWebscraping_MAX_TOKENS")))
logging_utils.info("Creating overall summary of web research", "agents")
final_summary = await self.ai_service.call_api(
messages=[
{
"role": "system",
"content": "You create concise summaries of research findings."
},
{
"role": "user",
"content": f"Please summarize these findings in 5-6 sentences: {all_summaries_limited}\n"
}
]
)
# Get the language of the request to use for result headers
request_language_analysis = await self.ai_service.call_api(
messages=[
{
"role": "system",
"content": "You determine the language of a text and return only the language name."
},
{
"role": "user",
"content": f"What language is this text in? Only respond with the language name: {prompt}"
}
]
)
# Get headers in the right language
headers = await self._get_localized_headers(request_language_analysis.strip())
# Format the final result
final_result = f"## {headers['web_research_results']}\n\n### {headers['summary']}\n{final_summary}\n\n### {headers['detailed_results']}\n"
for i, result in enumerate(processed_results, 1):
final_result += f"\n\n[{i}] {result['title']}\n{headers['url']}: {result['url']}\n{headers['snippet']}: {result['snippet']}\n{headers['content']}: {result['summary']}"
# Set the content in the response
response["content"] = final_result
# Log completion - 100% progress
if log_func:
status_message = self.protocol.create_status_update_message(
status_description="Web research completed",
sender_id=self.id,
status="completed",
progress=1.0,
context_id=workflow_id
)
log_func(workflow_id, status_message.content, "info", self.id, self.name)
logging_utils.info("Web research successfully completed", "agents")
return response
except json.JSONDecodeError as e:
logging_utils.error(f"Error parsing JSON data: {e}", "error")
# Fallback for JSON parse error
if log_func:
status_message = self.protocol.create_status_update_message(
status_description=f"Error parsing search strategy: {str(e)}",
sender_id=self.id,
status="error",
progress=0.0,
context_id=workflow_id
)
log_func(workflow_id, status_message.content, "error", self.id, self.name)
# Use a simple fallback approach
logging_utils.info("Using fallback search strategy with direct query", "agents")
# Perform a direct search with the original query
results = self.search_web(prompt)
# Process and format results directly
if results:
result_text = "## Web Research Results (Fallback Mode)\n\n"
for i, result in enumerate(results, 1):
result_text += f"### [{i}] {result['title']}\n"
result_text += f"URL: {result['url']}\n"
result_text += f"Snippet: {result['snippet']}\n\n"
response["content"] = result_text
else:
response["content"] = "## Web Research Results\n\nNo relevant results were found."
return response
except Exception as e:
error_msg = f"Error during web research: {str(e)}"
logging_utils.error(error_msg, "error")
# Create error response using protocol
error_message = self.protocol.create_error_message(
error_description=error_msg,
sender_id=self.id,
error_type="web_search",
error_details={"traceback": traceback.format_exc()},
context_id=workflow_id
)
# Log error status
if log_func:
status_message = self.protocol.create_status_update_message(
status_description=f"Error during web research: {str(e)}",
sender_id=self.id,
status="error",
progress=1.0,
context_id=workflow_id
)
log_func(workflow_id, status_message.content, "error", self.id, self.name)
response["content"] = f"## Error during web research\n\n{error_msg}\n\n```\n{traceback.format_exc()}\n```"
return response
except Exception as e:
error_msg = f"Error during web research: {str(e)}"
logging_utils.error(error_msg, "error")
# Create error response using protocol
error_message = self.protocol.create_error_message(
error_description=error_msg,
sender_id=self.id,
error_type="web_search",
error_details={"traceback": traceback.format_exc()},
context_id=workflow_id
)
# Log error status
if log_func:
status_message = self.protocol.create_status_update_message(
status_description=f"Error during web research: {str(e)}",
sender_id=self.id,
status="error",
progress=1.0,
context_id=workflow_id
)
log_func(workflow_id, status_message.content, "error", self.id, self.name)
response["content"] = f"## Error during web research\n\n{error_msg}\n\n```\n{traceback.format_exc()}\n```"
return response
async def _is_web_research_request_ai(self, prompt: str) -> bool:
"""
Uses AI to determine if a prompt requires web research, making it language-agnostic.
Args:
prompt: The user prompt
Returns:
True if this is explicitly a web research request, False otherwise
"""
if not self.ai_service:
# Fallback to simpler detection if AI service isn't available
return self._simple_web_detection(prompt)
try:
# Create a prompt to analyze whether this is a web research request
analysis_prompt = f"""
Analyze the following request and determine if it explicitly requires web research or online information.
REQUEST: {prompt}
A request requires web research if:
1. It explicitly asks to search for information online
2. It contains URLs or references to websites
3. It requests current information that would be available on the web
4. It asks to find information from web sources
5. It implicitly requires up-to-date information from the internet
ONLY respond with a single word - either "YES" if web research is required, or "NO" if it is not.
DO NOT include any explanation, just the answer YES or NO.
"""
# Call AI to analyze
response = await self.ai_service.call_api(
messages=[
{
"role": "system",
"content": "You determine if a request requires web research. Always answer with only YES or NO."
},
{
"role": "user",
"content": analysis_prompt
}
]
)
# Clean the response
response = response.strip().upper()
# Check if the response indicates it's a web research task
if "YES" in response:
return True
else:
return False
except Exception as e:
# Log error but don't fail, fall back to simpler detection
logger.warning(f"Error using AI to detect web research request: {str(e)}")
return self._simple_web_detection(prompt)
def _simple_web_detection(self, prompt: str) -> bool:
"""
Simpler fallback method to detect web research requests based on URLs.
Args:
prompt: The user prompt
Returns:
True if there are clear URL indicators, False otherwise
"""
# URLs in the prompt strongly indicate web research
url_indicators = ["http://", "https://", "www.", ".com", ".org", ".net", ".edu", ".gov"]
# Check for URL patterns in the prompt
contains_url = any(indicator in prompt.lower() for indicator in url_indicators)
return contains_url
async def _get_localized_headers(self, language: str) -> Dict[str, str]:
"""
Get localized headers for the web research results based on detected language.
Args:
language: The detected language
Returns:
Dictionary with localized headers
"""
# Default English headers
headers = {
"web_research_results": "Web Research Results",
"summary": "Summary",
"detailed_results": "Detailed Results",
"url": "URL",
"snippet": "Snippet",
"content": "Content"
}
# If language detection failed or is English, return defaults
if not language or language.lower() in ["english", "en"]:
return headers
try:
# Use AI to translate headers to the detected language
translation_prompt = f"""
Translate these web research result headers to {language}:
Web Research Results
Summary
Detailed Results
URL
Snippet
Content
Return a JSON object with these keys:
web_research_results, summary, detailed_results, url, snippet, content
"""
# Call AI for translation
response = await self.ai_service.call_api(
messages=[
{
"role": "system",
"content": "You translate headers to the specified language and return them as JSON."
},
{
"role": "user",
"content": translation_prompt
}
]
)
# Extract JSON
import re
json_match = re.search(r'\{.*\}', response, re.DOTALL)
if json_match:
translated_headers = json.loads(json_match.group(0))
return translated_headers
except Exception as e:
# Log error but continue with English headers
logger.warning(f"Error translating headers to {language}: {str(e)}")
return headers
async def get_prompt(self, message_context: Dict[str, Any]) -> str:
task = message_context.get("content", "")
return task.strip()
async def get_web_query(self, message_context: Dict[str, Any]) -> str:
prompt = await self.get_prompt(message_context)
result_json = await self.run_web_query(prompt)
result_data = ""
summary_src = ""
logger.info(f"Web analysis prompt '{prompt}' delivers {len(result_json)} results.")
if isinstance(result_json, list):
total_tokens = 0
for i, result in enumerate(result_json, 1):
# Limit content size for each result
result_data_limited = self.limit_text_for_api(result['data'], max_tokens=int(APP_CONFIG.get("Connector_AiWebscraping_MAX_TOKENS")))  # Limit per-result content to the configured token budget
web_answer_instructions = f"""
Summarize this search result according to the original request in approximately 2000 characters. Original request = '{prompt.replace("'","")}'
Focus on the most important insights and connect them to the original request. You can skip any introduction.
Extract only relevant and high-quality information related to the request, and present it in a clear format. Provide a balanced view of the researched information.
Here is the search result:
{result_data_limited}
"""
# Count tokens in the instructions to ensure we don't exceed API limits
instruction_tokens = self.count_tokens(web_answer_instructions)
if total_tokens + instruction_tokens > 60000:
logger.warning(f"Skipping result {i} to avoid exceeding token limit")
break
total_tokens += instruction_tokens
# Additional instructions for web research
content_text = await self.ai_service.call_api(
messages=[
{
"role": "system",
"content": "You are an information analyst who precisely and relevantly summarizes web content."
},
{
"role": "user",
"content": web_answer_instructions
}
]
)
# Create a summary but ensure we stay within token limits
content_summary = content_text[:2000] # Limit to ~2000 characters
result_data += f"\n\n[{i}] {result['title']}\nURL: {result['url']}\nSnippet: {result['snippet']}\nContent: {content_summary}"
summary_src += f"\n{content_summary}"
# Update token count
total_tokens += self.count_tokens(content_summary) + 100 # Add buffer for formatting
else:
result_data = "no data received"
logger.info(f"Web analysis result sent {len(result_data)}B")
# Additional summary
summary = ""
if len(summary_src) > 1:
# Limit summary source to ensure we don't exceed API limits
summary_src_limited = self.limit_text_for_api(summary_src, max_tokens=int(APP_CONFIG.get("Connector_AiWebscraping_MAX_TOKENS")))
summary = await self.ai_service.call_api(
messages=[
{
"role": "system",
"content": "You create concise summaries of research findings."
},
{
"role": "user",
"content": f"Please summarize these findings in 5-6 sentences: {summary_src_limited}\n"
}
]
)
# Format the final result
result = f"## Web Research Results\n\n### Summary\n{summary}\n\n### Detailed Results{result_data}"
return result
async def run_web_query(self, prompt: str) -> List[Dict]:
if prompt=="":
return []
ptext=f"""Create a comprehensive web research strategy for the task = '{prompt.replace("'","")}'. Return the results as a Python dictionary with these specific keys. If specific url are provided and the task requires analysis only on the provided url, then leave 'skey' open.
'url': A list of maximum {self.max_url} specific URLs extracted from the task string.
'skey': A list of maximum {self.max_key} key sentences to search for on the web. These should be precise, diverse, and targeted to get the most relevant information.
Format your response as a valid json object with these two keys. Do not include any explanatory text or markdown outside of the object definition.
"""
content_text = await self.ai_service.call_api(
messages=[
{
"role": "system",
"content": "You are a web research expert who develops precise search strategies."
},
{
"role": "user",
"content": ptext
}
]
)
# Remove markdown formatting if present
if content_text.startswith("```json"):
# Find the closing fence of the JSON block (search after the opening ```json marker)
end_marker = "```"
end_index = content_text.rfind(end_marker, 7)
if end_index != -1:
# Extract the JSON content without the markdown markers
content_text = content_text[7:end_index].strip()
# Now parse the JSON
try:
logger.info(f"Valid json received: {str(content_text)}")
pjson = json.loads(content_text)
# Now call scrape_json with the parsed dictionary
result_json = await self.scrape_json(pjson)
return result_json
except json.JSONDecodeError as e:
logger.error(f"Failed to parse JSON: {e}")
logger.error(f"Cleaned content: {content_text[:100]}...")
return []
async def scrape_json(self, research_strategy: Dict[str, List]) -> List[Dict]:
"""
Scrapes web content based on a research strategy JSON.
Args:
research_strategy: A dictionary containing:
- 'skey': List of search keywords
- 'url': List of direct URLs to scrape
Returns:
List of result dictionaries with 'title', 'url', 'snippet', and 'data' entries
"""
logger.info("Starting JSON-based web scraping")
results = []
# Validate input structure
if not isinstance(research_strategy, dict):
logger.error("Invalid research_strategy format: not a dictionary")
return {"error": "Invalid research_strategy format: not a dictionary"}
keys = research_strategy.get("skey", [])
direct_urls = research_strategy.get("url", [])
if not isinstance(keys, list) or not isinstance(direct_urls, list):
logger.error("Invalid research_strategy format: keys, or url is not a list")
return {"error": "Invalid research_strategy format: keys, or url is not a list"}
# Process search keywords through search engine
for keyword in keys:
logger.info(f"Processing keyword: {keyword}")
found_results = self.search_web(keyword) # List with Dict: title,url,snippet,data
logger.info(f"... {len(found_results)} results found")
results.extend(found_results)
# Process direct URLs
logger.info(f"Processing {len(direct_urls)} direct URLs")
for url in direct_urls:
if any(r.get('url') == url for r in results):
logger.info(f"Skipping already scraped URL: {url}")
continue
soup = self.read_url(url)
# Extract title from the page if it exists
if isinstance(soup, BeautifulSoup):
title_tag = soup.find('title')
title = title_tag.text.strip() if title_tag else "No title"
# Alternative: You could also look for h1 tags if the title tag is missing
if title == "No title":
h1_tag = soup.find('h1')
if h1_tag:
title = h1_tag.text.strip()
else:
# Handle the case where soup is an error message string
title = "Error fetching page"
results.append(self.parse_result(soup, title, url))
logger.info(f"JSON scraping completed. Scraped {len(results)} URLs in total")
return results
def extract_main_content(self, soup: BeautifulSoup, max_chars: int = int(APP_CONFIG.get("Connector_AiWebscraping_MAX_TOKENS"))) -> str:
"""
Extract the main content from an HTML page while limiting character count.
Args:
soup: BeautifulSoup object containing the page content
max_chars: Maximum number of characters to extract
Returns:
Extracted main content as string
"""
if not isinstance(soup, BeautifulSoup):
return str(soup)[:max_chars]
# Try to find main content elements in order of priority
main_content = None
for selector in ['main', 'article', '#content', '.content', '#main', '.main']:
content = soup.select_one(selector)
if content:
main_content = content
break
# If no main content found, use the body
if not main_content:
main_content = soup.find('body') or soup
# Remove script, style, nav, footer elements that don't contribute to main content
for element in main_content.select('script, style, nav, footer, header, aside, .sidebar, #sidebar, .comments, #comments, .advertisement, .ads, iframe'):
element.extract()
# Extract text content
text_content = main_content.get_text(separator=' ', strip=True)
# Limit to max_chars
return text_content[:max_chars]
def tokenize_for_counting(self, text: str) -> List[str]:
"""
Simple token counter for estimating token usage.
This is an approximation since the exact tokenization depends on the model.
Args:
text: Input text
Returns:
List of tokens
"""
# Simple tokenization by splitting on whitespace and punctuation
import re
return re.findall(r'\w+|[^\w\s]', text)
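# Illustrative behaviour of the regex-based approximation above:
#   tokenize_for_counting("Hello, world!") -> ["Hello", ",", "world", "!"]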
def count_tokens(self, text: str) -> int:
"""
Count the approximate number of tokens in a text.
Args:
text: Input text
Returns:
Estimated token count
"""
tokens = self.tokenize_for_counting(text)
return len(tokens)
def limit_text_for_api(self, text: str, max_tokens: int = int(APP_CONFIG.get("Connector_AiWebscraping_MAX_TOKENS"))) -> str:
"""
Limit the text to a maximum number of tokens.
Args:
text: Input text
max_tokens: Maximum number of tokens allowed
Returns:
Limited text
"""
if not text:
return ""
tokens = self.tokenize_for_counting(text)
# If text is already under the limit, return as is
if len(tokens) <= max_tokens:
return text
# Otherwise, truncate text to max_tokens
return " ".join(tokens[:max_tokens]) + "... [content truncated due to length]"
def search_web(self, query: str) -> List[Dict]:
formatted_query = quote_plus(query)
url = f"{APP_CONFIG("Connector_AiWebscraping_SEARCH_ENGINE")}{formatted_query}"
search_results_soup = self.read_url(url)
if not search_results_soup or not search_results_soup.select('.result'):
logger.warning(f"No search results found for: {query}")
return []
# Extract search results
results = []
# Find all result containers
result_elements = search_results_soup.select('.result')
for result in result_elements:
# Extract title
title_element = result.select_one('.result__a')
title = title_element.text.strip() if title_element else 'No title'
# Extract URL (DuckDuckGo uses redirects, need to extract from href param)
url_element = title_element.get('href') if title_element else ''
extracted_url = 'No URL'
if url_element:
# Extract the actual URL from DuckDuckGo's redirect
if url_element.startswith('/d.js?q='):
start = url_element.find('?q=') + 3 # Skip '?q='
end = url_element.find('&', start) if '&' in url_element[start:] else None
extracted_url = unquote(url_element[start:end])
# Make sure the URL has the correct protocol prefix
if not extracted_url.startswith(('http://', 'https://')):
if not extracted_url.startswith('//'):
extracted_url = 'https://' + extracted_url
else:
extracted_url = 'https:' + extracted_url
else:
extracted_url = url_element
# Extract snippet directly from search results page
snippet_element = result.select_one('.result__snippet')
snippet = snippet_element.text.strip() if snippet_element else 'No description'
# Now fetch the actual page content for the data field
target_page_soup = self.read_url(extracted_url)
# Use the new content extraction method to limit content size
content = self.extract_main_content(target_page_soup, max_chars=int(APP_CONFIG.get("Connector_AiWebscraping_MAX_TOKENS")))
results.append({
'title': title,
'url': extracted_url,
'snippet': snippet,
'data': content
})
# Limit the number of results if needed
if len(results) >= self.max_result:
break
return results
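# Each dictionary appended above has the following shape (values are illustrative):
#   {"title": "Page title", "url": "https://example.com/page",
#    "snippet": "Result snippet", "data": "Extracted main page text ..."}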
def read_url(self, url: str) -> BeautifulSoup:
"""
Reads a URL and returns a BeautifulSoup parser for the content.
Returns an empty BeautifulSoup object for errors.
Args:
url: The URL to read
Returns:
BeautifulSoup object with the content or empty for errors
"""
headers = {
'User-Agent': APP_CONFIG.get("Connector_AiWebscraping_USER_AGENT"),
'Accept': 'text/html,application/xhtml+xml,application/xml',
'Accept-Language': 'en-US,en;q=0.9',
}
try:
# Initial request
response = requests.get(url, headers=headers, timeout=int(APP_CONFIG.get("Connector_AiWebscraping_TIMEOUT")))
# Polling for status 202
if response.status_code == 202:
# Up to four retry attempts with increasing intervals
backoff_times = [0.5, 1.0, 2.0, 5.0]  # 0.5s, 1s, 2s, then 5s
for wait_time in backoff_times:
time.sleep(wait_time) # Wait with increasing time
response = requests.get(url, headers=headers, timeout=int(APP_CONFIG.get("Connector_AiWebscraping_TIMEOUT")))
# If no 202 anymore, then break
if response.status_code != 202:
break
# For other error statuses, raise an error
response.raise_for_status()
# Parse HTML
return BeautifulSoup(response.text, 'html.parser')
except Exception as e:
# Create empty BeautifulSoup object
return BeautifulSoup("<html><body></body></html>", 'html.parser')
def parse_result(self, data: BeautifulSoup, title: str, url: str) -> Dict[str, str]:
"""
Parse a BeautifulSoup object into a result dictionary.
Args:
data: BeautifulSoup object containing the page content
title: Page title
url: Page URL
Returns:
Dictionary with result data
"""
# Extract content using the main content extraction method
content = self.extract_main_content(data, max_chars=int(APP_CONFIG.get("Connector_AiWebscraping_MAX_TOKENS")))
result = {
'title': title,
'url': url,
'snippet': 'No description', # Default value
'data': content
}
return result
# Singleton instance
_webcrawler_agent = None
def get_webcrawler_agent():
"""Returns a singleton instance of the WebCrawler Agent"""
global _webcrawler_agent
if _webcrawler_agent is None:
_webcrawler_agent = WebcrawlerAgent()
return _webcrawler_agent
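# Minimal usage sketch (illustrative only): assumes an `ai_service` object exposing the async
# call_api(messages=...) method used throughout this module; `my_ai_service` is hypothetical.
#
#     import asyncio
#     agent = get_webcrawler_agent()
#     agent.ai_service = my_ai_service
#     reply = asyncio.run(agent.process_message(
#         {"content": "Find recent articles on solid-state batteries"},
#         context={"workflow_id": "wf-example-1"},
#     ))
#     print(reply["content"])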