# gateway/modules/chat_agent_webcrawler.py
"""
Webcrawler agent for research and retrieval of information from the web.
Optimized for the new task-based processing.
"""
import logging
import json
import re
import time
from typing import Dict, Any, List
from urllib.parse import quote_plus, unquote
from bs4 import BeautifulSoup
import requests
import markdown
from modules.chat_registry import AgentBase
from modules.configuration import APP_CONFIG
logger = logging.getLogger(__name__)
class AgentWebcrawler(AgentBase):
    """Agent for web research and information retrieval."""

    def __init__(self):
        """Initialize the webcrawler agent."""
        super().__init__()
        self.name = "webcrawler"
        self.description = "Conducts web research and collects information from online sources"
        self.capabilities = [
            "web_search",
            "information_retrieval",
            "data_collection",
            "search_results_analysis",
            "webpage_content_extraction",
        ]
        # Web crawling configuration
        self.max_url = int(APP_CONFIG.get("Agent_Webcrawler_MAX_URLS", "5"))
        self.max_key = int(APP_CONFIG.get("Agent_Webcrawler_MAX_SEARCH_KEYWORDS", "3"))
        self.max_result = int(APP_CONFIG.get("Agent_Webcrawler_MAX_SEARCH_RESULTS", "5"))
        self.timeout = int(APP_CONFIG.get("Agent_Webcrawler_TIMEOUT", "30"))

    def set_dependencies(self, ai_service=None):
        """Set external dependencies for the agent."""
        self.ai_service = ai_service

    async def process_task(self, task: Dict[str, Any]) -> Dict[str, Any]:
        """
        Process a standardized task structure and conduct web research.

        Args:
            task: A dictionary containing:
                - task_id: Unique ID for this task
                - prompt: The main instruction for the agent
                - input_documents: List of documents to process
                - output_specifications: List of required output documents
                - context: Additional contextual information

        Returns:
            A dictionary containing:
                - feedback: Text response explaining the research results
                - documents: List of created document objects
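
        Example task (illustrative values, not taken from a real run):
            {
                "task_id": "t-001",
                "prompt": "Research https://example.com and summarize the findings",
                "input_documents": [],
                "output_specifications": [
                    {"label": "results.md", "description": "Markdown summary"}
                ],
                "context": {}
            }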
"""
        try:
            # Extract relevant task information
            prompt = task.get("prompt", "")
            output_specs = task.get("output_specifications", [])

            # Check if an AI service is available
            if not self.ai_service:
                logger.error("No AI service configured for the Webcrawler agent")
                return {
                    "feedback": "The Webcrawler agent is not properly configured.",
                    "documents": []
                }

            # Check if this is a web research request
            is_web_research = await self._is_web_research_request(prompt)
            if not is_web_research:
                logger.info("Request rejected: not a web research task")
                return {
                    "feedback": "This request doesn't appear to require web research.",
                    "documents": []
                }

            # Proceed with web research
            logger.info(f"Web research for: {prompt[:50]}...")

            # Create the search strategy
            search_strategy = await self._create_search_strategy(prompt)
            search_keys = search_strategy.get("skey", [])
            search_urls = search_strategy.get("url", [])
            if search_keys:
                logger.info(f"Searching for {len(search_keys)} key terms: {', '.join(search_keys[:2])}...")
            if search_urls:
                logger.info(f"Searching in {len(search_urls)} direct URLs: {', '.join(search_urls[:2])}...")

            # Execute the search
            results = []

            # Process search terms
            for keyword in search_keys:
                logger.info(f"Searching the web for: '{keyword}'")
                keyword_results = self._search_web(keyword)
                results.extend(keyword_results)
                logger.info(f"Found: {len(keyword_results)} results for '{keyword}'")

            # Process direct URLs
            for url in search_urls:
                logger.info(f"Extracting content from: {url}")
                soup = self._read_url(url)
                # Extract the title from the page, if available
                title = self._extract_title(soup, url)
                result = self._parse_result(soup, title, url)
                results.append(result)
                logger.info(f"Extracted: '{title}' from {url}")

            # Process the results for the final output
            logger.info(f"Analyzing {len(results)} web results")

            # Generate summaries for each result
            processed_results = []
            for i, result in enumerate(results):
                result_data_limited = self._limit_text(result['data'], max_chars=10000)
                logger.info(f"Analyzing result {i+1}/{len(results)}: {result['title'][:30]}...")
                # Defensive check: without an AI service, create a minimal summary
                if not self.ai_service:
                    content_summary = f"Extract from {result['url']} ({len(result_data_limited)} characters)"
                else:
                    # Generate the summary with AI
                    content_summary = await self._summarize_result(result_data_limited, prompt)
                processed_result = {
                    "title": result['title'],
                    "url": result['url'],
                    "snippet": result['snippet'],
                    "summary": content_summary
                }
                processed_results.append(processed_result)

            # Create the overall summary
            all_summaries = "\n\n".join([r["summary"] for r in processed_results])
            all_summaries_limited = self._limit_text(all_summaries, max_chars=10000)
            logger.info("Creating overall summary of web research")
            if not self.ai_service:
                final_summary = f"Summary of {len(processed_results)} web research results"
            else:
                final_summary = await self.ai_service.call_api([
                    {"role": "system", "content": "You create concise summaries of research results."},
                    {"role": "user", "content": f"Please summarize these findings in 5-6 sentences: {all_summaries_limited}\n"}
                ])

            # Get localized headers for the output
            headers = await self._get_localized_headers(prompt)

            # Create document objects based on the output specifications
            generated_documents = []

            # Generate the appropriate document for each requested output
            for spec in output_specs:
                output_label = spec.get("label", "")
                output_description = spec.get("description", "")
                # Determine the output format based on the file extension
                format_type = self._determine_format_type(output_label)
                # Generate content based on the format and requirements
                if format_type in ("markdown", "text"):
                    content = self._format_results_as_markdown(processed_results, final_summary, headers)
                elif format_type == "html":
                    md_content = self._format_results_as_markdown(processed_results, final_summary, headers)
                    content = markdown.markdown(md_content)
                elif format_type == "json":
                    content = json.dumps({
                        "summary": final_summary,
                        "results": processed_results
                    }, indent=2, ensure_ascii=False)
                elif format_type == "csv":
                    csv_lines = ["Title,URL,Snippet"]
                    for result in processed_results:
                        # Escape double quotes by doubling them; quoting each
                        # field also makes embedded commas safe
                        title = result["title"].replace('"', '""')
                        url = result["url"].replace('"', '""')
                        snippet = result["snippet"].replace('"', '""')
                        csv_line = f'"{title}","{url}","{snippet}"'
                        csv_lines.append(csv_line)
                    content = "\n".join(csv_lines)
                else:
                    # Default: Markdown
                    content = self._format_results_as_markdown(processed_results, final_summary, headers)
                # Add the document to the results list
                generated_documents.append({
                    "label": output_label,
                    "content": content
                })

            # If no specific outputs were requested, return a standard document
            if not output_specs:
                content = self._format_results_as_markdown(processed_results, final_summary, headers)
                generated_documents.append({
                    "label": "web_research_results.md",
                    "content": content
                })

            # Create the feedback for the response
            feedback = f"I conducted web research on '{prompt[:50]}...' and found {len(processed_results)} relevant results."
            logger.info("Web research completed successfully")
            return {
                "feedback": feedback,
                "documents": generated_documents
            }
        except Exception as e:
            error_msg = f"Error during web research: {str(e)}"
            logger.error(error_msg)
            return {
                "feedback": f"An error occurred during the web research: {str(e)}",
                "documents": []
            }

    def _determine_format_type(self, output_label: str) -> str:
        """
        Determine the format type based on the filename.

        Args:
            output_label: Output filename

        Returns:
            Format type (markdown, html, text, json, csv)
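
        Example (doctest-style illustration, assuming default configuration):
            >>> AgentWebcrawler()._determine_format_type("report.JSON")
            'json'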
"""
output_label_lower = output_label.lower()
if output_label_lower.endswith(".md"):
return "markdown"
elif output_label_lower.endswith(".html"):
return "html"
elif output_label_lower.endswith(".txt"):
return "text"
elif output_label_lower.endswith(".json"):
return "json"
elif output_label_lower.endswith(".csv"):
return "csv"
else:
# Default to markdown
return "markdown"

    def _format_results_as_markdown(self, results: List[Dict[str, Any]],
                                    summary: str, headers: Dict[str, str]) -> str:
        """
        Format research results as markdown.

        Args:
            results: List of results
            summary: Summary of all results
            headers: Localized headers

        Returns:
            Formatted markdown text
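
        Note:
            `headers` is expected to contain the keys web_research_results,
            summary, detailed_results, url, snippet, and content, as produced
            by _get_localized_headers.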
"""
md_content = f"# {headers['web_research_results']}\n\n"
md_content += f"## {headers['summary']}\n\n{summary}\n\n"
if results:
md_content += f"## {headers['detailed_results']}\n\n"
for i, result in enumerate(results, 1):
md_content += f"### {i}. {result['title']}\n\n"
md_content += f"**{headers['url']}**: {result['url']}\n\n"
md_content += f"**{headers['snippet']}**: {result['snippet']}\n\n"
md_content += f"**{headers['content']}**: {result['summary']}\n\n"
# Add separator between results (except for the last one)
if i < len(results):
md_content += "---\n\n"
return md_content

    async def _is_web_research_request(self, prompt: str) -> bool:
        """
        Use AI to determine whether a request requires web research.

        Args:
            prompt: The user request

        Returns:
            True if the request requires web research, False otherwise
        """
        if not self.ai_service:
            # Fall back to simpler detection if no AI service is available
            return self._simple_web_detection(prompt)
        try:
            # Create a prompt to analyze whether this is a web research request
            analysis_prompt = f"""
Analyze the following request and determine if it explicitly requires web research or online information.

REQUEST: {prompt}

A request requires web research if:
1. It explicitly asks for searching information online
2. It contains URLs or references to websites
3. It requests current information that would be available on the web
4. It asks for information from web sources
5. It implicitly requires current information from the internet

Reply ONLY with a single word - either "YES" if web research is required, or "NO" if not.
"""
            # Call AI for the analysis
            response = await self.ai_service.call_api([
                {"role": "system", "content": "You determine if a request requires web research. Always respond with just YES or NO."},
                {"role": "user", "content": analysis_prompt}
            ])
            # Clean the response and check it
            response = response.strip().upper()
            return "YES" in response
        except Exception as e:
            # Log the error but don't fail; fall back to simpler detection
            logger.warning(f"Error in AI detection of web research requests: {str(e)}")
            return self._simple_web_detection(prompt)

    def _simple_web_detection(self, prompt: str) -> bool:
        """
        Simpler fallback method for detecting web research requests, based on
        URL patterns and web-related keywords.

        Args:
            prompt: The user request

        Returns:
            True if there are clear URL or web-term indicators, False otherwise
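
        Example (doctest-style illustration, assuming default configuration):
            >>> AgentWebcrawler()._simple_web_detection("Please look up https://example.org")
            True
            >>> AgentWebcrawler()._simple_web_detection("Write a poem for my mother")
            False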
"""
# URLs in the request strongly indicate web research
url_indicators = ["http://", "https://", "www.", ".com", ".org", ".net", ".edu", ".gov"]
web_terms = ["search", "find online", "look up", "web", "internet", "website"]
# Check for URL patterns in the request
contains_url = any(indicator in prompt.lower() for indicator in url_indicators)
contains_web_term = any(term in prompt.lower() for term in web_terms)
return contains_url or contains_web_term

    async def _create_search_strategy(self, prompt: str) -> Dict[str, List[str]]:
        """
        Create a search strategy based on the request.

        Args:
            prompt: The user request

        Returns:
            Search strategy with URLs and search terms
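
        Example return value (illustrative; the keys match those read by
        process_task):
            {"skey": ["term one", "term two"], "url": ["https://example.com"]}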
"""
if not self.ai_service:
# Fallback to simple strategy
return {"skey": [prompt], "url": []}
try:
# AI prompt to create a search strategy
strategy_prompt = f"""Create a comprehensive web research strategy for the following task:
'{prompt.replace("'","")}'
Return the results as a Python dictionary with these specific keys:
'url': A list of up to {self.max_url} specific URLs extracted from the task.
'skey': A list of up to {self.max_key} key phrases to search for on the web. These should be precise, diverse, and targeted to get the most relevant information.
If specific URLs are given and the task only requires analyzing these URLs, leave 'skey' empty.
Format your response as a valid JSON object with these two keys. Don't add any explanatory text.
"""
# Call AI for search strategy
content_text = await self.ai_service.call_api([
{"role": "system", "content": "You are a web research expert who develops precise search strategies."},
{"role": "user", "content": strategy_prompt}
])
# Remove JSON code block markers if present
if content_text.startswith("```json"):
end_marker = "```"
end_index = content_text.rfind(end_marker)
if end_index != -1:
content_text = content_text[7:end_index].strip()
elif content_text.startswith("```"):
end_marker = "```"
end_index = content_text.rfind(end_marker)
if end_index != -1:
content_text = content_text[3:end_index].strip()
# Extract only the JSON part (if surrounded by text)
json_match = re.search(r'(\{.*\})', content_text, re.DOTALL)
if json_match:
content_text = json_match.group(1)
# Parse JSON and return
strategy = json.loads(content_text)
return strategy
except Exception as e:
logger.error(f"Error creating search strategy: {str(e)}")
# Simple fallback strategy
return {"skey": [prompt], "url": []}

    async def _summarize_result(self, result_data: str, original_prompt: str) -> str:
        """
        Create a summary of a search result using AI.

        Args:
            result_data: The data to summarize
            original_prompt: The original request

        Returns:
            Summary of the result
        """
        if not self.ai_service:
            return f"Summary of {len(result_data)} characters not available (AI service not available)"
        try:
            # Instructions for the summarization
            summary_prompt = f"""
Summarize this search result according to the original request in about 2000 characters.
Original request = '{original_prompt.replace("'", "")}'

Focus on the most important findings and connect them to the original request.
Extract only relevant and high-quality information.

Here's the search result:
{result_data}
"""
            # Call AI for the summary
            summary = await self.ai_service.call_api([
                {"role": "system", "content": "You are an information analyst who summarizes web content precisely and relevantly."},
                {"role": "user", "content": summary_prompt}
            ])
            # Limit to ~2000 characters
            return summary[:2000]
        except Exception as e:
            logger.error(f"Error summarizing result: {str(e)}")
            return "Error creating summary"

    async def _get_localized_headers(self, text: str) -> Dict[str, str]:
        """
        Determine localized headers for web research results based on the
        detected language.

        Args:
            text: Text for language detection

        Returns:
            Dictionary with localized headers
        """
        # Default English headers
        headers = {
            "web_research_results": "Web Research Results",
            "summary": "Summary",
            "detailed_results": "Detailed Results",
            "url": "URL",
            "snippet": "Snippet",
            "content": "Content"
        }
        if not self.ai_service:
            return headers
        try:
            # Detect the language
            language_prompt = f"What language is this text written in? Answer with just the language name: {text[:200]}"
            language = await self.ai_service.call_api([
                {"role": "system", "content": "You determine the language of a text and return only the language name."},
                {"role": "user", "content": language_prompt}
            ])
            language = language.strip().lower()
            # If the text is English or language detection failed, return the default headers
            if language in ["english", "en", ""]:
                return headers
            # Otherwise, ask the AI to translate the headers into the detected language
            translation_prompt = f"""
Translate these web research result headers to {language}:
Web Research Results
Summary
Detailed Results
URL
Snippet
Content

Return a JSON object with these keys:
web_research_results, summary, detailed_results, url, snippet, content
"""
            # Call AI for the translation
            response = await self.ai_service.call_api([
                {"role": "system", "content": "You translate headers to the specified language and return them as JSON."},
                {"role": "user", "content": translation_prompt}
            ])
            # Extract the JSON
            json_match = re.search(r'\{.*\}', response, re.DOTALL)
            if json_match:
                try:
                    translated_headers = json.loads(json_match.group(0))
                    return translated_headers
                except json.JSONDecodeError:
                    logger.warning("Error parsing translated headers JSON")
        except Exception as e:
            # Log the error but continue with English headers
            logger.warning(f"Error translating headers: {str(e)}")
        return headers

    def _search_web(self, query: str) -> List[Dict[str, str]]:
        """
        Conduct a web search and return the results.

        Args:
            query: The search query

        Returns:
            List of search results
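
        Each result is a dict with the keys title, url, snippet and data
        (the extracted page text), for example (illustrative values):
            {"title": "Example Domain", "url": "https://example.com",
             "snippet": "An example page", "data": "Example Domain ..."}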
"""
formatted_query = quote_plus(query)
url = f"{APP_CONFIG.get('Agent_Webcrawler_SEARCH_ENGINE', 'https://html.duckduckgo.com/html/?q=')}{formatted_query}"
search_results_soup = self._read_url(url)
if not isinstance(search_results_soup, BeautifulSoup) or not search_results_soup.select('.result'):
logger.warning(f"No search results found for: {query}")
return []
# Extract search results
results = []
# Find all result containers
result_elements = search_results_soup.select('.result')
for result in result_elements:
# Extract title
title_element = result.select_one('.result__a')
title = title_element.text.strip() if title_element else 'No title'
# Extract URL (DuckDuckGo uses redirects)
url_element = title_element.get('href') if title_element else ''
extracted_url = 'No URL'
if url_element:
# Extract actual URL from DuckDuckGo's redirect
if url_element.startswith('/d.js?q='):
start = url_element.find('?q=') + 3
end = url_element.find('&', start) if '&' in url_element[start:] else None
extracted_url = unquote(url_element[start:end])
# Ensure URL has correct protocol prefix
if not extracted_url.startswith(('http://', 'https://')):
if not extracted_url.startswith('//'):
extracted_url = 'https://' + extracted_url
else:
extracted_url = 'https:' + extracted_url
else:
extracted_url = url_element
# Extract snippet directly from search results page
snippet_element = result.select_one('.result__snippet')
snippet = snippet_element.text.strip() if snippet_element else 'No description'
# Get actual page content for the data field
target_page_soup = self._read_url(extracted_url)
# Use new content extraction method to limit content size
content = self._extract_main_content(target_page_soup)
results.append({
'title': title,
'url': extracted_url,
'snippet': snippet,
'data': content
})
# Limit number of results if needed
if len(results) >= self.max_result:
break
return results

    def _read_url(self, url: str) -> BeautifulSoup:
        """
        Read a URL and return a BeautifulSoup parser for the content.

        Args:
            url: The URL to read

        Returns:
            BeautifulSoup object with the content, or an empty one on errors
        """
        headers = {
            'User-Agent': APP_CONFIG.get("Agent_Webcrawler_USER_AGENT", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"),
            'Accept': 'text/html,application/xhtml+xml,application/xml',
            'Accept-Language': 'en-US,en;q=0.9',
        }
        try:
            # Initial request
            response = requests.get(url, headers=headers, timeout=self.timeout)
            # Handling for status 202 (accepted but not yet processed):
            # retry up to four times with increasing backoff
            if response.status_code == 202:
                backoff_times = [0.5, 1.0, 2.0, 5.0]
                for wait_time in backoff_times:
                    time.sleep(wait_time)  # Wait with increasing time
                    response = requests.get(url, headers=headers, timeout=self.timeout)
                    # If the status is no longer 202, stop retrying
                    if response.status_code != 202:
                        break
            # Raise for other error status codes
            response.raise_for_status()
            # Parse the HTML
            return BeautifulSoup(response.text, 'html.parser')
        except Exception as e:
            logger.error(f"Error reading URL {url}: {str(e)}")
            # Return an empty BeautifulSoup object
            return BeautifulSoup("<html><body></body></html>", 'html.parser')

    def _extract_title(self, soup: BeautifulSoup, url: str) -> str:
        """
        Extract the title from a webpage.

        Args:
            soup: BeautifulSoup object of the webpage
            url: URL of the webpage

        Returns:
            Extracted title
        """
        if not isinstance(soup, BeautifulSoup):
            return f"Error with {url}"
        # Extract the title from the title tag
        title_tag = soup.find('title')
        title = title_tag.text.strip() if title_tag else "No title"
        # Alternative: fall back to an h1 tag if the title tag is missing
        if title == "No title":
            h1_tag = soup.find('h1')
            if h1_tag:
                title = h1_tag.text.strip()
        return title

    def _extract_main_content(self, soup: BeautifulSoup, max_chars: int = 10000) -> str:
        """
        Extract the main content from an HTML page.

        Args:
            soup: BeautifulSoup object of the webpage
            max_chars: Maximum number of characters

        Returns:
            Extracted main content as a string
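
        Example (doctest-style illustration, assuming default configuration):
            >>> html = "<html><body><main><p>Hello</p></main><footer>x</footer></body></html>"
            >>> AgentWebcrawler()._extract_main_content(BeautifulSoup(html, "html.parser"))
            'Hello'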
"""
if not isinstance(soup, BeautifulSoup):
return str(soup)[:max_chars] if soup else ""
# Try to find main content elements in priority order
main_content = None
for selector in ['main', 'article', '#content', '.content', '#main', '.main']:
content = soup.select_one(selector)
if content:
main_content = content
break
# If no main content found, use the body
if not main_content:
main_content = soup.find('body') or soup
# Remove script, style, nav, footer elements that don't contribute to main content
for element in main_content.select('script, style, nav, footer, header, aside, .sidebar, #sidebar, .comments, #comments, .advertisement, .ads, iframe'):
element.extract()
# Extract text content
text_content = main_content.get_text(separator=' ', strip=True)
# Limit to max_chars
return text_content[:max_chars]

    def _parse_result(self, soup: BeautifulSoup, title: str, url: str) -> Dict[str, str]:
        """
        Parse a BeautifulSoup object into a result dictionary.

        Args:
            soup: BeautifulSoup object of the webpage
            title: Page title
            url: Page URL

        Returns:
            Dictionary with result data
        """
        # Extract the content
        content = self._extract_main_content(soup)
        result = {
            'title': title,
            'url': url,
            'snippet': 'No description',  # Default value
            'data': content
        }
        return result

    def _limit_text(self, text: str, max_chars: int = 10000) -> str:
        """
        Limit text to a maximum number of characters.

        Args:
            text: Input text
            max_chars: Maximum number of characters

        Returns:
            Limited text
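
        Example (doctest-style illustration, assuming default configuration):
            >>> AgentWebcrawler()._limit_text("abcdef", max_chars=3)
            'abc... [Content truncated due to length]'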
"""
if not text:
return ""
# If text is already under the limit, return unchanged
if len(text) <= max_chars:
return text
# Otherwise limit text to max_chars
return text[:max_chars] + "... [Content truncated due to length]"


# Factory function for the Webcrawler agent
def get_webcrawler_agent():
    """
    Factory function that returns an instance of the Webcrawler agent.

    Returns:
        An instance of the Webcrawler agent
    """
    return AgentWebcrawler()
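

# Minimal usage sketch (illustrative only; MockAIService and the task values
# below are hypothetical stand-ins for the gateway's real AI service, which
# is assumed to expose an async call_api(messages) coroutine as used above).
if __name__ == "__main__":
    import asyncio

    class MockAIService:
        async def call_api(self, messages):
            # Always approve web research; a real service would vary by prompt
            return "YES"

    async def demo():
        agent = get_webcrawler_agent()
        agent.set_dependencies(ai_service=MockAIService())
        result = await agent.process_task({
            "task_id": "demo-1",
            "prompt": "Search the web for background on https://example.com",
            "input_documents": [],
            "output_specifications": [{"label": "results.md", "description": "Markdown summary"}],
            "context": {},
        })
        print(result["feedback"])
        for doc in result["documents"]:
            print(doc["label"], len(doc["content"]), "characters")

    asyncio.run(demo())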