gateway/modules/chat_agent_webcrawler.py

"""
Webcrawler agent for research and retrieval of information from the web.
Reimagined with an output-first, AI-driven approach.
"""

import logging
import json
import re
import time
from typing import Dict, Any, List
from urllib.parse import quote_plus, unquote

from bs4 import BeautifulSoup
import requests
import markdown

from modules.chat_registry import AgentBase
from modules.configuration import APP_CONFIG

logger = logging.getLogger(__name__)

class AgentWebcrawler(AgentBase):
    """AI-driven agent for web research and information retrieval"""

    def __init__(self):
        """Initialize the webcrawler agent"""
        super().__init__()
        self.name = "webcrawler"
        self.description = "Conducts web research and collects information from online sources"
        self.capabilities = [
            "web_search",
            "information_retrieval",
            "data_collection",
            "search_results_analysis",
            "webpage_content_extraction"
        ]

        # Web crawling configuration
        self.max_url = int(APP_CONFIG.get("Agent_Webcrawler_MAX_URLS", "5"))
        self.max_search_terms = int(APP_CONFIG.get("Agent_Webcrawler_MAX_SEARCH_KEYWORDS", "3"))
        self.max_results = int(APP_CONFIG.get("Agent_Webcrawler_MAX_SEARCH_RESULTS", "5"))
        self.timeout = int(APP_CONFIG.get("Agent_Webcrawler_TIMEOUT", "30"))
        self.search_engine = APP_CONFIG.get("Agent_Webcrawler_SEARCH_ENGINE", "https://html.duckduckgo.com/html/?q=")
        self.user_agent = APP_CONFIG.get("Agent_Webcrawler_USER_AGENT", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36")

    def set_dependencies(self, ai_service=None):
        """Set external dependencies for the agent."""
        self.ai_service = ai_service

    async def process_task(self, task: Dict[str, Any]) -> Dict[str, Any]:
        """
        Process a task by focusing on required outputs and using AI to guide the research process.

        Args:
            task: Task dictionary with prompt, input_documents, output_specifications

        Returns:
            Dictionary with feedback and documents
        """
        try:
            # Extract task information
            prompt = task.get("prompt", "")
            output_specs = task.get("output_specifications", [])

            # Check AI service
            if not self.ai_service:
                return {
                    "feedback": "The Webcrawler agent requires an AI service to function effectively.",
                    "documents": []
                }

            # Create research plan
            research_plan = await self._create_research_plan(prompt)

            # Check if this is truly a web research task
            if not research_plan.get("requires_web_research", True):
                return {
                    "feedback": "This task doesn't appear to require web research. Please try a different agent.",
                    "documents": []
                }

            # Gather raw material through web research
            raw_results = await self._gather_research_material(research_plan)

            # Format results into requested output documents
            documents = await self._create_output_documents(
                prompt,
                raw_results,
                output_specs,
                research_plan
            )

            # Generate feedback
            feedback = research_plan.get("feedback", f"I conducted web research on '{prompt[:50]}...' and gathered information from {len(raw_results)} relevant sources.")

            return {
                "feedback": feedback,
                "documents": documents
            }

        except Exception as e:
            logger.error(f"Error during web research: {str(e)}", exc_info=True)
            return {
                "feedback": f"Error during web research: {str(e)}",
                "documents": []
            }

    async def _create_research_plan(self, prompt: str) -> Dict[str, Any]:
        """
        Use AI to create a detailed research plan.

        Args:
            prompt: The research query

        Returns:
            Research plan dictionary
        """
        research_prompt = f"""
        Create a detailed web research plan for this task: "{prompt}"

        Analyze the request carefully and create a structured plan in JSON format with the following elements:
        {{
            "requires_web_research": true/false,  # Whether this genuinely requires web research
            "research_questions": ["question1", "question2", ...],  # 2-4 specific questions to answer
            "search_terms": ["term1", "term2", ...],  # Up to {self.max_search_terms} effective search terms
            "direct_urls": ["url1", "url2", ...],  # Any URLs directly mentioned in the request (up to {self.max_url})
            "expected_sources": ["type1", "type2", ...],  # Types of sources that would be most valuable
            "content_focus": "what specific content to extract or focus on",
            "feedback": "explanation of how the research will be conducted"
        }}

        Respond with ONLY the JSON object, no additional text or explanations.
        """

        try:
            # Get research plan from AI
            response = await self.ai_service.call_api([
                {"role": "system", "content": "You are a web research planning expert. Create precise research plans in JSON format only."},
                {"role": "user", "content": research_prompt}
            ])

            # Extract JSON
            json_start = response.find('{')
            json_end = response.rfind('}') + 1

            if json_start >= 0 and json_end > json_start:
                plan = json.loads(response[json_start:json_end])

                # Ensure we have the expected fields with defaults if missing
                if "search_terms" not in plan:
                    plan["search_terms"] = [prompt]
                if "direct_urls" not in plan:
                    plan["direct_urls"] = []
                if "research_questions" not in plan:
                    plan["research_questions"] = ["What information can be found about this topic?"]

                return plan
            else:
                # Fallback plan
                return {
                    "requires_web_research": True,
                    "research_questions": ["What information can be found about this topic?"],
                    "search_terms": [prompt],
                    "direct_urls": [],
                    "expected_sources": ["Web pages", "Articles"],
                    "content_focus": "Relevant information about the topic",
                    "feedback": f"I'll conduct web research on '{prompt}' and gather relevant information."
                }

        except Exception as e:
            logger.warning(f"Error creating research plan: {str(e)}")
            # Simple fallback plan
            return {
                "requires_web_research": True,
                "research_questions": ["What information can be found about this topic?"],
                "search_terms": [prompt],
                "direct_urls": [],
                "expected_sources": ["Web pages", "Articles"],
                "content_focus": "Relevant information about the topic",
                "feedback": f"I'll conduct web research on '{prompt}' and gather relevant information."
            }

    async def _gather_research_material(self, research_plan: Dict[str, Any]) -> List[Dict[str, Any]]:
        """
        Gather research material based on the research plan.

        Args:
            research_plan: Research plan dictionary

        Returns:
            List of research results
        """
        all_results = []

        # Process direct URLs
        direct_urls = research_plan.get("direct_urls", [])[:self.max_url]
        for url in direct_urls:
            logger.info(f"Processing direct URL: {url}")
            try:
                # Fetch and extract content
                soup = self._read_url(url)

                if soup:
                    # Extract title and content
                    title = self._extract_title(soup, url)
                    content = self._extract_main_content(soup)

                    # Add to results
                    all_results.append({
                        "title": title,
                        "url": url,
                        "source_type": "direct_url",
                        "content": content,
                        "summary": ""  # Will be filled later
                    })
            except Exception as e:
                logger.warning(f"Error processing URL {url}: {str(e)}")

        # Process search terms
        search_terms = research_plan.get("search_terms", [])[:self.max_search_terms]
        for term in search_terms:
            logger.info(f"Searching for: {term}")
            try:
                # Perform search
                search_results = self._search_web(term)

                # Process each search result
                for result in search_results:
                    # Check if URL is already in results
                    if not any(r["url"] == result["url"] for r in all_results):
                        all_results.append({
                            "title": result["title"],
                            "url": result["url"],
                            "source_type": "search_result",
                            "content": result["data"],
                            "snippet": result["snippet"],
                            "summary": ""  # Will be filled later
                        })

                        # Stop if we've reached the maximum results
                        if len(all_results) >= self.max_results:
                            break
            except Exception as e:
                logger.warning(f"Error searching for {term}: {str(e)}")

            # Stop if we've reached the maximum results
            if len(all_results) >= self.max_results:
                break

        # Create summaries in parallel for all results
        all_results = await self._summarize_all_results(all_results, research_plan)

        return all_results

    async def _summarize_all_results(self, results: List[Dict[str, Any]], research_plan: Dict[str, Any]) -> List[Dict[str, Any]]:
        """
        Create summaries for all research results.

        Args:
            results: List of research results
            research_plan: Research plan with questions and focus

        Returns:
            Results with added summaries
        """
        for i, result in enumerate(results):
            logger.info(f"Summarizing result {i+1}/{len(results)}: {result['title'][:30]}...")

            try:
                # Limit content length to avoid token issues
                content = self._limit_text(result.get("content", ""), max_chars=8000)
                research_questions = research_plan.get("research_questions", ["What relevant information does this page contain?"])
                content_focus = research_plan.get("content_focus", "Relevant information")

                # Create summary using AI
                summary_prompt = f"""
                Summarize this web page content based on these research questions:
                {', '.join(research_questions)}

                Focus on: {content_focus}

                Web page: {result['url']}
                Title: {result['title']}

                Content:
                {content}

                Create a concise summary that:
                1. Directly answers the research questions if possible
                2. Extracts the most relevant information from the page
                3. Includes specific facts, figures, or quotes if available
                4. Is around 2000 characters long

                Only include information actually found in the content. No fabrications or assumptions.
                """

                if self.ai_service:
                    summary = await self.ai_service.call_api([
                        {"role": "system", "content": "You summarize web content accurately and concisely, focusing only on what is actually in the content."},
                        {"role": "user", "content": summary_prompt}
                    ])

                    # Store the summary
                    result["summary"] = summary
                else:
                    # Fallback if no AI service
                    result["summary"] = f"Content from {result['url']} ({len(content)} characters)"

            except Exception as e:
                logger.warning(f"Error summarizing result {i+1}: {str(e)}")
                result["summary"] = f"Error creating summary: {str(e)}"

        return results

    async def _create_output_documents(self, prompt: str, results: List[Dict[str, Any]],
                                  output_specs: List[Dict[str, Any]], research_plan: Dict[str, Any]) -> List[Dict[str, Any]]:
        """
        Create output documents based on research results and specifications.

        Args:
            prompt: Original research prompt
            results: List of research results
            output_specs: Output specifications
            research_plan: Research plan

        Returns:
            List of output documents
        """
        # If no output specs provided, create default output
        if not output_specs:
            output_specs = [{
                "label": "web_research_results.md",
                "description": "Comprehensive web research results"
            }]

        # Generate documents
        documents = []

        # Process each output specification
        for spec in output_specs:
            output_label = spec.get("label", "")
            output_description = spec.get("description", "")

            # Determine format based on file extension
            format_type = self._determine_format_type(output_label)

            # Create appropriate document based on format
            if format_type == "json":
                # JSON output - structured data
                document = await self._create_json_document(prompt, results, research_plan, output_label)
            elif format_type == "csv":
                # CSV output - tabular data
                document = await self._create_csv_document(results, output_label)
            else:
                # Text-based output (markdown, html, text) - narrative report
                document = await self._create_narrative_document(
                    prompt, results, research_plan, format_type, output_label, output_description
                )

            documents.append(document)

        return documents

    async def _create_narrative_document(self, prompt: str, results: List[Dict[str, Any]],
                                    research_plan: Dict[str, Any], format_type: str,
                                    output_label: str, output_description: str) -> Dict[str, Any]:
        """
        Create a narrative document (markdown, html, text) from research results.

        Args:
            prompt: Original research prompt
            results: Research results
            research_plan: Research plan
            format_type: Output format (markdown, html, text)
            output_label: Output filename
            output_description: Output description

        Returns:
            Document object
        """
        # Create content based on format
        if format_type == "markdown":
            content_type = "text/markdown"
            template_format = "markdown"
        elif format_type == "html":
            content_type = "text/html"
            template_format = "html"
        else:
            content_type = "text/plain"
            template_format = "text"

        # Prepare research context
        research_questions = research_plan.get("research_questions", [])
        search_terms = research_plan.get("search_terms", [])

        # Create document structure based on results
        sources_summary = []
        for result in results:
            sources_summary.append({
                "title": result.get("title", "Untitled"),
                "url": result.get("url", ""),
                "summary": result.get("summary", ""),
                "snippet": result.get("snippet", "")
            })

        # Truncate content for prompt
        sources_json = json.dumps(sources_summary, indent=2)
        if len(sources_json) > 10000:
            # Logic to truncate each summary while preserving structure
            for i in range(len(sources_summary)):
                if len(sources_json) <= 10000:
                    break
                # Gradually truncate summaries
                sources_summary[i]["summary"] = sources_summary[i]["summary"][:500] + "..."
            sources_json = json.dumps(sources_summary, indent=2)

        # Create report prompt
        report_prompt = f"""
        Create a comprehensive {format_type} research report based on the following web research:

        TASK: {prompt}

        RESEARCH QUESTIONS:
        {', '.join(research_questions)}

        SEARCH TERMS USED:
        {', '.join(search_terms)}

        SOURCES AND FINDINGS:
        {sources_json}

        REPORT DETAILS:
        - Format: {template_format}
        - Filename: {output_label}
        - Description: {output_description}

        Create a well-structured report that:
        1. Includes an executive summary of key findings
        2. Addresses each research question directly
        3. Integrates information from all relevant sources
        4. Cites sources appropriately for each piece of information
        5. Provides a comprehensive synthesis of the research
        6. Is formatted professionally and appropriately for {template_format}

        The report should be scholarly, accurate, and focused on the original research task.
        """

        try:
            # Generate report with AI
            report_content = await self.ai_service.call_api([
                {"role": "system", "content": f"You create professional research reports in {template_format} format."},
                {"role": "user", "content": report_prompt}
            ])

            # Convert to HTML if needed
            if format_type == "html" and not report_content.lower().startswith("<html"):
                # Check if it's markdown that needs conversion
                if report_content.startswith("#"):
                    report_content = markdown.markdown(report_content)
                # Wrap in basic HTML structure if needed
                if not report_content.lower().startswith("<html"):
                    report_content = f"<html><head><title>Web Research Results</title></head><body>{report_content}</body></html>"

            return {
                "label": output_label,
                "content": report_content,
                "metadata": {
                    "content_type": content_type
                }
            }

        except Exception as e:
            logger.error(f"Error creating narrative document: {str(e)}")
            # Create error document
            if format_type == "markdown":
                content = f"# Web Research Error\n\nAn error occurred: {str(e)}"
            elif format_type == "html":
                content = f"<html><body><h1>Web Research Error</h1><p>An error occurred: {str(e)}</p></body></html>"
            else:
                content = f"WEB RESEARCH ERROR\n\nAn error occurred: {str(e)}"

            return {
                "label": output_label,
                "content": content,
                "metadata": {
                    "content_type": content_type
                }
            }

    async def _create_json_document(self, prompt: str, results: List[Dict[str, Any]],
                               research_plan: Dict[str, Any], output_label: str) -> Dict[str, Any]:
        """
        Create a JSON document from research results.

        Args:
            prompt: Original research prompt
            results: Research results
            research_plan: Research plan
            output_label: Output filename

        Returns:
            Document object
        """
        try:
            # Create structured data
            sources_data = []
            for result in results:
                sources_data.append({
                    "title": result.get("title", "Untitled"),
                    "url": result.get("url", ""),
                    "summary": result.get("summary", ""),
                    "snippet": result.get("snippet", ""),
                    "source_type": result.get("source_type", "")
                })

            # Create metadata
            metadata = {
                "query": prompt,
                "timestamp": time.strftime("%Y-%m-%d %H:%M:%S"),
                "research_questions": research_plan.get("research_questions", []),
                "search_terms": research_plan.get("search_terms", [])
            }

            # Compile complete report object
            json_content = {
                "metadata": metadata,
                "summary": research_plan.get("feedback", "Web research results"),
                "sources": sources_data
            }

            # Convert to JSON string
            content = json.dumps(json_content, indent=2)

            return {
                "label": output_label,
                "content": content,
                "metadata": {
                    "content_type": "application/json"
                }
            }

        except Exception as e:
            logger.error(f"Error creating JSON document: {str(e)}")
            return {
                "label": output_label,
                "content": json.dumps({"error": str(e)}),
                "metadata": {
                    "content_type": "application/json"
                }
            }

    async def _create_csv_document(self, results: List[Dict[str, Any]], output_label: str) -> Dict[str, Any]:
        """
        Create a CSV document from research results.

        Args:
            results: Research results
            output_label: Output filename

        Returns:
            Document object
        """
        try:
            # Create CSV header
            csv_lines = ["Title,URL,Source Type,Snippet"]

            # Add results
            for result in results:
                # Escape CSV fields
                title = result.get("title", "").replace('"', '""')
                url = result.get("url", "").replace('"', '""')
                source_type = result.get("source_type", "").replace('"', '""')
                snippet = result.get("snippet", "").replace('"', '""')

                csv_lines.append(f'"{title}","{url}","{source_type}","{snippet}"')

            # Combine into CSV content
            content = "\n".join(csv_lines)

            return {
                "label": output_label,
                "content": content,
                "metadata": {
                    "content_type": "text/csv"
                }
            }

        except Exception as e:
            logger.error(f"Error creating CSV document: {str(e)}")
            return {
                "label": output_label,
                "content": "Error,Error\nFailed to create CSV,{0}".format(str(e)),
                "metadata": {
                    "content_type": "text/csv"
                }
            }

    def _determine_format_type(self, output_label: str) -> str:
        """
        Determine the format type based on the filename.

        Args:
            output_label: Output filename

        Returns:
            Format type (markdown, html, text, json, csv)
        """
        output_label_lower = output_label.lower()

        if output_label_lower.endswith(".md"):
            return "markdown"
        elif output_label_lower.endswith(".html"):
            return "html"
        elif output_label_lower.endswith(".txt"):
            return "text"
        elif output_label_lower.endswith(".json"):
            return "json"
        elif output_label_lower.endswith(".csv"):
            return "csv"
        else:
            # Default to markdown
            return "markdown"

    def _search_web(self, query: str) -> List[Dict[str, str]]:
        """
        Conduct a web search and return the results.

        Args:
            query: The search query

        Returns:
            List of search results
        """
        formatted_query = quote_plus(query)
        url = f"{self.search_engine}{formatted_query}"

        search_results_soup = self._read_url(url)
        if not search_results_soup or not search_results_soup.select('.result'):
            logger.warning(f"No search results found for: {query}")
            return []

        # Extract search results
        results = []

        # Find all result containers
        result_elements = search_results_soup.select('.result')

        for result in result_elements:
            # Extract title
            title_element = result.select_one('.result__a')
            title = title_element.text.strip() if title_element else 'No title'

            # Extract URL (DuckDuckGo uses redirects)
            url_element = title_element.get('href') if title_element else ''
            extracted_url = 'No URL'

            if url_element:
                # Extract actual URL from DuckDuckGo's redirect
                if url_element.startswith('/d.js?q='):
                    start = url_element.find('?q=') + 3
                    end = url_element.find('&', start) if '&' in url_element[start:] else None
                    extracted_url = unquote(url_element[start:end])

                    # Ensure URL has correct protocol prefix
                    if not extracted_url.startswith(('http://', 'https://')):
                        if not extracted_url.startswith('//'):
                            extracted_url = 'https://' + extracted_url
                        else:
                            extracted_url = 'https:' + extracted_url
                else:
                    extracted_url = url_element

            # Extract snippet directly from search results page
            snippet_element = result.select_one('.result__snippet')
            snippet = snippet_element.text.strip() if snippet_element else 'No description'

            # Get actual page content
            try:
                target_page_soup = self._read_url(extracted_url)
                content = self._extract_main_content(target_page_soup)
            except Exception as e:
                logger.warning(f"Error extracting content from {extracted_url}: {str(e)}")
                content = f"Error extracting content: {str(e)}"

            results.append({
                'title': title,
                'url': extracted_url,
                'snippet': snippet,
                'data': content
            })

            # Limit number of results
            if len(results) >= self.max_results:
                break

        return results

    def _read_url(self, url: str) -> BeautifulSoup:
        """
        Read a URL and return a BeautifulSoup parser for the content.

        Args:
            url: The URL to read

        Returns:
            BeautifulSoup object with the content or None on errors
        """
        if not url or not url.startswith(('http://', 'https://')):
            return None

        headers = {
            'User-Agent': self.user_agent,
            'Accept': 'text/html,application/xhtml+xml,application/xml',
            'Accept-Language': 'en-US,en;q=0.9',
        }

        try:
            # Initial request
            response = requests.get(url, headers=headers, timeout=self.timeout)

            # Handling for status 202
            if response.status_code == 202:
                # Retry with backoff
                backoff_times = [0.5, 1.0, 2.0, 5.0]

                for wait_time in backoff_times:
                    time.sleep(wait_time)
                    response = requests.get(url, headers=headers, timeout=self.timeout)

                    if response.status_code != 202:
                        break

            # Raise for error status codes
            response.raise_for_status()

            # Parse HTML
            return BeautifulSoup(response.text, 'html.parser')

        except Exception as e:
            logger.error(f"Error reading URL {url}: {str(e)}")
            return None

    def _extract_title(self, soup: BeautifulSoup, url: str) -> str:
        """
        Extract the title from a webpage.

        Args:
            soup: BeautifulSoup object of the webpage
            url: URL of the webpage

        Returns:
            Extracted title
        """
        if not soup:
            return f"Error with {url}"

        # Extract title from title tag
        title_tag = soup.find('title')
        title = title_tag.text.strip() if title_tag else "No title"

        # Alternative: Also look for h1 tags if title tag is missing
        if title == "No title":
            h1_tag = soup.find('h1')
            if h1_tag:
                title = h1_tag.text.strip()

        return title

    def _extract_main_content(self, soup: BeautifulSoup, max_chars: int = 10000) -> str:
        """
        Extract the main content from an HTML page.

        Args:
            soup: BeautifulSoup object of the webpage
            max_chars: Maximum number of characters

        Returns:
            Extracted main content as a string
        """
        if not soup:
            return ""

        # Try to find main content elements in priority order
        main_content = None
        for selector in ['main', 'article', '#content', '.content', '#main', '.main']:
            content = soup.select_one(selector)
            if content:
                main_content = content
                break

        # If no main content found, use the body
        if not main_content:
            main_content = soup.find('body') or soup

        # Remove script, style, nav, footer elements that don't contribute to main content
        for element in main_content.select('script, style, nav, footer, header, aside, .sidebar, #sidebar, .comments, #comments, .advertisement, .ads, iframe'):
            element.extract()

        # Extract text content
        text_content = main_content.get_text(separator=' ', strip=True)

        # Limit to max_chars
        return text_content[:max_chars]

    def _limit_text(self, text: str, max_chars: int = 10000) -> str:
        """
        Limit text to a maximum number of characters.

        Args:
            text: Input text
            max_chars: Maximum number of characters

        Returns:
            Limited text
        """
        if not text:
            return ""

        # If text is already under the limit, return unchanged
        if len(text) <= max_chars:
            return text

        # Otherwise limit text to max_chars
        return text[:max_chars] + "... [Content truncated due to length]"


# Factory function for the Webcrawler agent
def get_webcrawler_agent():
    """Returns an instance of the Webcrawler agent."""
    return AgentWebcrawler()