# gateway/modules/chat_agent_analyst.py

"""
Data analyst agent for analysis and interpretation of data.
Focuses on output-first design with AI-powered analysis.
"""

import logging
import json
import io
import base64
from typing import Dict, Any, List
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from modules.chat_registry import AgentBase

# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)

class AgentAnalyst(AgentBase):
    """AI-driven agent for data analysis and visualization"""

    def __init__(self):
        """
        Initialize the data analysis agent.

        Sets the descriptive attributes (name, description, capabilities),
        makes sure an ``ai_service`` attribute exists, and applies the default
        matplotlib style used for generated charts.
        """
        super().__init__()
        self.name = "analyst"
        self.description = "Analyzes data using AI-powered insights and visualizations, produce diagrams and visualizations"
        self.capabilities = [
            "data_analysis",
            "statistics",
            "visualization",
            "data_interpretation",
            "report_generation"
        ]

        # process_task() reads self.ai_service, which is otherwise only created
        # by set_dependencies(). Guarantee the attribute exists, but do not
        # overwrite a value the base class may already have provided.
        if not hasattr(self, "ai_service"):
            self.ai_service = None

        # Set default visualization settings. The seaborn style sheets were
        # renamed to 'seaborn-v0_8-*' in matplotlib 3.6, so fall back to the
        # older name instead of failing agent construction.
        for style_name in ('seaborn-v0_8-whitegrid', 'seaborn-whitegrid'):
            try:
                plt.style.use(style_name)
                break
            except OSError:
                continue
        else:
            logger.warning("No seaborn whitegrid style available; using matplotlib defaults")

    def set_dependencies(self, ai_service=None):
        """Set external dependencies for the agent."""
        self.ai_service = ai_service

    async def process_task(self, task: Dict[str, Any]) -> Dict[str, Any]:
        """
        Process a task by focusing on required outputs and using AI to generate them.

        Args:
            task: Task dictionary with prompt, input_documents, output_specifications

        Returns:
            Dictionary with feedback and documents
        """
        try:
            # Extract task information
            prompt = task.get("prompt", "")
            input_documents = task.get("input_documents", [])
            output_specs = task.get("output_specifications", [])

            # Check AI service
            if not self.ai_service:
                return {
                    "feedback": "The Analyst agent requires an AI service to function.",
                    "documents": []
                }

            # Extract data from documents - focusing only on data_extracted
            datasets, document_context = self._extract_data(input_documents)

            # Generate task analysis to understand what's needed
            analysis_plan = await self._analyze_task(prompt, document_context, datasets, output_specs)

            # Generate all required output documents
            documents = []

            # If no output specs provided, create default analysis outputs
            if not output_specs:
                output_specs = []

            # Process each output specification
            for spec in output_specs:
                output_label = spec.get("label", "")
                output_description = spec.get("description", "")

                # Determine type based on file extension
                output_type = output_label.split('.')[-1].lower() if '.' in output_label else "txt"

                # Generate appropriate content based on output type
                if output_type in ['png', 'jpg', 'jpeg', 'svg']:
                    # Create visualization
                    document = await self._create_visualization(
                        datasets, prompt, output_label, analysis_plan, output_description
                    )
                    documents.append(document)
                elif output_type in ['csv', 'json', 'xlsx']:
                    # Create data document
                    document = await self._create_data_document(
                        datasets, prompt, output_label, analysis_plan, output_description
                    )
                    documents.append(document)
                else:
                    # Create text document (report, analysis, etc.)
                    document = await self._create_text_document(
                        datasets, document_context, prompt, output_label,
                        output_type, analysis_plan, output_description
                    )
                    documents.append(document)

            # Generate feedback
            feedback = f"Analysis complete. Created {len(documents)} documents based on your requirements."
            if analysis_plan.get("key_insights"):
                feedback += f"\n\nKey insights: {analysis_plan.get('key_insights')}"

            return {
                "feedback": feedback,
                "documents": documents
            }

        except Exception as e:
            logger.error(f"Error in analysis: {str(e)}", exc_info=True)
            return {
                "feedback": f"Error during analysis: {str(e)}",
                "documents": []
            }

    def _extract_data(self, documents: List[Dict[str, Any]]) -> tuple:
        """
        Extract data from documents, focusing on data_extracted fields.

        Args:
            documents: List of input documents

        Returns:
            Tuple of (datasets dictionary, document context text)
        """
        datasets = {}
        document_context = ""

        # Process each document
        for doc in documents:
            doc_name = doc.get("name", "unnamed")
            if doc.get("ext"):
                doc_name = f"{doc_name}.{doc.get('ext')}"

            document_context += f"\n\n--- {doc_name} ---\n"

            # Process contents
            for content in doc.get("contents", []):
                # Focus only on data_extracted
                if content.get("data_extracted"):
                    extracted_text = content.get("data_extracted", "")
                    document_context += extracted_text

                    # Try to parse as structured data if appropriate
                    if doc_name.lower().endswith(('.csv', '.tsv')):
                        try:
                            df = pd.read_csv(io.StringIO(extracted_text))
                            datasets[doc_name] = df
                        except:
                            pass
                    elif doc_name.lower().endswith('.json'):
                        try:
                            json_data = json.loads(extracted_text)
                            if isinstance(json_data, list):
                                df = pd.DataFrame(json_data)
                                datasets[doc_name] = df
                            elif isinstance(json_data, dict):
                                # Handle nested JSON structures
                                if any(isinstance(v, list) for v in json_data.values()):
                                    for key, value in json_data.items():
                                        if isinstance(value, list) and len(value) > 0:
                                            df = pd.DataFrame(value)
                                            datasets[f"{doc_name}:{key}"] = df
                                else:
                                    df = pd.DataFrame([json_data])
                                    datasets[doc_name] = df
                        except:
                            pass

                    # Try to detect tabular data in text content
                    if doc_name not in datasets and len(extracted_text.splitlines()) > 2:
                        lines = extracted_text.splitlines()
                        if any(',' in line for line in lines[:5]):
                            try:
                                df = pd.read_csv(io.StringIO(extracted_text))
                                if len(df.columns) > 1:
                                    datasets[doc_name] = df
                            except:
                                pass
                        elif any('\t' in line for line in lines[:5]):
                            try:
                                df = pd.read_csv(io.StringIO(extracted_text), sep='\t')
                                if len(df.columns) > 1:
                                    datasets[doc_name] = df
                            except:
                                pass

        return datasets, document_context

    async def _analyze_task(self, prompt: str, context: str, datasets: Dict, output_specs: List) -> Dict:
        """
        Use AI to analyze the task and create a plan for analysis.

        Args:
            prompt: The task prompt
            context: Document context text
            datasets: Dictionary of extracted datasets
            output_specs: Output specifications

        Returns:
            Analysis plan dictionary
        """
        # Prepare dataset information
        dataset_info = {}
        for name, df in datasets.items():
            try:
                dataset_info[name] = {
                    "shape": df.shape,
                    "columns": df.columns.tolist(),
                    "dtypes": {col: str(df[col].dtype) for col in df.columns},
                    "sample": df.head(3).to_dict(orient='records')
                }
            except:
                dataset_info[name] = {"error": "Could not process dataset"}

        analysis_prompt = f"""
        Analyze this data analysis task and create a plan.

        TASK: {prompt}

        AVAILABLE DATA:
        {json.dumps(dataset_info, indent=2)}

        DOCUMENT CONTEXT:
        {context[:1000]}... (truncated)

        OUTPUT REQUIREMENTS:
        {json.dumps(output_specs, indent=2)}

        Create a detailed analysis plan in JSON format with the following structure:
        {{
            "analysis_type": "statistical|trend|comparative|predictive|cluster|general",
            "key_questions": ["question1", "question2"],
            "recommended_visualizations": [{{
                "type": "chart_type",
                "data_source": "dataset_name",
                "variables": ["col1", "col2"],
                "purpose": "explanation"
            }}],
            "key_insights": "brief summary of initial insights",
            "analysis_approach": "brief description of recommended approach"
        }}

        Only return valid JSON. No preamble or explanations.
        """

        try:
            response = await self.ai_service.call_api([
                {"role": "system", "content": "You are a data analysis expert. Respond with valid JSON only."},
                {"role": "user", "content": analysis_prompt}
            ])

            # Extract JSON from response
            json_start = response.find('{')
            json_end = response.rfind('}') + 1

            if json_start >= 0 and json_end > json_start:
                plan = json.loads(response[json_start:json_end])
                return plan
            else:
                # Fallback if JSON not found
                return {
                    "analysis_type": "general",
                    "key_questions": ["What insights can be extracted from this data?"],
                    "recommended_visualizations": [],
                    "key_insights": "Analysis plan could not be created",
                    "analysis_approach": "General exploratory analysis"
                }

        except Exception as e:
            logger.warning(f"Error creating analysis plan: {str(e)}")
            return {
                "analysis_type": "general",
                "key_questions": ["What insights can be extracted from this data?"],
                "recommended_visualizations": [],
                "key_insights": "Analysis plan could not be created",
                "analysis_approach": "General exploratory analysis"
            }

    async def _create_visualization(self, datasets: Dict, prompt: str, output_label: str,
                               analysis_plan: Dict, description: str) -> Dict:
        """
        Create visualization document using AI guidance.

        The AI service is asked for matplotlib/seaborn code, which is executed
        with the datasets bound to variables. The current figure is then
        rendered and returned base64-encoded. If there is no data, or the
        generated code fails, an image containing the message is returned
        instead.

        Args:
            datasets: Dictionary of datasets
            prompt: Original task prompt
            output_label: Output filename
            analysis_plan: Analysis plan from AI
            description: Output description

        Returns:
            Visualization document with keys ``label``, ``content`` (base64
            image data) and ``metadata`` (holding the ``content_type``)
        """
        import numpy as np  # local import: numpy is not imported at module level

        # Determine format from filename
        format_type = output_label.split('.')[-1].lower()
        if format_type not in ['png', 'jpg', 'jpeg', 'svg']:
            format_type = 'png'

        # Registered MIME types; "image/jpg" and "image/svg" are not valid ones
        content_type = {
            'png': "image/png",
            'jpg': "image/jpeg",
            'jpeg': "image/jpeg",
            'svg': "image/svg+xml"
        }[format_type]

        def build_document(img_data):
            """Wrap encoded image data in the document structure."""
            return {
                "label": output_label,
                "content": img_data,
                "metadata": {
                    "content_type": content_type
                }
            }

        def render_message(message, fontsize):
            """Render a centered text message as an image and return it encoded."""
            figure = plt.figure(figsize=(10, 6))
            try:
                plt.text(0.5, 0.5, message, ha='center', va='center', fontsize=fontsize)
                plt.tight_layout()
                return self._get_image_base64(format_type)
            finally:
                plt.close(figure)

        # If no datasets available, create error message image
        if not datasets:
            return build_document(render_message("No data available for visualization", 14))

        # Bind datasets to variables for the generated code. Everything lives
        # in ONE namespace dict: with separate globals/locals, functions
        # defined by the generated code could not see these variables.
        namespace = {"plt": plt, "sns": sns, "pd": pd, "np": np}
        dataset_vars = {}
        for name, df in datasets.items():
            # Create a sanitized variable name
            var_name = ''.join(c if c.isalnum() else '_' for c in name)
            if not var_name.isidentifier():
                # e.g. names starting with a digit would be unreachable from code
                var_name = f"data_{var_name}"
            namespace[var_name] = df
            dataset_vars[name] = var_name

            # Also add with standard names for simpler code
            if "df" not in namespace:
                namespace["df"] = df
            elif "df2" not in namespace:
                namespace["df2"] = df

        # Get recommended visualization from plan; the plan is AI output, so
        # drop anything that does not have the expected shape
        recommended_viz = analysis_plan.get("recommended_visualizations", [])
        if not isinstance(recommended_viz, list):
            recommended_viz = []
        recommended_viz = [viz for viz in recommended_viz if isinstance(viz, dict)]

        # Prepare dataset info for the first dataset if none specified
        if not recommended_viz:
            name, df = next(iter(datasets.items()))
            recommended_viz = [{
                "type": "auto",
                "data_source": name,
                "variables": df.columns.tolist()[:5],
                "purpose": "general analysis"
            }]

        # Describe the recommended sources; when none of them names a known
        # dataset, describe all datasets so the prompt never lacks data info
        described_sources = []
        for viz in recommended_viz:
            data_source = viz.get("data_source")
            if isinstance(data_source, str) and data_source in datasets \
                    and data_source not in described_sources:
                described_sources.append(data_source)
        if not described_sources:
            described_sources = list(datasets)

        # Create visualization code prompt
        viz_prompt = f"""
        Generate Python matplotlib/seaborn code to create a visualization for:

        TASK: {prompt}

        VISUALIZATION REQUIREMENTS:
        - Output format: {format_type}
        - Filename: {output_label}
        - Description: {description}

        RECOMMENDED VISUALIZATION:
        {json.dumps(recommended_viz, indent=2, default=str)}

        AVAILABLE DATASETS:
        """

        for data_source in described_sources:
            df = datasets[data_source]
            viz_prompt += f"\nDataset '{data_source}':\n"
            viz_prompt += f"- Variable name: {dataset_vars[data_source]}\n"
            viz_prompt += f"- Shape: {df.shape}\n"
            viz_prompt += f"- Columns: {df.columns.tolist()}\n"
            viz_prompt += f"- Sample data: {df.head(3).to_dict(orient='records')}\n"

        viz_prompt += """
        Generate ONLY Python code that:
        1. Uses matplotlib and/or seaborn to create a clear visualization
        2. Sets figure size to (10, 6)
        3. Includes appropriate titles, labels, and legend
        4. Uses professional color schemes
        5. Handles any missing data gracefully
        6. Reads its data from the dataset variables named above, which are already defined as pandas DataFrames

        Return ONLY executable Python code, no explanations or markdown.
        """

        # Remember which figures existed before, so that only figures opened
        # here (including any the generated code opens) are closed afterwards
        figures_before = set(plt.get_fignums())
        error_message = None
        img_data = None

        try:
            # Get visualization code from AI
            viz_code = await self.ai_service.call_api([
                {"role": "system", "content": "You are a data visualization expert. Provide only executable Python code."},
                {"role": "user", "content": viz_prompt}
            ])

            # Clean code
            viz_code = viz_code.replace("```python", "").replace("```", "").strip()

            # Execute visualization code
            plt.figure(figsize=(10, 6))

            # NOTE(security): this runs model-generated code inside the current
            # process without any sandboxing. Review before exposing the agent
            # to untrusted prompts or documents.
            exec(viz_code, namespace)

            # Capture the image
            img_data = self._get_image_base64(format_type)

        except Exception as e:
            logger.error(f"Error creating visualization: {str(e)}", exc_info=True)
            error_message = f"Visualization error: {str(e)}"

        finally:
            for number in set(plt.get_fignums()) - figures_before:
                plt.close(number)

        if error_message is not None:
            # Create error message image
            img_data = render_message(error_message, 12)

        return build_document(img_data)

    async def _create_data_document(self, datasets: Dict, prompt: str, output_label: str,
                              analysis_plan: Dict, description: str) -> Dict:
        """
        Create a data document (e.g., CSV, JSON) based on analysis.

        Args:
            datasets: Dictionary of datasets
            prompt: Original task prompt
            output_label: Output filename
            analysis_plan: Analysis plan from AI
            description: Output description

        Returns:
            Data document
        """
        # Determine format from filename
        format_type = output_label.split('.')[-1].lower()

        # If no datasets available, return error message
        if not datasets:
            return {
                "label": output_label,
                "content": f"No data available for processing into {format_type} format.",
                "metadata": {
                    "content_type": "text/plain"
                }
            }

        # Generate data processing instructions
        data_prompt = f"""
        Create Python code to process datasets and generate a {format_type} file for:

        TASK: {prompt}

        OUTPUT REQUIREMENTS:
        - Format: {format_type}
        - Filename: {output_label}
        - Description: {description}

        ANALYSIS CONTEXT:
        {json.dumps(analysis_plan, indent=2)}

        AVAILABLE DATASETS:
        """

        # Add dataset info
        for name, df in datasets.items():
            data_prompt += f"\nDataset '{name}':\n"
            data_prompt += f"- Shape: {df.shape}\n"
            data_prompt += f"- Columns: {df.columns.tolist()}\n"
            data_prompt += f"- Sample data: {df.head(3).to_dict(orient='records')}\n"

        data_prompt += """
        Generate Python code that:
        1. Processes the available dataset(s)
        2. Performs necessary transformations, aggregations, or calculations
        3. Outputs the result in the requested format
        4. Returns the content as a string variable named 'result'

        Return ONLY executable Python code, no explanations or markdown.
        """

        try:
            # Get data processing code from AI
            data_code = await self.ai_service.call_api([
                {"role": "system", "content": "You are a data processing expert. Provide only executable Python code."},
                {"role": "user", "content": data_prompt}
            ])

            # Clean code
            data_code = data_code.replace("```python", "").replace("```", "").strip()

            # Setup execution environment
            local_vars = {"pd": pd, "np": __import__('numpy'), "io": io}

            # Add datasets to local variables
            for name, df in datasets.items():
                # Create a sanitized variable name
                var_name = ''.join(c if c.isalnum() else '_' for c in name)
                local_vars[var_name] = df

                # Also add with standard names for simpler code
                if "df" not in local_vars:
                    local_vars["df"] = df
                elif "df2" not in local_vars:
                    local_vars["df2"] = df

            # Execute the code
            exec(data_code, globals(), local_vars)

            # Get the result
            result = local_vars.get("result", "No output was generated.")

            # Determine content type
            content_type = "text/csv" if format_type == "csv" else \
                          "application/json" if format_type == "json" else \
                          "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" if format_type == "xlsx" else \
                          "text/plain"

            return {
                "label": output_label,
                "content": result,
                "metadata": {
                    "content_type": content_type
                }
            }

        except Exception as e:
            logger.error(f"Error creating data document: {str(e)}", exc_info=True)

            return {
                "label": output_label,
                "content": f"Error generating {format_type} document: {str(e)}",
                "metadata": {
                    "content_type": "text/plain"
                }
            }

    async def _create_text_document(self, datasets: Dict, context: str, prompt: str,
                              output_label: str, format_type: str,
                              analysis_plan: Dict, description: str) -> Dict:
        """
        Create a text document (report, analysis, etc.) based on analysis.

        Args:
            datasets: Dictionary of datasets
            context: Document context text
            prompt: Original task prompt
            output_label: Output filename
            format_type: Output format type
            analysis_plan: Analysis plan from AI
            description: Output description

        Returns:
            Text document
        """
        # Create dataset summaries
        dataset_summaries = []
        for name, df in datasets.items():
            summary = f"Dataset: {name}\n"
            summary += f"- Shape: {df.shape[0]} rows, {df.shape[1]} columns\n"
            summary += f"- Columns: {', '.join(df.columns.tolist())}\n"

            # Basic statistics for numeric columns
            numeric_cols = df.select_dtypes(include=['number']).columns
            if len(numeric_cols) > 0:
                summary += "- Numeric Columns Stats:\n"
                for col in numeric_cols[:3]:  # Limit to first 3
                    stats = df[col].describe()
                    summary += f"  - {col}: min={stats['min']:.2f}, max={stats['max']:.2f}, mean={stats['mean']:.2f}\n"

            dataset_summaries.append(summary)

        # Determine content type based on format
        content_type = "text/markdown" if format_type in ["md", "markdown"] else \
                      "text/html" if format_type == "html" else \
                      "text/plain"

        # Generate analysis prompt
        analysis_prompt = f"""
        Create a detailed {format_type} document for:

        TASK: {prompt}

        OUTPUT REQUIREMENTS:
        - Format: {format_type}
        - Filename: {output_label}
        - Description: {description}

        ANALYSIS CONTEXT:
        {json.dumps(analysis_plan, indent=2)}

        DATASET SUMMARIES:
        {"".join(dataset_summaries)}

        DOCUMENT CONTEXT:
        {context[:2000]}... (truncated)

        Create a comprehensive, professional analysis document that addresses the task requirements.
        The document should:
        1. Have a clear structure with headings and sections
        2. Include relevant data findings and insights
        3. Provide appropriate interpretations and recommendations
        4. Format the content according to the required output format

        Your response should be the complete document content in the specified format.
        """

        try:
            # Get document content from AI
            document_content = await self.ai_service.call_api([
                {"role": "system", "content": f"You are a data analysis expert creating a {format_type} document."},
                {"role": "user", "content": analysis_prompt}
            ])

            # Clean HTML or Markdown if needed
            if format_type in ["md", "markdown"] and not document_content.strip().startswith("#"):
                document_content = f"# Analysis Report\n\n{document_content}"
            elif format_type == "html" and not "<html" in document_content.lower():
                document_content = f"<html><body>{document_content}</body></html>"

            return {
                "label": output_label,
                "content": document_content,
                "metadata": {
                    "content_type": content_type
                }
            }

        except Exception as e:
            logger.error(f"Error creating text document: {str(e)}", exc_info=True)

            # Create a simple error document
            if format_type in ["md", "markdown"]:
                content = f"# Error in Analysis\n\nThere was an error generating the analysis: {str(e)}"
            elif format_type == "html":
                content = f"<html><body><h1>Error in Analysis</h1><p>There was an error generating the analysis: {str(e)}</p></body></html>"
            else:
                content = f"Error in Analysis\n\nThere was an error generating the analysis: {str(e)}"

            return {
                "label": output_label,
                "content": content,
                "metadata": {
                    "content_type": content_type
                }
            }

    def _get_image_base64(self, format_type: str = 'png') -> str:
        """
        Render the current matplotlib figure and return it base64-encoded.

        The figure is saved at 100 dpi into an in-memory buffer; nothing is
        written to disk and the figure itself is left open.

        Args:
            format_type: Image format passed to ``plt.savefig``

        Returns:
            Base64 encoded string of the image
        """
        # The in-memory sink is released as soon as the raw bytes are copied out
        with io.BytesIO() as sink:
            plt.savefig(sink, format=format_type, dpi=100)
            raw_bytes = sink.getvalue()

        # Convert to base64 text
        return base64.b64encode(raw_bytes).decode('utf-8')


# Factory function for the Analyst agent
def get_analyst_agent():
    """Create a new Analyst agent and return it."""
    agent = AgentAnalyst()
    return agent