"""
Data analyst agent for analysis and interpretation of data.
Optimized for the new task-based processing.
"""

import logging
import json
import re
import uuid
import io
import base64
from typing import Dict, Any, List, Optional

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from modules.chat_registry import AgentBase

logger = logging.getLogger(__name__)


class AgentAnalyst(AgentBase):
    """Agent for analysis and interpretation of data"""

    # Output filename suffixes mapped to the format name used internally.
    _FORMAT_BY_SUFFIX = (
        (('.md',), "markdown"),
        (('.html',), "html"),
        (('.txt',), "text"),
        (('.json',), "json"),
        (('.csv',), "csv"),
        (('.png',), "png"),
        (('.jpg', '.jpeg'), "jpg"),
        (('.svg',), "svg"),
    )

    # "vs" must be matched as a whole word; a plain substring test also hits
    # unrelated words such as "csvs" or "devs".
    _VERSUS_PATTERN = re.compile(r"\b(?:vs|versus)\b")

    def __init__(self):
        """Initialize the data analysis agent"""
        super().__init__()
        self.name = "analyst"
        self.description = "Analyzes and interprets data using statistical methods and visualizations"
        self.capabilities = [
            "data_analysis",
            "pattern_recognition",
            "statistics",
            "visualization",
            "data_interpretation",
        ]

        # Visualization settings
        self.plt_style = 'seaborn-v0_8-whitegrid'
        self.default_figsize = (10, 6)
        self.chart_dpi = 100
        # NOTE: this changes the process-wide matplotlib style.
        plt.style.use(self.plt_style)

    async def process_task(self, task: Dict[str, Any]) -> Dict[str, Any]:
        """
        Process a standardized task structure and perform data analysis.

        Args:
            task: A dictionary containing:
                - task_id: Unique ID for this task
                - prompt: The main instruction for the agent
                - input_documents: List of documents to process
                - output_specifications: List of required output documents
                - context: Additional contextual information

        Returns:
            A dictionary containing:
                - feedback: Text response explaining the analysis results
                - documents: List of created document objects
                  (each a dict with "label" and "content")

        Any exception raised while processing is caught, logged, and reported
        through the "feedback" field with an empty document list.
        """
        try:
            # `or` defaults guard against keys that are present but None
            prompt = task.get("prompt") or ""
            input_documents = task.get("input_documents") or []
            output_specs = task.get("output_specifications") or []

            # self.ai_service is not set in this class; presumably it is
            # provided by AgentBase or injected by the caller.
            if not self.ai_service:
                logger.error("No AI service configured for the Analyst agent")
                return {
                    "feedback": "The Analyst agent is not properly configured.",
                    "documents": []
                }

            # Extract data from input documents
            data_frames, document_context = self._extract_data_from_documents(input_documents)

            # Either tabular data or a non-trivial prompt is needed
            has_meaningful_prompt = len(prompt.strip()) > 10
            if not data_frames and not has_meaningful_prompt:
                logger.warning("No analyzable content found")
                feedback = "I couldn't find any processable data in the provided documents."
                return {
                    "feedback": feedback,
                    "documents": []
                }

            # Determine analysis type
            analysis_type = self._determine_analysis_type(prompt)
            logger.info(f"Performing {analysis_type} analysis")

            generated_documents = []

            # Extract data insights if DataFrames are available
            data_insights = ""
            if data_frames:
                data_insights = self._extract_data_insights(data_frames)
                logger.info(f"Extracted insights from {len(data_frames)} datasets")

            # Generate an appropriate document for each requested output
            for spec in output_specs:
                output_label = spec.get("label") or ""
                output_description = spec.get("description") or ""

                # Determine format based on file extension
                format_type = self._determine_format_type(output_label)

                if self._is_visualization_request(output_label, format_type):
                    if data_frames:
                        generated_documents.append(
                            self._generate_visualization_document(
                                data_frames, analysis_type, prompt, output_label
                            )
                        )
                    else:
                        # Fallback if no data
                        generated_documents.append({
                            "label": output_label,
                            "content": "No data available for visualization."
                        })
                else:
                    # Create text-based analysis
                    content = await self._generate_analysis_document(
                        prompt,
                        document_context,
                        data_insights,
                        analysis_type,
                        format_type,
                        output_label,
                        output_description
                    )
                    generated_documents.append({
                        "label": output_label,
                        "content": content
                    })

            # If no specific outputs requested, create standard documents
            if not output_specs:
                analysis_content = await self._generate_analysis_document(
                    prompt,
                    document_context,
                    data_insights,
                    analysis_type,
                    "markdown",
                    "analysis_report.md",
                    "Analysis report"
                )
                generated_documents.append({
                    "label": "analysis_report.md",
                    "content": analysis_content
                })

                # Add visualization if data available
                if data_frames:
                    generated_documents.append(
                        self._generate_visualization_document(
                            data_frames, analysis_type, prompt, "data_visualization.png"
                        )
                    )

            # Create feedback
            if data_frames:
                feedback = f"I analyzed {len(data_frames)} datasets and created {len(generated_documents)} documents with the results."
            else:
                feedback = f"I performed a text analysis and created {len(generated_documents)} documents with the results."

            return {
                "feedback": feedback,
                "documents": generated_documents
            }

        except Exception as e:
            # Top-level boundary: keep the traceback in the log, report the
            # message to the caller.
            logger.exception(f"Error during data analysis: {str(e)}")
            return {
                "feedback": f"An error occurred during data analysis: {str(e)}",
                "documents": []
            }

    @staticmethod
    def _is_visualization_request(output_label: str, format_type: str) -> bool:
        """Return True when an output spec asks for a chart rather than text."""
        label_lower = output_label.lower()
        mentions_chart = any(
            keyword in label_lower for keyword in ("chart", "plot", "visualization")
        )
        return mentions_chart or format_type in ("png", "jpg", "svg")

    def _extract_data_from_documents(self, documents: List[Dict[str, Any]]) -> tuple:
        """
        Extract data from input documents.

        Text contents are appended to a context string; documents whose name
        ends in .csv or .json are additionally parsed into DataFrames. Parse
        failures are logged as warnings and otherwise ignored (best effort).

        Args:
            documents: List of input documents

        Returns:
            Tuple of (Dictionary of DataFrames, Document context text)
        """
        data_frames = {}
        document_context = ""

        for doc in documents:
            doc_name = doc.get("name", "unnamed")
            doc_name_lower = doc_name.lower()
            document_context += f"\n\n--- {doc_name} ---\n"

            for content in doc.get("contents", []):
                # Only contents flagged as text are considered
                if not content.get("metadata", {}).get("is_text", False):
                    continue

                text = content.get("data", "")
                document_context += text

                if doc_name_lower.endswith('.csv'):
                    try:
                        df = pd.read_csv(io.StringIO(text))
                        df = self._preprocess_dataframe(df)
                        data_frames[doc_name] = df
                        logger.info(f"Extracted CSV data from {doc_name}: {df.shape}")
                    except Exception as e:
                        logger.warning(f"Error parsing CSV {doc_name}: {str(e)}")

                elif doc_name_lower.endswith('.json'):
                    try:
                        df = self._json_to_dataframe(json.loads(text))
                        if df is None:
                            # JSON shape that cannot be tabulated
                            continue
                        df = self._preprocess_dataframe(df)
                        data_frames[doc_name] = df
                        logger.info(f"Extracted JSON data from {doc_name}: {df.shape}")
                    except Exception as e:
                        logger.warning(f"Error parsing JSON {doc_name}: {str(e)}")

        return data_frames, document_context

    @staticmethod
    def _json_to_dataframe(json_data: Any) -> Optional[pd.DataFrame]:
        """
        Convert parsed JSON into a DataFrame.

        A list becomes a DataFrame directly. A dict containing list values
        uses its first non-empty list; a dict without list values becomes a
        single-row DataFrame. Returns None when nothing can be tabulated.
        """
        if isinstance(json_data, list):
            return pd.DataFrame(json_data)
        if not isinstance(json_data, dict):
            return None

        list_values = [value for value in json_data.values() if isinstance(value, list)]
        if not list_values:
            return pd.DataFrame([json_data])

        for value in list_values:
            if value:
                return pd.DataFrame(value)
        return None

    def _determine_format_type(self, output_label: str) -> str:
        """
        Determine the format type based on the filename.

        Args:
            output_label: Output filename

        Returns:
            Format type (markdown, html, text, json, csv, png, jpg, svg);
            "markdown" when the extension is not recognized.
        """
        label_lower = output_label.lower()
        for suffixes, format_name in self._FORMAT_BY_SUFFIX:
            if label_lower.endswith(suffixes):
                return format_name
        # Default to markdown
        return "markdown"

    def _preprocess_dataframe(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Perform basic preprocessing for a DataFrame.

        Drops fully empty rows and columns, then converts object columns to
        numeric when more than 80% of their non-NA values parse as numbers
        (unparseable values in such columns become NaN).
        """
        if df.empty:
            return df

        # Remove completely empty rows and columns
        df = df.dropna(how='all')
        df = df.dropna(axis=1, how='all')

        for col in df.columns:
            # Only non-numeric object columns are candidates for conversion
            if pd.api.types.is_numeric_dtype(df[col]):
                continue
            if df[col].dtype != 'object':
                continue

            non_na_values = df[col].dropna()
            if len(non_na_values) == 0:
                continue

            numeric_count = pd.to_numeric(non_na_values, errors='coerce').notna().sum()
            if numeric_count / len(non_na_values) > 0.8:
                df[col] = pd.to_numeric(df[col], errors='coerce')

        return df

    def _determine_analysis_type(self, task: str) -> str:
        """
        Determine the analysis type based on the task.

        The first matching keyword group wins, in this order: statistical,
        trend, comparative, predictive, clustering. "vs"/"versus" are matched
        as whole words only.

        Args:
            task: The analysis task

        Returns:
            Analysis type ("general" when no keyword matches)
        """
        task_lower = (task or "").lower()

        def mentions(*keywords: str) -> bool:
            return any(keyword in task_lower for keyword in keywords)

        if mentions("statistical", "stats"):
            return "statistical"
        if mentions("trend", "time series"):
            return "trend"
        if mentions("compare", "comparison") or self._VERSUS_PATTERN.search(task_lower):
            return "comparative"
        if mentions("predict", "forecast"):
            return "predictive"
        if mentions("cluster", "segment", "classify"):
            return "clustering"
        # Default: general analysis
        return "general"

    def _extract_data_insights(self, data_frames: Dict[str, pd.DataFrame]) -> str:
        """
        Extract basic insights from DataFrames.

        For each non-empty DataFrame this reports its shape and columns,
        min/max/mean/median for up to five numeric columns, and the three most
        common values for up to three categorical columns.

        Args:
            data_frames: Dictionary of DataFrames

        Returns:
            Extracted insights as text
        """
        insights = []

        for name, df in data_frames.items():
            if df.empty:
                continue

            insight = f"Dataset: {name}\n"
            insight += f"Shape: {df.shape[0]} rows, {df.shape[1]} columns\n"
            # Column labels may be non-strings (e.g. integer labels from JSON)
            insight += f"Columns: {', '.join(str(col) for col in df.columns)}\n"

            # Basic statistics for numeric columns
            numeric_cols = df.select_dtypes(include=['number']).columns
            if len(numeric_cols) > 0:
                insight += "Statistics for numeric columns:\n"
                for col in numeric_cols[:5]:  # Limit to first 5 columns
                    stats = df[col].describe()
                    insight += f" {col}: min={stats['min']:.2f}, max={stats['max']:.2f}, mean={stats['mean']:.2f}, median={df[col].median():.2f}\n"

            # Categorical column values
            cat_cols = df.select_dtypes(include=['object', 'category']).columns
            if len(cat_cols) > 0:
                insight += "Categorical columns:\n"
                for col in cat_cols[:3]:  # Limit to first 3 columns
                    try:
                        top_values = df[col].value_counts().head(3)
                        unique_count = df[col].nunique()
                    except TypeError:
                        # Cells holding unhashable objects (nested lists or
                        # dicts) cannot be counted; skip the column.
                        logger.debug(f"Skipping non-hashable column {col} in {name}")
                        continue
                    vals_str = ", ".join([f"{val} ({count})" for val, count in top_values.items()])
                    insight += f" {col}: {unique_count} unique values. Most common values: {vals_str}\n"

            insights.append(insight)

        return "\n\n".join(insights)

    def _generate_visualization_document(self, data_frames: Dict[str, pd.DataFrame],
                                         analysis_type: str, prompt: str,
                                         output_label: str) -> Dict[str, Any]:
        """
        Generate a visualization document based on the data and analysis type.

        Only the first DataFrame is plotted. Raster formats are returned as a
        base64 string, SVG as its XML text.

        Args:
            data_frames: Dictionary of DataFrames
            analysis_type: Analysis type
            prompt: Original task description (currently unused)
            output_label: Output filename

        Returns:
            Visualization document (dict with "label" and "content")
        """
        # Determine format from filename, falling back to PNG if unknown
        format_type = output_label.split('.')[-1].lower() if '.' in output_label else 'png'
        if format_type not in ['png', 'jpg', 'jpeg', 'svg']:
            format_type = 'png'

        if not data_frames:
            return {
                "label": output_label,
                "content": "No data available for visualization."
            }

        # Get name and DataFrame of first dataset
        name, df = next(iter(data_frames.items()))

        plot_builders = {
            "statistical": self._create_statistical_visualization,
            "trend": self._create_trend_visualization,
            "comparative": self._create_comparative_visualization,
            "predictive": self._create_predictive_visualization,
            "clustering": self._create_clustering_visualization,
        }
        build_plot = plot_builders.get(analysis_type, self._create_general_visualization)

        figure = plt.figure(figsize=self.default_figsize)
        try:
            build_plot(df, name)
            # The figure must be serialized BEFORE it is closed; saving after
            # plt.close() would write a new, empty figure.
            if format_type == 'svg':
                content_str = self._render_figure_as_svg()
            else:
                content_str = self._get_figure_as_base64(format_type)
        finally:
            # Always release the figure, even when plotting fails
            plt.close(figure)

        return {
            "label": output_label,
            "content": content_str
        }

    @staticmethod
    def _render_figure_as_svg() -> str:
        """Serialize the current matplotlib figure to SVG markup."""
        with io.BytesIO() as buffer:
            plt.savefig(buffer, format='svg')
            return buffer.getvalue().decode('utf-8')

    @staticmethod
    def _draw_placeholder_message(message: str) -> None:
        """Write a centered notice on the current figure in place of a chart."""
        plt.text(0.5, 0.5, message, ha='center', va='center', fontsize=12)

    def _create_statistical_visualization(self, df: pd.DataFrame, name: str):
        """Create a statistical visualization (histogram with KDE) for a DataFrame"""
        numeric_cols = df.select_dtypes(include=['number']).columns[:4]  # Limit to first 4

        if len(numeric_cols) == 0:
            self._draw_placeholder_message("No numeric data found for statistical visualization")
            return

        # Visualize distribution of first numeric column
        main_col = numeric_cols[0]
        sns.histplot(df[main_col].dropna(), kde=True)
        plt.title(f'Distribution of {main_col} - {name}')
        plt.xlabel(main_col)
        plt.ylabel('Frequency')
        plt.tight_layout()

    def _create_trend_visualization(self, df: pd.DataFrame, name: str):
        """Create a trend visualization (line plot) for a DataFrame"""
        numeric_cols = df.select_dtypes(include=['number']).columns[:3]  # Limit to first 3

        if len(numeric_cols) == 0:
            self._draw_placeholder_message("No numeric data found for trend visualization")
            return

        # Pick the first datetime column, or one whose label suggests a date.
        # Labels are stringified because they are not guaranteed to be str.
        date_col = None
        for col in df.columns:
            label = str(col).lower()
            if pd.api.types.is_datetime64_dtype(df[col]) or 'date' in label or 'time' in label:
                date_col = col
                break

        # Use the date column as X-axis if available, otherwise row positions
        x_values = df[date_col] if date_col is not None else range(len(df))
        for col in numeric_cols:
            plt.plot(x_values, df[col], marker='o', linestyle='-', label=col)

        plt.title(f'Trend Analysis - {name}')
        plt.legend()
        plt.grid(True)
        plt.tight_layout()

    def _create_comparative_visualization(self, df: pd.DataFrame, name: str):
        """Create a comparative visualization for a DataFrame"""
        numeric_cols = df.select_dtypes(include=['number']).columns[:4]  # Limit to first 4

        if len(numeric_cols) == 0:
            self._draw_placeholder_message("No numeric data found for comparative visualization")
            return

        # Find categorical column for grouping
        categorical_cols = df.select_dtypes(include=['object', 'category']).columns

        if len(categorical_cols) > 0:
            category_col = categorical_cols[0]
            # Display maximum of first 7 categories
            top_categories = df[category_col].value_counts().head(7).index
            filtered_df = df[df[category_col].isin(top_categories)]

            # Bar chart of the first numeric column per category
            numeric_col = numeric_cols[0]
            sns.barplot(x=category_col, y=numeric_col, data=filtered_df)
            plt.title(f'Comparison of {numeric_col} by {category_col} - {name}')
            plt.xticks(rotation=45)
        elif len(numeric_cols) >= 2:
            # Scatter plot for first two numeric columns
            sns.scatterplot(x=numeric_cols[0], y=numeric_cols[1], data=df)
            plt.title(f'Comparison of {numeric_cols[0]} vs {numeric_cols[1]} - {name}')
        else:
            # Simple bar chart for a single numeric column (first 20 rows)
            plt.bar(range(min(20, len(df))), df[numeric_cols[0]].head(20))
            plt.title(f'Top 20 Values for {numeric_cols[0]} - {name}')

        plt.tight_layout()

    def _create_predictive_visualization(self, df: pd.DataFrame, name: str):
        """Create a simple predictive visualization (scatter + linear fit) for a DataFrame"""
        numeric_cols = df.select_dtypes(include=['number']).columns[:2]  # Limit to first 2

        if len(numeric_cols) < 2:
            self._draw_placeholder_message("At least 2 numeric columns required for predictive visualization")
            return

        x = df[numeric_cols[0]].values
        y = df[numeric_cols[1]].values

        # Keep only rows where both coordinates are present
        valid_indices = ~(np.isnan(x) | np.isnan(y))
        if np.sum(valid_indices) <= 1:
            # A line fit needs at least 2 valid data points
            self._draw_placeholder_message("Insufficient data for predictive analysis")
            return

        x_valid = x[valid_indices]
        y_valid = y[valid_indices]

        # Degree-1 least-squares fit
        coeffs = np.polyfit(x_valid, y_valid, 1)
        poly_func = np.poly1d(coeffs)

        # Create prediction line across the observed x range
        x_line = np.linspace(np.min(x_valid), np.max(x_valid), 100)
        y_pred = poly_func(x_line)

        plt.scatter(x_valid, y_valid, alpha=0.7)
        plt.plot(x_line, y_pred, 'r-', linewidth=2)
        plt.title(f'Linear Regression: {numeric_cols[1]} vs {numeric_cols[0]} - {name}')
        plt.xlabel(numeric_cols[0])
        plt.ylabel(numeric_cols[1])
        plt.tight_layout()

    def _create_clustering_visualization(self, df: pd.DataFrame, name: str):
        """Create a clustering visualization (scatter plot) for a DataFrame"""
        numeric_cols = df.select_dtypes(include=['number']).columns[:2]  # Limit to first 2

        if len(numeric_cols) < 2:
            self._draw_placeholder_message("At least 2 numeric columns required for clustering visualization")
            return

        # Extract data for first two numeric columns
        x = df[numeric_cols[0]].values
        y = df[numeric_cols[1]].values

        # Find categorical column for color coding
        categorical_cols = df.select_dtypes(include=['object', 'category']).columns

        if len(categorical_cols) > 0:
            # Color points by the integer codes of the first categorical column
            category_col = categorical_cols[0]
            categories = df[category_col].astype('category').cat.codes
            plt.scatter(x, y, c=categories, cmap='viridis', alpha=0.7)
            plt.colorbar(label=category_col)
        else:
            # Simple scatter plot without color coding
            plt.scatter(x, y, alpha=0.7)

        plt.title(f'Clustering Visualization: {numeric_cols[1]} vs {numeric_cols[0]} - {name}')
        plt.xlabel(numeric_cols[0])
        plt.ylabel(numeric_cols[1])
        plt.tight_layout()

    def _create_general_visualization(self, df: pd.DataFrame, name: str):
        """Create a general visualization for a DataFrame"""
        numeric_cols = df.select_dtypes(include=['number']).columns

        if len(numeric_cols) == 0:
            self._draw_placeholder_message("No numeric data found for visualization")
            return

        if len(numeric_cols) >= 2:
            # Correlation heatmap when multiple numeric columns are available
            corr_matrix = df[numeric_cols].corr()
            sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', vmin=-1, vmax=1)
            plt.title(f'Correlation Matrix - {name}')
        else:
            # Simple distribution for a single numeric column
            sns.histplot(df[numeric_cols[0]].dropna(), kde=True)
            plt.title(f'Distribution of {numeric_cols[0]} - {name}')

        plt.tight_layout()

    def _get_figure_as_base64(self, format_type: str = 'png') -> str:
        """
        Convert current matplotlib figure to base64 string.

        Args:
            format_type: Image format (png, jpg, svg)

        Returns:
            Base64 encoded string of the figure
        """
        with io.BytesIO() as buffer:
            plt.savefig(buffer, format=format_type, dpi=self.chart_dpi)
            image_data = buffer.getvalue()

        return base64.b64encode(image_data).decode('utf-8')

    async def _generate_analysis_document(self, prompt: str, context: str, data_insights: str,
                                          analysis_type: str, format_type: str,
                                          output_label: str, output_description: str) -> str:
        """
        Generate an analysis document based on the data and prompt.

        Args:
            prompt: Task description
            context: Document context as text
            data_insights: Insights from the data
            analysis_type: Analysis type
            format_type: Output format
            output_label: Output filename
            output_description: Description of desired output

        Returns:
            Generated document content. When the AI service is missing or the
            call fails, a short markdown document describing the problem is
            returned instead of raising.
        """
        if not self.ai_service:
            return f"# Data Analysis ({analysis_type})\n\nAnalysis could not be generated: AI service not available."

        # Create specialized prompt based on analysis type
        system_prompt = f"""
        You are a specialized data analyst focused on {analysis_type} analyses.
        Create a detailed analysis of the provided data and/or text content.

        Your analysis should include:
        1. A summary of the data/content
        2. Key findings and insights
        3. Supporting evidence and calculations
        4. Clear conclusions
        5. Recommendations where appropriate

        Format the analysis in the requested output format.
        """

        # Create extended prompt with all available information
        generation_prompt = f"""
        Create a detailed {analysis_type} analysis for the following task:

        TASK:
        {prompt}

        CONTEXT:
        {context if context else 'No additional context available.'}

        DATA INSIGHTS:
        {data_insights if data_insights else 'No data insights available.'}

        OUTPUT REQUIREMENTS:
        - Filename: {output_label}
        - Description: {output_description}
        - Format: {format_type}

        The analysis should be professional and clearly structured, considering all available information.
        The output must perfectly match the {format_type} format.
        """

        try:
            # Call AI for analysis
            content = await self.ai_service.call_api([
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": generation_prompt}
            ])

            # For markdown format, ensure there's a title at the beginning
            if format_type == "markdown" and not content.strip().startswith("# "):
                content = f"# Data Analysis ({analysis_type})\n\n{content}"

            return content
        except Exception as e:
            logger.error(f"Error generating analysis: {str(e)}")
            return f"# Data Analysis ({analysis_type})\n\nError generating analysis: {str(e)}"


# Factory function for the Analyst agent
def get_analyst_agent():
    """
    Factory function that returns an instance of the Analyst agent.

    Returns:
        An instance of the Analyst agent
    """
    return AgentAnalyst()