gateway/modules/agents/z_agentAnalyst copy.py
ValueOn AG f3860723af wip
2025-06-08 03:12:43 +02:00

902 lines
No EOL
36 KiB
Python

"""
Data analyst agent for analysis and interpretation of data.
Focuses on output-first design with AI-powered analysis.
"""
import logging
import json
import io
import base64
import os
import time
from typing import Dict, Any, List, Optional
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import hashlib
import uuid
import re
import shutil
from pathlib import Path
import traceback
import sys
import importlib.util
import inspect
from pydantic import BaseModel
from modules.workflow.agentBase import AgentBase
from modules.interfaces.serviceChatModel import ChatContent
logger = logging.getLogger(__name__)
class AgentAnalyst(AgentBase):
"""AI-driven agent for data analysis and visualization"""
def __init__(self):
"""Initialize the data analysis agent"""
super().__init__()
self.name = "analyst"
self.label = "Data Analysis"
self.description = "Analyzes data using AI-powered insights and visualizations, produce diagrams and visualizations"
self.capabilities = [
"dataAnalysis",
"statistics",
"visualization",
"dataInterpretation",
"reportGeneration"
]
# Set default visualization settings
plt.style.use('seaborn-v0_8-whitegrid')
def setDependencies(self, serviceBase=None):
"""Set external dependencies for the agent."""
self.setService(serviceBase)
async def processTask(self, task: Dict[str, Any]) -> Dict[str, Any]:
"""
Process a task by focusing on required outputs and using AI to guide the analysis process.
Args:
task: Task dictionary with prompt, inputDocuments, outputSpecifications
Returns:
Dictionary with feedback and documents
"""
try:
# Extract task information
prompt = task.get("prompt", "")
outputSpecs = task.get("outputSpecifications", [])
workflow = task.get("context", {}).get("workflow", {})
# Check AI service
if not self.service or not self.service.base:
return {
"feedback": "The Analyst agent requires an AI service to function effectively.",
"documents": []
}
# Create analysis plan
if workflow:
self.service.logAdd(workflow, "Extracting data from documents...", level="info", progress=35)
analysisPlan = await self._createAnalysisPlan(prompt)
# Check if this is truly an analysis task
if not analysisPlan.get("requiresAnalysis", True):
return {
"feedback": "This task doesn't appear to require analysis. Please try a different agent.",
"documents": []
}
# Analyze data
if workflow:
self.service.logAdd(workflow, "Analyzing task requirements...", level="info", progress=45)
analysisResults = await self._analyzeData(task, analysisPlan)
# Format results into requested output documents
totalSpecs = len(outputSpecs)
for i, spec in enumerate(outputSpecs):
progress = 50 + int((i / totalSpecs) * 40) # Progress from 50% to 90%
self.service.logAdd(workflow, f"Creating output {i+1}/{totalSpecs}...", level="info", progress=progress)
documents = await self._createOutputDocuments(
prompt,
analysisResults,
outputSpecs,
analysisPlan
)
# Generate feedback
feedback = analysisPlan.get("feedback", f"I analyzed '{prompt[:50]}...' and generated {len(documents)} output documents.")
return {
"feedback": feedback,
"documents": documents
}
except Exception as e:
logger.error(f"Error during analysis: {str(e)}", exc_info=True)
return {
"feedback": f"Error during analysis: {str(e)}",
"documents": []
}
def _extractData(self, documents: List[Dict[str, Any]]) -> tuple:
"""
Extract data from documents, focusing on dataExtracted fields.
Args:
documents: List of input documents
Returns:
Tuple of (datasets dictionary, document context text)
"""
datasets = {}
documentContext = ""
# Process each document
for doc in documents:
docName = doc.get("name", "unnamed")
if doc.get("ext"):
docName = f"{docName}.{doc.get('ext')}"
documentContext += f"\n\n--- {docName} ---\n"
# Process contents
for content in doc.get("contents", []):
# Focus only on dataExtracted
if content.get("dataExtracted"):
extractedText = content.get("dataExtracted", "")
documentContext += extractedText
# Try to parse as structured data if appropriate
if docName.lower().endswith(('.csv', '.tsv')):
try:
df = pd.read_csv(io.StringIO(extractedText))
datasets[docName] = df
except:
pass
elif docName.lower().endswith('.json'):
try:
jsonData = json.loads(extractedText)
if isinstance(jsonData, list):
df = pd.DataFrame(jsonData)
datasets[docName] = df
elif isinstance(jsonData, dict):
# Handle nested JSON structures
if any(isinstance(v, list) for v in jsonData.values()):
for key, value in jsonData.items():
if isinstance(value, list) and len(value) > 0:
df = pd.DataFrame(value)
datasets[f"{docName}:{key}"] = df
else:
df = pd.DataFrame([jsonData])
datasets[docName] = df
except:
pass
# Try to detect tabular data in text content
if docName not in datasets and len(extractedText.splitlines()) > 2:
lines = extractedText.splitlines()
if any(',' in line for line in lines[:5]):
try:
df = pd.read_csv(io.StringIO(extractedText))
if len(df.columns) > 1:
datasets[docName] = df
except:
pass
elif any('\t' in line for line in lines[:5]):
try:
df = pd.read_csv(io.StringIO(extractedText), sep='\t')
if len(df.columns) > 1:
datasets[docName] = df
except:
pass
return datasets, documentContext
async def _analyzeTask(self, prompt: str, documentContext: str, datasets: Dict[str, Any], outputSpecs: List[Dict[str, Any]]) -> Dict[str, Any]:
"""
Analyze the task requirements using AI.
Args:
prompt: The task prompt
documentContext: Context from input documents
datasets: Available datasets
outputSpecs: Output specifications
Returns:
Analysis plan dictionary
"""
# Create analysis prompt
analysisPrompt = f"""
Analyze this data analysis task and create a detailed plan:
TASK: {prompt}
DOCUMENT CONTEXT:
{documentContext}
AVAILABLE DATASETS:
{json.dumps(datasets, indent=2)}
REQUIRED OUTPUTS:
{json.dumps(outputSpecs, indent=2)}
Create a detailed analysis plan in JSON format with:
{{
"analysisSteps": [
{{
"step": "step description",
"purpose": "why this step is needed",
"datasets": ["dataset1", "dataset2"],
"techniques": ["technique1", "technique2"],
"outputs": ["output1", "output2"]
}}
],
"visualizations": [
{{
"type": "visualization type",
"purpose": "what it shows",
"datasets": ["dataset1"],
"settings": {{"key": "value"}}
}}
],
"insights": [
{{
"type": "insight type",
"description": "what to look for",
"datasets": ["dataset1"]
}}
],
"feedback": "explanation of the analysis approach"
}}
Respond with ONLY the JSON object, no additional text or explanations.
"""
try:
# Get analysis plan from AI
response = await self.service.base.callAi([
{"role": "system", "content": "You are a data analysis expert. Create detailed analysis plans. Respond with valid JSON only."},
{"role": "user", "content": analysisPrompt}
], produceUserAnswer=True)
# Extract JSON
jsonStart = response.find('{')
jsonEnd = response.rfind('}') + 1
if jsonStart >= 0 and jsonEnd > jsonStart:
plan = json.loads(response[jsonStart:jsonEnd])
return plan
else:
# Fallback plan
logger.warning(f"Not able creating analysis plan, generating fallback plan")
return {
"analysisSteps": [
{
"step": "Basic data analysis",
"purpose": "Understand the data structure and content",
"datasets": list(datasets.keys()),
"techniques": ["summary statistics", "data visualization"],
"outputs": ["summary report", "basic visualizations"]
}
],
"visualizations": [
{
"type": "basic charts",
"purpose": "Show data distribution and relationships",
"datasets": list(datasets.keys()),
"settings": {}
}
],
"insights": [
{
"type": "basic insights",
"description": "Key findings from the data",
"datasets": list(datasets.keys())
}
],
"feedback": f"I'll analyze the data and provide insights about {prompt}"
}
except Exception as e:
logger.warning(f"Error creating analysis plan: {str(e)}")
# Simple fallback plan
return {
"analysisSteps": [
{
"step": "Basic data analysis",
"purpose": "Understand the data structure and content",
"datasets": list(datasets.keys()),
"techniques": ["summary statistics", "data visualization"],
"outputs": ["summary report", "basic visualizations"]
}
],
"visualizations": [
{
"type": "basic charts",
"purpose": "Show data distribution and relationships",
"datasets": list(datasets.keys()),
"settings": {}
}
],
"insights": [
{
"type": "basic insights",
"description": "Key findings from the data",
"datasets": list(datasets.keys())
}
],
"feedback": f"I'll analyze the data and provide insights about {prompt}"
}
async def _createAnalysisPlan(self, prompt: str) -> Dict[str, Any]:
"""
Create an analysis plan based on the task prompt.
Args:
prompt: The task prompt
Returns:
Analysis plan dictionary
"""
try:
# Create analysis prompt
analysisPrompt = f"""
Analyze this data analysis task and create a detailed plan:
TASK: {prompt}
Create a detailed analysis plan in JSON format with:
{{
"requiresAnalysis": true/false,
"analysisSteps": [
{{
"step": "step description",
"purpose": "why this step is needed",
"techniques": ["technique1", "technique2"],
"outputs": ["output1", "output2"]
}}
],
"visualizations": [
{{
"type": "visualization type",
"purpose": "what it shows",
"settings": {{"key": "value"}}
}}
],
"insights": [
{{
"type": "insight type",
"description": "what to look for"
}}
],
"feedback": "explanation of the analysis approach"
}}
Respond with ONLY the JSON object, no additional text or explanations.
"""
# Get analysis plan from AI
response = await self.service.base.callAi([
{"role": "system", "content": "You are a data analysis expert. Create detailed analysis plans. Respond with valid JSON only."},
{"role": "user", "content": analysisPrompt}
], produceUserAnswer=True)
# Extract JSON
jsonStart = response.find('{')
jsonEnd = response.rfind('}') + 1
if jsonStart >= 0 and jsonEnd > jsonStart:
plan = json.loads(response[jsonStart:jsonEnd])
return plan
else:
# Fallback plan
logger.warning(f"Not able creating analysis plan, generating fallback plan")
return {
"requiresAnalysis": True,
"analysisSteps": [
{
"step": "Basic data analysis",
"purpose": "Understand the data structure and content",
"techniques": ["summary statistics", "data visualization"],
"outputs": ["summary report", "basic visualizations"]
}
],
"visualizations": [
{
"type": "basic charts",
"purpose": "Show data distribution and relationships",
"settings": {}
}
],
"insights": [
{
"type": "basic insights",
"description": "Key findings from the data"
}
],
"feedback": f"I'll analyze the data and provide insights about {prompt}"
}
except Exception as e:
logger.warning(f"Error creating analysis plan: {str(e)}")
# Simple fallback plan
return {
"requiresAnalysis": True,
"analysisSteps": [
{
"step": "Basic data analysis",
"purpose": "Understand the data structure and content",
"techniques": ["summary statistics", "data visualization"],
"outputs": ["summary report", "basic visualizations"]
}
],
"visualizations": [
{
"type": "basic charts",
"purpose": "Show data distribution and relationships",
"settings": {}
}
],
"insights": [
{
"type": "basic insights",
"description": "Key findings from the data"
}
],
"feedback": f"I'll analyze the data and provide insights about {prompt}"
}
async def _createVisualization(self, datasets: Dict, prompt: str, outputLabel: str,
analysisPlan: Dict, description: str) -> Dict:
"""
Create a visualization based on the analysis plan.
Args:
datasets: Dictionary of datasets
prompt: Original task prompt
outputLabel: Output file label
analysisPlan: Analysis plan
description: Output description
Returns:
Document dictionary with visualization
"""
try:
# Get visualization recommendations
vizRecommendations = analysisPlan.get("visualizations", [])
if not vizRecommendations:
# Generate visualization recommendations if none provided
self.service.base.logAdd(analysisPlan.get("workflowId"), "Generating visualization recommendations...", level="info", progress=50)
vizPrompt = f"""
Based on this data and task, recommend appropriate visualizations.
TASK: {prompt}
DESCRIPTION: {description}
DATASETS:
{json.dumps({name: {"shape": df.shape, "columns": df.columns.tolist()}
for name, df in datasets.items()}, indent=2)}
Recommend visualizations in JSON format:
{{
"visualizations": [
{{
"type": "chart_type",
"dataSource": "dataset_name",
"variables": ["col1", "col2"],
"purpose": "explanation"
}}
]
}}
"""
response = await self.service.base.callAi([
{"role": "system", "content": "You are a data visualization expert. Recommend appropriate visualizations based on the data and task."},
{"role": "user", "content": vizPrompt}
])
# Extract JSON
jsonStart = response.find('{')
jsonEnd = response.rfind('}') + 1
if jsonStart >= 0 and jsonEnd > jsonStart:
vizData = json.loads(response[jsonStart:jsonEnd])
vizRecommendations = vizData.get("visualizations", [])
# Determine format from filename
formatType = outputLabel.split('.')[-1].lower()
if formatType not in ['png', 'jpg', 'jpeg', 'svg']:
formatType = 'png'
# If no datasets available, create error message image
if not datasets:
plt.figure(figsize=(10, 6))
plt.text(0.5, 0.5, "No data available for visualization",
ha='center', va='center', fontsize=14)
plt.tight_layout()
imgData = self._getImageBase64(formatType)
plt.close()
return {
"label": outputLabel,
"content": imgData,
"metadata": {
"contentType": f"image/{formatType}"
}
}
# Prepare dataset info for the first dataset if none specified
if not vizRecommendations and datasets:
name, df = next(iter(datasets.items()))
vizRecommendations = [{
"type": "auto",
"dataSource": name,
"variables": df.columns.tolist()[:5],
"purpose": "general analysis"
}]
# Create visualization code prompt
vizPrompt = f"""
Generate Python matplotlib/seaborn code to create a visualization for:
TASK: {prompt}
VISUALIZATION REQUIREMENTS:
- Output format: {formatType}
- Filename: {outputLabel}
- Description: {description}
RECOMMENDED VISUALIZATION:
{json.dumps(vizRecommendations, indent=2)}
AVAILABLE DATASETS:
"""
# Add dataset info for recommended sources
for viz in vizRecommendations:
dataSource = viz.get("dataSource")
if dataSource in datasets:
df = datasets[dataSource]
vizPrompt += f"\nDataset '{dataSource}':\n"
vizPrompt += f"- Shape: {df.shape}\n"
vizPrompt += f"- Columns: {df.columns.tolist()}\n"
vizPrompt += f"- Sample data: {df.head(3).to_dict(orient='records')}\n"
vizPrompt += """
Generate ONLY Python code that:
1. Uses matplotlib and/or seaborn to create a clear visualization
2. Sets figure size to (10, 6)
3. Includes appropriate titles, labels, and legend
4. Uses professional color schemes
5. Handles any missing data gracefully
Return ONLY executable Python code, no explanations or markdown.
"""
try:
# Get visualization code from AI
vizCode = await self.service.base.callAi([
{"role": "system", "content": "You are a data visualization expert. Provide only executable Python code."},
{"role": "user", "content": vizPrompt}
], produceUserAnswer = True)
# Clean code
vizCode = vizCode.replace("```python", "").replace("```", "").strip()
# Execute visualization code
plt.figure(figsize=(10, 6))
# Make local variables available to the code
localVars = {
"plt": plt,
"sns": sns,
"pd": pd,
"np": __import__('numpy')
}
# Add datasets to local variables
for name, df in datasets.items():
# Create a sanitized variable name
varName = ''.join(c if c.isalnum() else '_' for c in name)
localVars[varName] = df
# Also add with standard names for simpler code
if "df" not in localVars:
localVars["df"] = df
elif "df2" not in localVars:
localVars["df2"] = df
# Execute the visualization code
exec(vizCode, globals(), localVars)
# Capture the image
imgData = self._getImageBase64(formatType)
plt.close()
return self.formatAgentDocumentOutput(outputLabel, imgData, f"image/{formatType}")
except Exception as e:
logger.error(f"Error creating visualization: {str(e)}", exc_info=True)
# Create error message image
plt.figure(figsize=(10, 6))
plt.text(0.5, 0.5, f"Visualization error: {str(e)}",
ha='center', va='center', fontsize=12)
plt.tight_layout()
imgData = self._getImageBase64(formatType)
plt.close()
return self.formatAgentDocumentOutput(outputLabel, imgData, f"image/{formatType}")
except Exception as e:
logger.error(f"Error creating visualization: {str(e)}", exc_info=True)
# Create error message image
plt.figure(figsize=(10, 6))
plt.text(0.5, 0.5, f"Visualization error: {str(e)}",
ha='center', va='center', fontsize=12)
plt.tight_layout()
imgData = self._getImageBase64(formatType)
plt.close()
return self.formatAgentDocumentOutput(outputLabel, imgData, f"image/{formatType}")
async def _createDataDocument(self, datasets: Dict, prompt: str, outputLabel: str,
analysisPlan: Dict, description: str) -> ChatContent:
"""
Create a data document (CSV, JSON, Excel) from analysis results.
Args:
datasets: Dictionary of datasets
prompt: Original task prompt
outputLabel: Output filename
analysisPlan: Analysis plan
description: Output description
Returns:
ChatContent object
"""
try:
# Determine format from filename
formatType = outputLabel.split('.')[-1].lower() if '.' in outputLabel else "csv"
# Process data based on format
if formatType == "csv":
result = self._convertToCsv(datasets)
elif formatType == "json":
result = json.dumps(datasets, indent=2)
elif formatType == "xlsx":
result = self._convertToExcel(datasets)
else:
result = str(datasets)
# Determine content type
contentType = "text/csv" if formatType == "csv" else \
"application/json" if formatType == "json" else \
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" if formatType == "xlsx" else \
"text/plain"
return self.formatAgentDocumentOutput(outputLabel, result, contentType)
except Exception as e:
logger.error(f"Error creating data document: {str(e)}", exc_info=True)
errorContent = f"Error generating {formatType} document: {str(e)}"
return self.formatAgentDocumentOutput(outputLabel, errorContent, "text/plain")
async def _createTextDocument(self, datasets: Dict, context: str, prompt: str,
outputLabel: str, formatType: str,
analysisPlan: Dict, description: str) -> ChatContent:
"""
Create a text document (markdown, HTML, text) from analysis results.
Args:
datasets: Dictionary of datasets
context: Document context
prompt: Original task prompt
outputLabel: Output filename
formatType: Output format
analysisPlan: Analysis plan
description: Output description
Returns:
ChatContent object
"""
try:
# Generate dataset summaries
datasetSummaries = []
for name, df in datasets.items():
summary = f"\nDataset: {name}\n"
summary += f"Shape: {df.shape}\n"
summary += f"Columns: {', '.join(df.columns)}\n"
if not df.empty:
summary += f"Sample data:\n{df.head(3).to_string()}\n"
datasetSummaries.append(summary)
# Generate analysis prompt
analysisPrompt = f"""
Create a detailed {formatType} document for:
TASK: {prompt}
OUTPUT REQUIREMENTS:
- Format: {formatType}
- Filename: {outputLabel}
- Description: {description}
ANALYSIS CONTEXT:
{json.dumps(analysisPlan, indent=2)}
DATASET SUMMARIES:
{"".join(datasetSummaries)}
DOCUMENT CONTEXT:
{context[:2000]}... (truncated)
Create a comprehensive, professional analysis document that addresses the task requirements.
The document should:
1. Have a clear structure with headings and sections
2. Include relevant data findings and insights
3. Provide appropriate interpretations and recommendations
4. Format the content according to the required output format
Your response should be the complete document content in the specified format.
"""
# Get document content from AI
documentContent = await self.service.base.callAi([
{"role": "system", "content": f"You are a data analysis expert creating a {formatType} document."},
{"role": "user", "content": analysisPrompt}
], produceUserAnswer = True)
# Clean HTML or Markdown if needed
if formatType in ["md", "markdown"] and not documentContent.strip().startswith("#"):
documentContent = f"# Analysis Report\n\n{documentContent}"
elif formatType == "html" and not "<html" in documentContent.lower():
documentContent = f"<html><body>{documentContent}</body></html>"
# Determine content type
contentType = "text/markdown" if formatType in ["md", "markdown"] else \
"text/html" if formatType == "html" else \
"text/plain"
return self.formatAgentDocumentOutput(outputLabel, documentContent, contentType)
except Exception as e:
logger.error(f"Error creating text document: {str(e)}", exc_info=True)
# Create a simple error document
if formatType in ["md", "markdown"]:
content = f"# Error in Analysis\n\nThere was an error generating the analysis: {str(e)}"
elif formatType == "html":
content = f"<html><body><h1>Error in Analysis</h1><p>There was an error generating the analysis: {str(e)}</p></body></html>"
else:
content = f"Error in Analysis\n\nThere was an error generating the analysis: {str(e)}"
return self.formatAgentDocumentOutput(outputLabel, content, contentType)
def _getImageBase64(self, formatType: str = 'png') -> str:
"""
Convert current matplotlib figure to base64 string.
Args:
formatType: Image format
Returns:
Base64 encoded string of the image
"""
buffer = io.BytesIO()
plt.savefig(buffer, format=formatType, dpi=100)
buffer.seek(0)
imageData = buffer.getvalue()
buffer.close()
# Convert to base64
return base64.b64encode(imageData).decode('utf-8')
async def _analyzeData(self, task: Dict[str, Any], analysisPlan: Dict[str, Any]) -> Dict[str, Any]:
"""
Analyze data based on the analysis plan.
Args:
task: Task dictionary with input documents and specifications
analysisPlan: Analysis plan from _createAnalysisPlan
Returns:
Analysis results dictionary
"""
try:
# Extract data from input documents
inputDocuments = task.get("inputDocuments", [])
datasets, documentContext = self._extractData(inputDocuments)
# Get task information
prompt = task.get("prompt", "")
outputSpecs = task.get("outputSpecifications", [])
# Analyze task requirements
analysisResults = await self._analyzeTask(prompt, documentContext, datasets, outputSpecs)
# Add datasets and context to results
analysisResults["datasets"] = datasets
analysisResults["documentContext"] = documentContext
return analysisResults
except Exception as e:
logger.error(f"Error analyzing data: {str(e)}", exc_info=True)
return {
"error": str(e),
"datasets": {},
"documentContext": ""
}
async def _createOutputDocuments(self, prompt: str, analysisResults: Dict[str, Any],
outputSpecs: List[Dict[str, Any]], analysisPlan: Dict[str, Any]) -> List[Dict[str, Any]]:
"""
Create output documents based on analysis results.
Args:
prompt: Original task prompt
analysisResults: Results from data analysis
outputSpecs: List of output specifications
analysisPlan: Analysis plan from _createAnalysisPlan
Returns:
List of document objects
"""
documents = []
datasets = analysisResults.get("datasets", {})
documentContext = analysisResults.get("documentContext", "")
# Process each output specification
for spec in outputSpecs:
outputLabel = spec.get("label", "")
outputDescription = spec.get("description", "")
# Determine format from filename
formatType = outputLabel.split('.')[-1].lower() if '.' in outputLabel else "txt"
try:
# Create appropriate document based on format
if formatType in ["png", "jpg", "jpeg", "svg"]:
# Visualization output
document = await self._createVisualization(
datasets, prompt, outputLabel, analysisPlan, outputDescription
)
elif formatType in ["csv", "json", "xlsx"]:
# Data document output
document = await self._createDataDocument(
datasets, prompt, outputLabel, analysisPlan, outputDescription
)
else:
# Text document output (markdown, html, text)
document = await self._createTextDocument(
datasets, documentContext, prompt, outputLabel, formatType,
analysisPlan, outputDescription
)
documents.append(document)
except Exception as e:
logger.error(f"Error creating output document {outputLabel}: {str(e)}", exc_info=True)
# Create error document
errorDoc = self.formatAgentDocumentOutput(
outputLabel,
f"Error creating document: {str(e)}",
"text/plain"
)
documents.append(errorDoc)
return documents
# Factory function for the Analyst agent
def getAgentAnalyst():
"""Returns an instance of the Analyst agent."""
return AgentAnalyst()