diff --git a/config.ini b/config.ini index b59db403..dddd934f 100644 --- a/config.ini +++ b/config.ini @@ -32,6 +32,9 @@ Security_PASSWORD_REQUIRE_SPECIAL = True Security_FAILED_LOGIN_LIMIT = 5 Security_LOCK_DURATION_MINUTES = 30 +# Content Neutralization configuration +Content_Neutralization_ENABLED = False + # Agent Webcrawler configuration Agent_Webcrawler_SERPAPI_ENGINE = google Agent_Webcrawler_SERPAPI_APIKEY = 7304bd34bca767aa52dd3233297e30a9edc0abc57871f702b3f8238b9d3ee7bc diff --git a/modules/agents/agentAnalyst.py b/modules/agents/agentAnalyst.py deleted file mode 100644 index a8bc1637..00000000 --- a/modules/agents/agentAnalyst.py +++ /dev/null @@ -1,1075 +0,0 @@ -""" -Data analyst agent for analysis and interpretation of data. -Focuses on output-first design with AI-powered analysis. -""" - -import logging -import json -import io -import base64 -import os -import time -from typing import Dict, Any, List, Optional -import pandas as pd -import matplotlib.pyplot as plt -import seaborn as sns -from datetime import datetime, UTC -import hashlib -import uuid -import re -import shutil -from pathlib import Path -import traceback -import sys -import importlib.util -import inspect -from pydantic import BaseModel - -from modules.workflow.agentBase import AgentBase -from modules.interfaces.serviceChatModel import ( - ChatContent, - ChatMessage, - ChatStat, - AgentResponse, - AgentHandover -) - -logger = logging.getLogger(__name__) - -class AgentAnalyst(AgentBase): - """AI-driven agent for data analysis and visualization""" - - def __init__(self): - """Initialize the data analysis agent""" - super().__init__() - self.name = "analyst" - self.label = "Data Analysis" - self.description = "Analyzes data using AI-powered insights and visualizations, produce diagrams and visualizations" - self.capabilities = [ - "dataAnalysis", - "statistics", - "visualization", - "dataInterpretation", - "reportGeneration" - ] - - # Set default visualization settings - plt.style.use('seaborn-v0_8-whitegrid') - - def setDependencies(self, serviceBase=None): - """Set external dependencies for the agent.""" - self.setService(serviceBase) - - async def processTask(self, handover: AgentHandover) -> AgentResponse: - """ - Process a task by focusing on required outputs and using AI to guide the analysis process. - - Args: - handover: AgentHandover object containing task information - - Returns: - AgentResponse object with execution results - """ - try: - # 1. Initial Analysis & Planning - self.service.logAdd(handover.workflowId, "Starting analysis task...", level="info", progress=10) - - # Generate extraction prompts for each file - extraction_prompts = await self._generateExtractionPrompts( - prompt=handover.promptUserInitial, - documents=handover.documentsUserInitial - ) - - # 2. Parallel Content Extraction with specific prompts - self.service.logAdd(handover.workflowId, "Extracting content from documents...", level="info", progress=20) - - extracted_contents = [] - for doc, extraction_prompt in zip(handover.documentsUserInitial, extraction_prompts): - # Use document service for extraction with specific prompt - content_result = await self.service.document.contentWithPrompt(doc, extraction_prompt) - if content_result: - extracted_contents.append({ - "document": doc, - "content": content_result["content"], - "metadata": content_result["metadata"], - "extraction_prompt": extraction_prompt - }) - - # 3. Analysis & Reflection - self.service.logAdd(handover.workflowId, "Analyzing extracted content...", level="info", progress=50) - - analysis_results = await self._analyzeContent( - prompt=handover.promptUserInitial, - extracted_contents=extracted_contents - ) - - # 4. Response Generation & Handover Update - self.service.logAdd(handover.workflowId, "Generating response...", level="info", progress=80) - - # Create ChatMessage with results - response_message = ChatMessage( - id=str(uuid.uuid4()), - workflowId=handover.workflowId, - agentName=self.name, - message=analysis_results.get("feedback", ""), - role="assistant", - status="completed", - sequenceNr=handover.sequenceNr, - startedAt=handover.startedAt, - finishedAt=datetime.now(UTC).isoformat(), - success=True, - documents=analysis_results.get("documents", []), - stats=ChatStat( - processingTime=analysis_results.get("processing_time"), - tokenCount=analysis_results.get("token_count"), - successRate=1.0 - ) - ) - - # Update handover object - handover.status = "success" - handover.progress = 100.0 - handover.finishedAt = datetime.now(UTC).isoformat() - handover.documentsOutput = analysis_results.get("documents", []) - handover.promptFromFinishedAgent = analysis_results.get("feedback", "") - - return AgentResponse( - success=True, - message=response_message, - performance=analysis_results.get("performance", {}), - progress=100.0 - ) - - except Exception as e: - logger.error(f"Error in analysis task: {str(e)}", exc_info=True) - - # Create error response - error_message = ChatMessage( - id=str(uuid.uuid4()), - workflowId=handover.workflowId, - agentName=self.name, - message=f"Error during analysis: {str(e)}", - role="system", - status="error", - sequenceNr=handover.sequenceNr, - startedAt=handover.startedAt, - finishedAt=datetime.now(UTC).isoformat(), - success=False - ) - - # Update handover with error - handover.status = "failed" - handover.error = str(e) - handover.finishedAt = datetime.now(UTC).isoformat() - - return AgentResponse( - success=False, - message=error_message, - performance={}, - progress=0.0 - ) - - async def _generateExtractionPrompts(self, prompt: str, documents: List[Dict[str, Any]]) -> List[str]: - """ - Generate specific extraction prompts for each document. - - Args: - prompt: The original user prompt - documents: List of documents to process - - Returns: - List of extraction prompts, one for each document - """ - try: - # Create prompt for AI to generate extraction prompts - prompt_generation = f""" - Generate specific extraction prompts for each document based on the user's request. - - USER REQUEST: {prompt} - - DOCUMENTS: - {json.dumps([{ - "name": doc.get("name", ""), - "type": doc.get("type", ""), - "size": doc.get("size", 0) - } for doc in documents], indent=2)} - - For each document, generate a specific extraction prompt that will help extract the most relevant information. - Consider: - 1. The document type and format - 2. The user's original request - 3. What specific information would be most useful - - Return a JSON array of prompts, one for each document: - [ - {{ - "document_name": "name of the document", - "extraction_prompt": "specific prompt for this document" - }} - ] - """ - - # Get AI's response - response = await self.service.base.callAi([ - {"role": "system", "content": "You are an expert at creating precise document extraction prompts."}, - {"role": "user", "content": prompt_generation} - ]) - - # Parse response - prompts_data = json.loads(response) - - # Map prompts to documents - extraction_prompts = [] - for doc in documents: - doc_prompt = next( - (p["extraction_prompt"] for p in prompts_data if p["document_name"] == doc.get("name")), - f"Extract all relevant information from {doc.get('name')} that relates to: {prompt}" - ) - extraction_prompts.append(doc_prompt) - - return extraction_prompts - - except Exception as e: - logger.error(f"Error generating extraction prompts: {str(e)}") - # Fallback to generic prompts - return [f"Extract all relevant information from {doc.get('name')} that relates to: {prompt}" - for doc in documents] - - async def _analyzeContent(self, prompt: str, extracted_contents: List[Dict[str, Any]]) -> Dict[str, Any]: - """ - Analyze the extracted content and generate results. - - Args: - prompt: The original user prompt - extracted_contents: List of extracted content with metadata - - Returns: - Dictionary containing analysis results - """ - try: - # Create analysis prompt - analysis_prompt = f""" - Analyze the following extracted content and provide insights based on the user's request. - - USER REQUEST: {prompt} - - EXTRACTED CONTENT: - {json.dumps([{ - "document": content["document"].get("name", ""), - "content": content["content"], - "extraction_prompt": content["extraction_prompt"] - } for content in extracted_contents], indent=2)} - - Provide a comprehensive analysis that: - 1. Synthesizes information from all documents - 2. Identifies key insights and patterns - 3. Relates findings to the user's request - 4. Suggests potential visualizations or additional analysis - - Format your response as a JSON object with: - {{ - "insights": ["list of key insights"], - "patterns": ["list of identified patterns"], - "recommendations": ["list of recommendations"], - "visualizations": ["list of suggested visualizations"], - "feedback": "summary of findings" - }} - """ - - # Get AI's analysis - response = await self.service.base.callAi([ - {"role": "system", "content": "You are an expert data analyst."}, - {"role": "user", "content": analysis_prompt} - ]) - - # Parse and return results - return json.loads(response) - - except Exception as e: - logger.error(f"Error analyzing content: {str(e)}") - return { - "insights": [], - "patterns": [], - "recommendations": [], - "visualizations": [], - "feedback": f"Error during analysis: {str(e)}" - } - - def _extractData(self, documents: List[Dict[str, Any]]) -> tuple: - """ - Extract data from documents, focusing on dataExtracted fields. - - Args: - documents: List of input documents - - Returns: - Tuple of (datasets dictionary, document context text) - """ - datasets = {} - documentContext = "" - - # Process each document - for doc in documents: - docName = doc.get("name", "unnamed") - if doc.get("ext"): - docName = f"{docName}.{doc.get('ext')}" - - documentContext += f"\n\n--- {docName} ---\n" - - # Process contents - for content in doc.get("contents", []): - # Focus only on dataExtracted - if content.get("dataExtracted"): - extractedText = content.get("dataExtracted", "") - documentContext += extractedText - - # Try to parse as structured data if appropriate - if docName.lower().endswith(('.csv', '.tsv')): - try: - df = pd.read_csv(io.StringIO(extractedText)) - datasets[docName] = df - except: - pass - elif docName.lower().endswith('.json'): - try: - jsonData = json.loads(extractedText) - if isinstance(jsonData, list): - df = pd.DataFrame(jsonData) - datasets[docName] = df - elif isinstance(jsonData, dict): - # Handle nested JSON structures - if any(isinstance(v, list) for v in jsonData.values()): - for key, value in jsonData.items(): - if isinstance(value, list) and len(value) > 0: - df = pd.DataFrame(value) - datasets[f"{docName}:{key}"] = df - else: - df = pd.DataFrame([jsonData]) - datasets[docName] = df - except: - pass - - # Try to detect tabular data in text content - if docName not in datasets and len(extractedText.splitlines()) > 2: - lines = extractedText.splitlines() - if any(',' in line for line in lines[:5]): - try: - df = pd.read_csv(io.StringIO(extractedText)) - if len(df.columns) > 1: - datasets[docName] = df - except: - pass - elif any('\t' in line for line in lines[:5]): - try: - df = pd.read_csv(io.StringIO(extractedText), sep='\t') - if len(df.columns) > 1: - datasets[docName] = df - except: - pass - - return datasets, documentContext - - async def _analyzeTask(self, prompt: str, documentContext: str, datasets: Dict[str, Any], outputSpecs: List[Dict[str, Any]]) -> Dict[str, Any]: - """ - Analyze the task requirements using AI. - - Args: - prompt: The task prompt - documentContext: Context from input documents - datasets: Available datasets - outputSpecs: Output specifications - - Returns: - Analysis plan dictionary - """ - # Create analysis prompt - analysisPrompt = f""" - Analyze this data analysis task and create a detailed plan: - - TASK: {prompt} - - DOCUMENT CONTEXT: - {documentContext} - - AVAILABLE DATASETS: - {json.dumps(datasets, indent=2)} - - REQUIRED OUTPUTS: - {json.dumps(outputSpecs, indent=2)} - - Create a detailed analysis plan in JSON format with: - {{ - "analysisSteps": [ - {{ - "step": "step description", - "purpose": "why this step is needed", - "datasets": ["dataset1", "dataset2"], - "techniques": ["technique1", "technique2"], - "outputs": ["output1", "output2"] - }} - ], - "visualizations": [ - {{ - "type": "visualization type", - "purpose": "what it shows", - "datasets": ["dataset1"], - "settings": {{"key": "value"}} - }} - ], - "insights": [ - {{ - "type": "insight type", - "description": "what to look for", - "datasets": ["dataset1"] - }} - ], - "feedback": "explanation of the analysis approach" - }} - - Respond with ONLY the JSON object, no additional text or explanations. - """ - - try: - # Get analysis plan from AI - response = await self.service.base.callAi([ - {"role": "system", "content": "You are a data analysis expert. Create detailed analysis plans. Respond with valid JSON only."}, - {"role": "user", "content": analysisPrompt} - ], produceUserAnswer=True) - - # Extract JSON - jsonStart = response.find('{') - jsonEnd = response.rfind('}') + 1 - - if jsonStart >= 0 and jsonEnd > jsonStart: - plan = json.loads(response[jsonStart:jsonEnd]) - return plan - else: - # Fallback plan - logger.warning(f"Not able creating analysis plan, generating fallback plan") - return { - "analysisSteps": [ - { - "step": "Basic data analysis", - "purpose": "Understand the data structure and content", - "datasets": list(datasets.keys()), - "techniques": ["summary statistics", "data visualization"], - "outputs": ["summary report", "basic visualizations"] - } - ], - "visualizations": [ - { - "type": "basic charts", - "purpose": "Show data distribution and relationships", - "datasets": list(datasets.keys()), - "settings": {} - } - ], - "insights": [ - { - "type": "basic insights", - "description": "Key findings from the data", - "datasets": list(datasets.keys()) - } - ], - "feedback": f"I'll analyze the data and provide insights about {prompt}" - } - - except Exception as e: - logger.warning(f"Error creating analysis plan: {str(e)}") - # Simple fallback plan - return { - "analysisSteps": [ - { - "step": "Basic data analysis", - "purpose": "Understand the data structure and content", - "datasets": list(datasets.keys()), - "techniques": ["summary statistics", "data visualization"], - "outputs": ["summary report", "basic visualizations"] - } - ], - "visualizations": [ - { - "type": "basic charts", - "purpose": "Show data distribution and relationships", - "datasets": list(datasets.keys()), - "settings": {} - } - ], - "insights": [ - { - "type": "basic insights", - "description": "Key findings from the data", - "datasets": list(datasets.keys()) - } - ], - "feedback": f"I'll analyze the data and provide insights about {prompt}" - } - - async def _createAnalysisPlan(self, prompt: str) -> Dict[str, Any]: - """ - Create an analysis plan based on the task prompt. - - Args: - prompt: The task prompt - - Returns: - Analysis plan dictionary - """ - try: - # Create analysis prompt - analysisPrompt = f""" - Analyze this data analysis task and create a detailed plan: - - TASK: {prompt} - - Create a detailed analysis plan in JSON format with: - {{ - "requiresAnalysis": true/false, - "analysisSteps": [ - {{ - "step": "step description", - "purpose": "why this step is needed", - "techniques": ["technique1", "technique2"], - "outputs": ["output1", "output2"] - }} - ], - "visualizations": [ - {{ - "type": "visualization type", - "purpose": "what it shows", - "settings": {{"key": "value"}} - }} - ], - "insights": [ - {{ - "type": "insight type", - "description": "what to look for" - }} - ], - "feedback": "explanation of the analysis approach" - }} - - Respond with ONLY the JSON object, no additional text or explanations. - """ - - # Get analysis plan from AI - response = await self.service.base.callAi([ - {"role": "system", "content": "You are a data analysis expert. Create detailed analysis plans. Respond with valid JSON only."}, - {"role": "user", "content": analysisPrompt} - ], produceUserAnswer=True) - - # Extract JSON - jsonStart = response.find('{') - jsonEnd = response.rfind('}') + 1 - - if jsonStart >= 0 and jsonEnd > jsonStart: - plan = json.loads(response[jsonStart:jsonEnd]) - return plan - else: - # Fallback plan - logger.warning(f"Not able creating analysis plan, generating fallback plan") - return { - "requiresAnalysis": True, - "analysisSteps": [ - { - "step": "Basic data analysis", - "purpose": "Understand the data structure and content", - "techniques": ["summary statistics", "data visualization"], - "outputs": ["summary report", "basic visualizations"] - } - ], - "visualizations": [ - { - "type": "basic charts", - "purpose": "Show data distribution and relationships", - "settings": {} - } - ], - "insights": [ - { - "type": "basic insights", - "description": "Key findings from the data" - } - ], - "feedback": f"I'll analyze the data and provide insights about {prompt}" - } - - except Exception as e: - logger.warning(f"Error creating analysis plan: {str(e)}") - # Simple fallback plan - return { - "requiresAnalysis": True, - "analysisSteps": [ - { - "step": "Basic data analysis", - "purpose": "Understand the data structure and content", - "techniques": ["summary statistics", "data visualization"], - "outputs": ["summary report", "basic visualizations"] - } - ], - "visualizations": [ - { - "type": "basic charts", - "purpose": "Show data distribution and relationships", - "settings": {} - } - ], - "insights": [ - { - "type": "basic insights", - "description": "Key findings from the data" - } - ], - "feedback": f"I'll analyze the data and provide insights about {prompt}" - } - - async def _createVisualization(self, datasets: Dict, prompt: str, outputLabel: str, - analysisPlan: Dict, description: str) -> Dict: - """ - Create a visualization based on the analysis plan. - - Args: - datasets: Dictionary of datasets - prompt: Original task prompt - outputLabel: Output file label - analysisPlan: Analysis plan - description: Output description - - Returns: - Document dictionary with visualization - """ - try: - # Get visualization recommendations - vizRecommendations = analysisPlan.get("visualizations", []) - - if not vizRecommendations: - # Generate visualization recommendations if none provided - self.service.base.logAdd(analysisPlan.get("workflowId"), "Generating visualization recommendations...", level="info", progress=50) - vizPrompt = f""" - Based on this data and task, recommend appropriate visualizations. - - TASK: {prompt} - DESCRIPTION: {description} - - DATASETS: - {json.dumps({name: {"shape": df.shape, "columns": df.columns.tolist()} - for name, df in datasets.items()}, indent=2)} - - Recommend visualizations in JSON format: - {{ - "visualizations": [ - {{ - "type": "chart_type", - "dataSource": "dataset_name", - "variables": ["col1", "col2"], - "purpose": "explanation" - }} - ] - }} - """ - - response = await self.service.base.callAi([ - {"role": "system", "content": "You are a data visualization expert. Recommend appropriate visualizations based on the data and task."}, - {"role": "user", "content": vizPrompt} - ]) - - # Extract JSON - jsonStart = response.find('{') - jsonEnd = response.rfind('}') + 1 - - if jsonStart >= 0 and jsonEnd > jsonStart: - vizData = json.loads(response[jsonStart:jsonEnd]) - vizRecommendations = vizData.get("visualizations", []) - - # Determine format from filename - formatType = outputLabel.split('.')[-1].lower() - if formatType not in ['png', 'jpg', 'jpeg', 'svg']: - formatType = 'png' - - # If no datasets available, create error message image - if not datasets: - plt.figure(figsize=(10, 6)) - plt.text(0.5, 0.5, "No data available for visualization", - ha='center', va='center', fontsize=14) - plt.tight_layout() - imgData = self._getImageBase64(formatType) - plt.close() - - return { - "label": outputLabel, - "content": imgData, - "metadata": { - "contentType": f"image/{formatType}" - } - } - - # Prepare dataset info for the first dataset if none specified - if not vizRecommendations and datasets: - name, df = next(iter(datasets.items())) - vizRecommendations = [{ - "type": "auto", - "dataSource": name, - "variables": df.columns.tolist()[:5], - "purpose": "general analysis" - }] - - # Create visualization code prompt - vizPrompt = f""" - Generate Python matplotlib/seaborn code to create a visualization for: - - TASK: {prompt} - - VISUALIZATION REQUIREMENTS: - - Output format: {formatType} - - Filename: {outputLabel} - - Description: {description} - - RECOMMENDED VISUALIZATION: - {json.dumps(vizRecommendations, indent=2)} - - AVAILABLE DATASETS: - """ - - # Add dataset info for recommended sources - for viz in vizRecommendations: - dataSource = viz.get("dataSource") - if dataSource in datasets: - df = datasets[dataSource] - vizPrompt += f"\nDataset '{dataSource}':\n" - vizPrompt += f"- Shape: {df.shape}\n" - vizPrompt += f"- Columns: {df.columns.tolist()}\n" - vizPrompt += f"- Sample data: {df.head(3).to_dict(orient='records')}\n" - - vizPrompt += """ - Generate ONLY Python code that: - 1. Uses matplotlib and/or seaborn to create a clear visualization - 2. Sets figure size to (10, 6) - 3. Includes appropriate titles, labels, and legend - 4. Uses professional color schemes - 5. Handles any missing data gracefully - - Return ONLY executable Python code, no explanations or markdown. - """ - - try: - # Get visualization code from AI - vizCode = await self.service.base.callAi([ - {"role": "system", "content": "You are a data visualization expert. Provide only executable Python code."}, - {"role": "user", "content": vizPrompt} - ], produceUserAnswer = True) - - # Clean code - vizCode = vizCode.replace("```python", "").replace("```", "").strip() - - # Execute visualization code - plt.figure(figsize=(10, 6)) - - # Make local variables available to the code - localVars = { - "plt": plt, - "sns": sns, - "pd": pd, - "np": __import__('numpy') - } - - # Add datasets to local variables - for name, df in datasets.items(): - # Create a sanitized variable name - varName = ''.join(c if c.isalnum() else '_' for c in name) - localVars[varName] = df - - # Also add with standard names for simpler code - if "df" not in localVars: - localVars["df"] = df - elif "df2" not in localVars: - localVars["df2"] = df - - # Execute the visualization code - exec(vizCode, globals(), localVars) - - # Capture the image - imgData = self._getImageBase64(formatType) - plt.close() - - return self.formatAgentDocumentOutput(outputLabel, imgData, f"image/{formatType}") - - except Exception as e: - logger.error(f"Error creating visualization: {str(e)}", exc_info=True) - - # Create error message image - plt.figure(figsize=(10, 6)) - plt.text(0.5, 0.5, f"Visualization error: {str(e)}", - ha='center', va='center', fontsize=12) - plt.tight_layout() - imgData = self._getImageBase64(formatType) - plt.close() - - return self.formatAgentDocumentOutput(outputLabel, imgData, f"image/{formatType}") - - except Exception as e: - logger.error(f"Error creating visualization: {str(e)}", exc_info=True) - - # Create error message image - plt.figure(figsize=(10, 6)) - plt.text(0.5, 0.5, f"Visualization error: {str(e)}", - ha='center', va='center', fontsize=12) - plt.tight_layout() - imgData = self._getImageBase64(formatType) - plt.close() - - return self.formatAgentDocumentOutput(outputLabel, imgData, f"image/{formatType}") - - async def _createDataDocument(self, datasets: Dict, prompt: str, outputLabel: str, - analysisPlan: Dict, description: str) -> ChatContent: - """ - Create a data document (CSV, JSON, Excel) from analysis results. - - Args: - datasets: Dictionary of datasets - prompt: Original task prompt - outputLabel: Output filename - analysisPlan: Analysis plan - description: Output description - - Returns: - ChatContent object - """ - try: - # Determine format from filename - formatType = outputLabel.split('.')[-1].lower() if '.' in outputLabel else "csv" - - # Process data based on format - if formatType == "csv": - result = self._convertToCsv(datasets) - elif formatType == "json": - result = json.dumps(datasets, indent=2) - elif formatType == "xlsx": - result = self._convertToExcel(datasets) - else: - result = str(datasets) - - # Determine content type - contentType = "text/csv" if formatType == "csv" else \ - "application/json" if formatType == "json" else \ - "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" if formatType == "xlsx" else \ - "text/plain" - - return self.formatAgentDocumentOutput(outputLabel, result, contentType) - - except Exception as e: - logger.error(f"Error creating data document: {str(e)}", exc_info=True) - - errorContent = f"Error generating {formatType} document: {str(e)}" - return self.formatAgentDocumentOutput(outputLabel, errorContent, "text/plain") - - async def _createTextDocument(self, datasets: Dict, context: str, prompt: str, - outputLabel: str, formatType: str, - analysisPlan: Dict, description: str) -> ChatContent: - """ - Create a text document (markdown, HTML, text) from analysis results. - - Args: - datasets: Dictionary of datasets - context: Document context - prompt: Original task prompt - outputLabel: Output filename - formatType: Output format - analysisPlan: Analysis plan - description: Output description - - Returns: - ChatContent object - """ - try: - # Generate dataset summaries - datasetSummaries = [] - for name, df in datasets.items(): - summary = f"\nDataset: {name}\n" - summary += f"Shape: {df.shape}\n" - summary += f"Columns: {', '.join(df.columns)}\n" - if not df.empty: - summary += f"Sample data:\n{df.head(3).to_string()}\n" - datasetSummaries.append(summary) - - # Generate analysis prompt - analysisPrompt = f""" - Create a detailed {formatType} document for: - - TASK: {prompt} - - OUTPUT REQUIREMENTS: - - Format: {formatType} - - Filename: {outputLabel} - - Description: {description} - - ANALYSIS CONTEXT: - {json.dumps(analysisPlan, indent=2)} - - DATASET SUMMARIES: - {"".join(datasetSummaries)} - - DOCUMENT CONTEXT: - {context[:2000]}... (truncated) - - Create a comprehensive, professional analysis document that addresses the task requirements. - The document should: - 1. Have a clear structure with headings and sections - 2. Include relevant data findings and insights - 3. Provide appropriate interpretations and recommendations - 4. Format the content according to the required output format - - Your response should be the complete document content in the specified format. - """ - - # Get document content from AI - documentContent = await self.service.base.callAi([ - {"role": "system", "content": f"You are a data analysis expert creating a {formatType} document."}, - {"role": "user", "content": analysisPrompt} - ], produceUserAnswer = True) - - # Clean HTML or Markdown if needed - if formatType in ["md", "markdown"] and not documentContent.strip().startswith("#"): - documentContent = f"# Analysis Report\n\n{documentContent}" - elif formatType == "html" and not "
{documentContent}" - - # Determine content type - contentType = "text/markdown" if formatType in ["md", "markdown"] else \ - "text/html" if formatType == "html" else \ - "text/plain" - - return self.formatAgentDocumentOutput(outputLabel, documentContent, contentType) - - except Exception as e: - logger.error(f"Error creating text document: {str(e)}", exc_info=True) - - # Create a simple error document - if formatType in ["md", "markdown"]: - content = f"# Error in Analysis\n\nThere was an error generating the analysis: {str(e)}" - elif formatType == "html": - content = f"There was an error generating the analysis: {str(e)}
" - else: - content = f"Error in Analysis\n\nThere was an error generating the analysis: {str(e)}" - - return self.formatAgentDocumentOutput(outputLabel, content, contentType) - - def _getImageBase64(self, formatType: str = 'png') -> str: - """ - Convert current matplotlib figure to base64 string. - - Args: - formatType: Image format - - Returns: - Base64 encoded string of the image - """ - buffer = io.BytesIO() - plt.savefig(buffer, format=formatType, dpi=100) - buffer.seek(0) - imageData = buffer.getvalue() - buffer.close() - - # Convert to base64 - return base64.b64encode(imageData).decode('utf-8') - - async def _analyzeData(self, task: Dict[str, Any], analysisPlan: Dict[str, Any]) -> Dict[str, Any]: - """ - Analyze data based on the analysis plan. - - Args: - task: Task dictionary with input documents and specifications - analysisPlan: Analysis plan from _createAnalysisPlan - - Returns: - Analysis results dictionary - """ - try: - # Extract data from input documents - inputDocuments = task.get("inputDocuments", []) - datasets, documentContext = self._extractData(inputDocuments) - - # Get task information - prompt = task.get("prompt", "") - outputSpecs = task.get("outputSpecifications", []) - - # Analyze task requirements - analysisResults = await self._analyzeTask(prompt, documentContext, datasets, outputSpecs) - - # Add datasets and context to results - analysisResults["datasets"] = datasets - analysisResults["documentContext"] = documentContext - - return analysisResults - - except Exception as e: - logger.error(f"Error analyzing data: {str(e)}", exc_info=True) - return { - "error": str(e), - "datasets": {}, - "documentContext": "" - } - - async def _createOutputDocuments(self, prompt: str, analysisResults: Dict[str, Any], - outputSpecs: List[Dict[str, Any]], analysisPlan: Dict[str, Any]) -> List[Dict[str, Any]]: - """ - Create output documents based on analysis results. - - Args: - prompt: Original task prompt - analysisResults: Results from data analysis - outputSpecs: List of output specifications - analysisPlan: Analysis plan from _createAnalysisPlan - - Returns: - List of document objects - """ - documents = [] - datasets = analysisResults.get("datasets", {}) - documentContext = analysisResults.get("documentContext", "") - - # Process each output specification - for spec in outputSpecs: - outputLabel = spec.get("label", "") - outputDescription = spec.get("description", "") - - # Determine format from filename - formatType = outputLabel.split('.')[-1].lower() if '.' in outputLabel else "txt" - - try: - # Create appropriate document based on format - if formatType in ["png", "jpg", "jpeg", "svg"]: - # Visualization output - document = await self._createVisualization( - datasets, prompt, outputLabel, analysisPlan, outputDescription - ) - elif formatType in ["csv", "json", "xlsx"]: - # Data document output - document = await self._createDataDocument( - datasets, prompt, outputLabel, analysisPlan, outputDescription - ) - else: - # Text document output (markdown, html, text) - document = await self._createTextDocument( - datasets, documentContext, prompt, outputLabel, formatType, - analysisPlan, outputDescription - ) - - documents.append(document) - - except Exception as e: - logger.error(f"Error creating output document {outputLabel}: {str(e)}", exc_info=True) - # Create error document - errorDoc = self.formatAgentDocumentOutput( - outputLabel, - f"Error creating document: {str(e)}", - "text/plain" - ) - documents.append(errorDoc) - - return documents - - -# Factory function for the Analyst agent -def getAgentAnalyst(): - """Returns an instance of the Analyst agent.""" - return AgentAnalyst() \ No newline at end of file diff --git a/modules/agents/agentCoach.py b/modules/agents/agentCoach.py deleted file mode 100644 index 799ed8f5..00000000 --- a/modules/agents/agentCoach.py +++ /dev/null @@ -1,380 +0,0 @@ -""" -Coach agent for answering questions and generating structured content. -Provides direct AI-based responses using extracted data from documents. -""" - -import logging -from typing import Dict, Any, List -import json -from datetime import datetime -import uuid - -from modules.workflow.agentBase import AgentBase -from modules.interfaces.serviceChatModel import Task, ChatDocument, ChatContent - -logger = logging.getLogger(__name__) - -class AgentCoach(AgentBase): - """AI-driven agent for answering questions and generating structured content from extracted data""" - - def __init__(self): - """Initialize the coach agent""" - super().__init__() - self.name = "coach" - self.label = "Coach & Assistant" - self.description = "Answers questions, converts and generates content directly from data without complex processing" - self.capabilities = [ - "dataConversion", - "questionAnswering", - "contentGeneration", - "simpleDataFormatting", - "informationSynthesis", - "directResponse", - "imageInterpretation", - "structuredOutput" - ] - - def setDependencies(self, serviceBase=None): - """Set external dependencies for the agent.""" - self.setService(serviceBase) - - async def processTask(self, task: Task) -> Dict[str, Any]: - """ - Process a task by directly using AI to provide answers or content based on extracted data. - - Args: - task: Task object with prompt, inputDocuments, outputSpecifications - - Returns: - Dictionary with feedback and documents - """ - try: - # Extract task information - prompt = task.prompt - inputDocuments = task.filesInput - outputSpecs = task.filesOutput - - # Check AI service - if not self.service or not self.service.base: - return { - "feedback": "The Coach agent requires an AI service to function.", - "documents": [] - } - - # Collect all extracted data from input documents - documentContext = self._collectExtractedData(inputDocuments) - - # Generate task understanding to guide response creation - taskUnderstanding = await self._analyzeTask(prompt, documentContext) - - # Generate documents based on output specifications - documents = [] - - # If no output specs provided, create a default document - if not outputSpecs: - defaultFormat = taskUnderstanding.get("recommendedFormat", "md") - defaultTitle = taskUnderstanding.get("suggestedFilename", "response") - - outputSpecs = [{ - "label": f"{defaultTitle}.{defaultFormat}", - "description": "Response to your request" - }] - - # Process each output specification - for spec in outputSpecs: - outputLabel = spec.get("label", "output.txt") - outputDescription = spec.get("description", "") - - # Determine format based on file extension - outputFormat = outputLabel.split('.')[-1].lower() if '.' in outputLabel else "txt" - - # Generate document based on format - document = await self._generateDocument( - prompt, - documentContext, - outputLabel, - outputFormat, - outputDescription, - taskUnderstanding - ) - - documents.append(document) - - # Generate feedback - feedback = taskUnderstanding.get("feedback", "I've created content based on your request.") - - return { - "feedback": feedback, - "documents": documents - } - - except Exception as e: - logger.error(f"Error in coach processing: {str(e)}", exc_info=True) - return { - "feedback": f"Error while processing your request: {str(e)}", - "documents": [] - } - - def _collectExtractedData(self, documents: List[ChatDocument]) -> str: - """ - Collect extracted data from input documents. - - Args: - documents: List of input documents - - Returns: - Combined extracted data as text - """ - contextParts = [] - - for doc in documents: - docName = doc.name - if doc.ext: - docName = f"{docName}.{doc.ext}" - - contextParts.append(f"\n\n--- {docName} ---\n") - - # Process contents, focusing on dataExtracted field - for content in doc.contents: - if content.data: - contextParts.append(content.data) - - return "\n".join(contextParts) - - async def _analyzeTask(self, prompt: str, context: str) -> Dict: - """ - Use AI to analyze the task and develop an understanding of what's required. - - Args: - prompt: The task prompt - context: Extracted document data - - Returns: - Task understanding dictionary - """ - analysisPrompt = f""" - Analyze this request to determine the best approach for creating a response. - - REQUEST: {prompt} - - EXTRACTED DATA: - {context[:1500]}... (truncated if longer) - - Create a task analysis in JSON format with the following structure: - {{ - "requestType": "question|content|data|report|description", - "recommendedFormat": "md|txt|html|csv|json", - "suggestedFilename": "appropriate_filename_without_extension", - "contentFocus": "brief description of what to focus on", - "feedback": "brief explanation of how you'll approach this request", - "complexity": "simple|moderate|complex" - }} - - Only return valid JSON. No preamble or explanations. - """ - - try: - # Get task understanding from AI - response = await self.service.base.callAi([ - {"role": "system", "content": "You are a task analysis expert. Respond with valid JSON only."}, - {"role": "user", "content": analysisPrompt} - ]) - - # Extract JSON from response - jsonStart = response.find('{') - jsonEnd = response.rfind('}') + 1 - - if jsonStart >= 0 and jsonEnd > jsonStart: - taskUnderstanding = json.loads(response[jsonStart:jsonEnd]) - return taskUnderstanding - else: - # Fallback if JSON not found - return { - "requestType": "content", - "recommendedFormat": "md", - "suggestedFilename": "response", - "contentFocus": "Addressing the main request", - "feedback": "I've created content based on your request and the provided data.", - "complexity": "moderate" - } - - except Exception as e: - logger.warning(f"Error analyzing task: {str(e)}") - return { - "requestType": "content", - "recommendedFormat": "md", - "suggestedFilename": "response", - "contentFocus": "Addressing the main request", - "feedback": "I've created content based on your request and the provided data.", - "complexity": "moderate" - } - - async def _generateDocument(self, prompt: str, context: str, outputLabel: str, - outputFormat: str, description: str, taskUnderstanding: Dict) -> ChatDocument: - """ - Generate a document based on the request and extracted data. - - Args: - prompt: The task prompt - context: Extracted document data - outputLabel: Output filename - outputFormat: Output format (file extension) - description: Output description - taskUnderstanding: Task understanding from analysis - - Returns: - ChatDocument object - """ - # Determine content type based on format - contentType = self._getContentType(outputFormat) - - # Build prompt based on output format - generationPrompt = f""" - Create a response to the following request in {outputFormat} format: - - REQUEST: {prompt} - - EXTRACTED DATA: - {context} - - OUTPUT REQUIREMENTS: - - Filename: {outputLabel} - - Format: {outputFormat} - - Description: {description} - - Focus on: {taskUnderstanding.get("contentFocus", "Addressing the main request")} - - Guidelines: - 1. Create content that directly addresses the request - 2. Use the extracted data to inform your response - 3. Format the output appropriately for {outputFormat} - 4. Be comprehensive but focused - 5. Include appropriate formatting, structure, and organization - - Only return the content. No explanations or additional text. - """ - - try: - # Get content from AI - content = await self.service.base.callAi([ - {"role": "system", "content": f"You are a content generation expert. Create content in {outputFormat} format."}, - {"role": "user", "content": generationPrompt} - ]) - - # Extract content from code blocks if present - content = self._extractFromCodeBlocks(content, outputFormat) - - # Create document object - return ChatDocument( - id=str(uuid.uuid4()), - name=outputLabel.split('.')[0], - ext=outputFormat, - data=content, - contents=[ - ChatContent( - name="main", - data=content, - summary=description, - metadata={"format": outputFormat} - ) - ] - ) - - except Exception as e: - logger.error(f"Error generating document: {str(e)}") - errorContent = self._createErrorContent(str(e), outputFormat) - return ChatDocument( - id=str(uuid.uuid4()), - name=outputLabel.split('.')[0], - ext=outputFormat, - data=errorContent, - contents=[ - ChatContent( - name="error", - data=errorContent, - summary="Error generating content", - metadata={"format": outputFormat, "error": str(e)} - ) - ] - ) - - def _getContentType(self, outputFormat: str) -> str: - """ - Get content type based on format. - - Args: - outputFormat: Output format - - Returns: - Content type - """ - contentTypeMap = { - "md": "text/markdown", - "markdown": "text/markdown", - "html": "text/html", - "txt": "text/plain", - "text": "text/plain", - "json": "application/json", - "csv": "text/csv", - "xlsx": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" - } - - return contentTypeMap.get(outputFormat, "text/plain") - - def _extractFromCodeBlocks(self, content: str, format: str) -> str: - """ - Extract content from code blocks if present. - - Args: - content: Raw content - format: Expected format - - Returns: - Extracted content - """ - # Check for code blocks - codeBlockStart = f"```{format}" - if codeBlockStart in content: - start = content.find(codeBlockStart) + len(codeBlockStart) - end = content.find("```", start) - if end > start: - return content[start:end].strip() - - # Check for generic code blocks - if "```" in content: - start = content.find("```") + 3 - # Skip format identifier if present - if content[start:].strip() and not content[start:start+1].isalnum(): - start = content.find("\n", start) + 1 - end = content.find("```", start) - if end > start: - return content[start:end].strip() - - return content - - def _createErrorContent(self, errorMessage: str, outputFormat: str) -> str: - """ - Create error content in the appropriate format. - - Args: - errorMessage: Error message - outputFormat: Output format - - Returns: - Formatted error content - """ - if outputFormat == "json": - return json.dumps({"error": errorMessage}) - elif outputFormat == "csv": - return f"error\n{errorMessage}" - elif outputFormat in ["md", "markdown"]: - return f"# Error\n\n{errorMessage}" - elif outputFormat == "html": - return f"{errorMessage}
" - else: - return f"Error: {errorMessage}" - - -# Factory function for the Coach agent -def getAgentCoach(): - """Returns an instance of the Coach agent.""" - return AgentCoach() \ No newline at end of file diff --git a/modules/agents/agentCoder.py b/modules/agents/agentCoder.py deleted file mode 100644 index 8cb4d869..00000000 --- a/modules/agents/agentCoder.py +++ /dev/null @@ -1,1039 +0,0 @@ -""" -Coder agent for generating and executing code. -Provides code generation, execution, and improvement capabilities. -""" - -import logging -from typing import Dict, Any, List, Tuple, Optional -import json -import os -import sys -import subprocess -import tempfile -import shutil -import venv -import importlib.util -from datetime import datetime -import uuid - -from modules.workflow.agentBase import AgentBase -from modules.shared.configuration import APP_CONFIG -from modules.interfaces.serviceChatModel import Task, ChatDocument, ChatContent -from modules.shared.attributeUtils import ModelMixin - -logger = logging.getLogger(__name__) - -class AgentCoder(AgentBase): - """Simplified Agent for developing and executing Python code with integrated executor""" - - def __init__(self): - """Initialize the coder agent""" - super().__init__() - self.name = "coder" - self.label = "Developer and Code Executor" - self.description = "Develops and executes Python code for data processing and automation" - self.capabilities = [ - "code_development", - "data_processing", - "file_processing", - "automation", - "code_execution" - ] - - # Executor settings - self.executorTimeout = int(APP_CONFIG.get("Agent_Coder_EXECUTION_TIMEOUT")) # seconds - self.executionRetryLimit = int(APP_CONFIG.get("Agent_Coder_EXECUTION_RETRY")) # max retries - self.tempDir = None - - def setDependencies(self, serviceBase=None): - """Set external dependencies for the agent.""" - self.setService(serviceBase) - - async def processTask(self, task: Task) -> Dict[str, Any]: - """ - Process a task and perform code development/execution. - First checks if the task can be completed without code execution, - then falls back to code generation if needed. - Enhanced to ensure all generated documents are included in output. - - Args: - task: Task object with prompt, inputDocuments, outputSpecifications - - Returns: - Dictionary with feedback and documents - """ - # 1. Extract task information - prompt = task.prompt - inputDocuments = task.filesInput - outputSpecs = task.filesOutput - - # Check if AI service is available - if not self.service or not self.service.base: - logger.error("No AI service configured for the Coder agent") - return { - "feedback": "The Coder agent is not properly configured.", - "documents": [] - } - - # 2. Extract data from documents in separate categories - documentData = [] # For raw file data (for code execution) - contentData = [] # For content data (later use) - contentExtraction = [] # For AI-extracted data (for quick completion) - - for doc in inputDocuments: - # Create proper filename from name and ext - filename = f"{doc.name}.{doc.ext}" if doc.ext else doc.name - - # Add main document data to documentData if it exists - docData = doc.data - if docData: - isBase64 = True # Assume base64 encoded for document data - documentData.append([filename, docData, isBase64]) - - # Process contents for different uses - if doc.contents: - for content in doc.contents: - contentName = content.name - - # For AI-extracted data (quick completion) - if content.data: - contentExtraction.append({ - "filename": filename, - "contentName": contentName, - "contentData": content.data, - "contentType": content.contentType, - "summary": content.summary - }) - - # For raw content data - if content.data: - rawData = content.data - isBase64 = content.metadata.get('base64Encoded', False) if content.metadata else False - contentData.append({ - "filename": filename, - "contentName": contentName, - "data": rawData, - "isBase64": isBase64, - "contentType": content.contentType - }) - - # Also add to documentData for code execution if not already added - if not docData or docData != rawData: - documentData.append([filename, rawData, isBase64]) - - # 3. Check if task can be completed without code execution - quickCompletion = await self._checkQuickCompletion(prompt, contentExtraction, outputSpecs) - - if quickCompletion and quickCompletion.get("complete") == 1: - logger.info("Task completed without code execution") - return { - "feedback": quickCompletion.get("prompt", "Task completed successfully."), - "documents": quickCompletion.get("documents", []) - } - else: - logger.debug(f"Code to generate, no quick check") - - # If quick completion not possible, continue with code generation and execution - logger.info("Generating code to solve the task") - - # 4. Generate code using AI - code, requirements = await self._generateCode(prompt, outputSpecs) - if not code: - return { - "feedback": "Failed to generate code for the task.", - "documents": [] - } - # Store the original code without document data - original_clean_code = code # Save clean code for later use in improvement - - # 5. Replace the placeholder with actual inputFiles data - documentDataJson = repr(documentData) - codeWithData = code.replace("inputFiles = \"=== JSONLOAD ===\"", f"inputFiles = {documentDataJson}") - - # 6. Execute code with retry logic - retryCount = 0 - maxRetries = self.executionRetryLimit - executionHistory = [] - - while retryCount <= maxRetries: - executionResult = self._executeCode(codeWithData, requirements) - executionHistory.append({ - "attempt": retryCount + 1, - "code": codeWithData, - "result": executionResult - }) - - # Check if execution was successful - if executionResult.get("success", False): - logger.info(f"Code execution succeeded on attempt {retryCount + 1}") - break - - # If we've reached max retries, exit the loop - if retryCount >= maxRetries: - logger.info(f"Reached maximum retry limit ({maxRetries}). Giving up.") - break - - # Log the error and attempt to improve the code - error = executionResult.get("error", "Unknown error") - logger.info(f"Execution attempt {retryCount + 1} failed: {error}. Attempting to improve code.") - - # Generate improved code based on error - improvedCode, improvedRequirements = await self._improveCode( - originalCode=original_clean_code, # Use clean code without document data - error=error, - executionResult=executionResult, - attempt=retryCount + 1, - outputSpecs=outputSpecs - ) - - if improvedCode: - # Inject document data into improved code - original_clean_code = improvedCode # Update clean code for next potential improvement - codeWithData = improvedCode.replace("inputFiles = \"=== JSONLOAD ===\"", f"inputFiles = {documentDataJson}") - requirements = improvedRequirements - logger.info(f"Code improved for retry {retryCount + 2}") - else: - logger.warning("Failed to improve code, using original code for retry") - - retryCount += 1 - - # 7. Process results and create output documents - documents = [] - - # Always add the final code document - documents.append(self.formatAgentDocumentOutput("generated_code.py", codeWithData, "text/plain")) - - # Add execution history document - executionHistoryStr = json.dumps(executionHistory, indent=2) - documents.append(self.formatAgentDocumentOutput("execution_history.json", executionHistoryStr, "application/json")) - - # Enhanced result handling: Create documents based on execution results - fixed for proper content extraction - if executionResult.get("success", False): - resultData = executionResult.get("result") - - # Process results from the result dictionary if available - if isinstance(resultData, dict): - # First, create a mapping of expected output labels to their specs - expectedOutputs = {spec.get("label"): spec for spec in outputSpecs} - createdOutputs = set() - - for label, result_item in resultData.items(): - # Check if result follows the expected structure with nested content - if isinstance(result_item, dict) and "content" in result_item: - # Extract values from the properly structured result - content = result_item.get("content", "") # Extract the inner content - base64Encoded = result_item.get("base64Encoded", False) - contentType = result_item.get("contentType", "text/plain") - - # Check if this label matches one of our expected output documents - # If not, but we haven't created all expected outputs yet, try to map it - finalLabel = label - if label not in expectedOutputs and len(expectedOutputs) > 0: - # Find an unused expected output label - for expectedLabel in expectedOutputs: - if expectedLabel not in createdOutputs: - logger.warning(f"Remapping output '{label}' to expected '{expectedLabel}'") - finalLabel = expectedLabel - break - - # Create document by passing only the content to formatAgentDocumentOutput - doc = self.formatAgentDocumentOutput(finalLabel, content, contentType) - - # Override the base64Encoded flag with the value from the result - # This is needed since formatAgentDocumentOutput might determine a different value - if isinstance(base64Encoded, bool): - doc.base64Encoded = base64Encoded - - documents.append(doc) - createdOutputs.add(finalLabel) - logger.info(f"Created document from result: {finalLabel} ({contentType}, base64={base64Encoded})") - else: - # Not properly structured - log warning - logger.warning(f"Skipping improperly formatted result for '{label}'. Results must include 'content' field.") - else: - # Handle non-dictionary results - logger.warning("Execution result is not a dictionary. Creating a single output document.") - doc = self.formatAgentDocumentOutput("result.txt", str(resultData), "text/plain") - documents.append(doc) - - # 8. Return results - return { - "feedback": "Code execution completed successfully." if executionResult.get("success", False) else f"Code execution failed: {executionResult.get('error', 'Unknown error')}", - "documents": documents - } - - async def _improveCode(self, originalCode: str, error: str, executionResult: Dict[str, Any], attempt: int, outputSpecs: List[Dict[str, Any]] = None) -> Tuple[str, List[str]]: - """ - Improve code based on execution error. - Enhanced to maintain proper output handling with correct document structure. - - Args: - originalCode: The code that failed to execute - error: The error message - executionResult: Complete execution result dictionary - attempt: Current attempt number - outputSpecs: List of expected output specifications - - Returns: - Tuple of (improvedCode, requirements) - """ - # Create a string with output specifications to be included in the prompt - outputSpecsStr = "" - if outputSpecs: - outputSpecsStr = "\nEXPECTED OUTPUT DOCUMENTS:\n" - for i, spec in enumerate(outputSpecs, 1): - label = spec.get("label", f"output{i}.txt") - description = spec.get("description", "") - outputSpecsStr += f"{i}. {label} - {description}\n" - - # Create prompt for code improvement - improvementPrompt = f""" -Fix the following Python code that failed during execution. This is attempt {attempt} to fix the code. - -ORIGINAL CODE: -{originalCode} - -ERROR MESSAGE: -{error} - -STDOUT: -{executionResult.get('output', '')} -{outputSpecsStr} -INSTRUCTIONS: -1. Fix all errors identified in the error message -2. If there is a requirements error for missing or failes modules, then create alternate code with other modules -3. Diagnose and fix any logical issues -4. Pay special attention to: -- Type conversions and data handling -- Error handling and edge cases -- Resource management (file handles, etc.) -- Syntax errors and typos -5. Keep the inputFiles handling logic intact -6. Maintain the same overall structure and purpose - -OUTPUT REQUIREMENTS (VERY IMPORTANT): -- Your code MUST define a 'result' variable as a dictionary to store ALL outputs -- The key for each entry MUST be the full filename with extension (e.g., "output.txt") -- The value for each entry MUST be a dictionary with the following structure: -{{ - "content": string, # The actual content (text or base64-encoded string) - "base64Encoded": boolean, # Set to true for binary data, false for text data - "contentType": string # MIME type of the content (e.g., "text/plain", "application/json") -}} -- Example result dictionary: -result = {{ - "output.txt": {{ - "content": "This is text content", - "base64Encoded": False, - "contentType": "text/plain" - }}, - "chart.png": {{ - "content": "base64encodedstring...", - "base64Encoded": True, - "contentType": "image/png" - }} -}} -- NEVER write files to disk using open() or similar methods - use the result dictionary instead - -JSON OUTPUT (CRITICAL): -- After creating the result dictionary, you MUST print it as JSON to stdout -- Make sure your code includes: print(json.dumps(result)) as the final line -- This printed JSON is how the system captures your result - -REQUIREMENTS: -Required packages should be specified as: -# REQUIREMENTS: library==version,library2>=version -- You may add/remove requirements as needed to fix the code - -Return ONLY Python code without explanations or markdown. -""" - - # Call AI service - messages = [ - {"role": "system", "content": "You are an expert Python code debugger. Provide only fixed Python code without explanations or formatting. Ensure all generated files are included in the 'result' dictionary and that result is printed as JSON with print(json.dumps(result))."}, - {"role": "user", "content": improvementPrompt} - ] - - try: - improvedContent = await self.service.base.callAi(messages, temperature=0.2) - - # Extract code and requirements - improvedCode = self._cleanCode(improvedContent) - - # Extract requirements - requirements = [] - for line in improvedCode.split('\n'): - if line.strip().startswith("# REQUIREMENTS:"): - reqStr = line.replace("# REQUIREMENTS:", "").strip() - requirements = [r.strip() for r in reqStr.split(',') if r.strip()] - break - - return improvedCode, requirements - except Exception as e: - logger.error(f"Error improving code: {str(e)}") - return None, [] - - - async def _checkQuickCompletion(self, prompt: str, contentExtraction: List[ChatDocument], outputSpecs: List[Dict[str, Any]]) -> Dict[str, Any]: - """ - Check if the task can be completed without writing and executing code. - - Args: - prompt: The task prompt - contentExtraction: List of extracted content data with contentName and dataExtracted - outputSpecs: List of output specifications - - Returns: - Dictionary with completion status and results, or None if no quick completion - """ - # If no data or no output specs, can't do a quick completion - if not contentExtraction or not outputSpecs: - return None - - # Create a prompt for the AI to check if this can be completed directly - specsJson = json.dumps(outputSpecs) - dataJson = json.dumps([doc.dict() for doc in contentExtraction]) - - checkPrompt = f""" -Analyze this task and determine if it can be completed directly without writing code. - -TASK: -{prompt} - -EXTRACTED DATA AVAILABLE: -{dataJson} - -Each entry in the extracted data contains: -- filename: The source file name -- contentName: The specific content section name -- contentData: The AI-extracted text from the content -- contentType: The type of content (text, csv, etc.) -- summary: A brief summary of the content - -REQUIRED OUTPUT: -{specsJson} - -If the task can be completed directly with the available extracted data, respond with: -{{"complete": 1, "prompt": "Brief explanation of the solution", "documents": [ - {{"label": "filename.ext", "content": "content here"}} -]}} - -If code would be needed to properly complete this task, respond with: -{{"complete": 0, "prompt": "Explanation why code is needed"}} - -Only return valid JSON. Your entire response must be parseable as JSON. -""" - - # Call AI service - logger.debug(f"Checking if task can be completed without code execution: {checkPrompt}") - messages = [ - {"role": "system", "content": "You are an AI assistant that determines if tasks require code execution. Reply with JSON only."}, - {"role": "user", "content": checkPrompt} - ] - - try: - # Use a lower temperature for more deterministic response - response = await self.service.base.callAi(messages, produceUserAnswer = True, temperature=0.1) - - # Parse response as JSON - if response: - try: - # Find JSON in response if there's any text around it - jsonStart = response.find('{') - jsonEnd = response.rfind('}') + 1 - - if jsonStart >= 0 and jsonEnd > jsonStart: - jsonStr = response[jsonStart:jsonEnd] - result = json.loads(jsonStr) - - # Check if this is a proper response - if "complete" in result: - return result - - except json.JSONDecodeError: - logger.debug("Failed to parse quick completion response as JSON") - pass - except Exception as e: - logger.debug(f"Error during quick completion check: {str(e)}") - - # Default to requiring code execution - return None - - async def _generateCode(self, prompt: str, outputSpecs: List[ChatDocument] = None) -> Tuple[str, List[str]]: - """ - Generate Python code from a prompt with the inputFiles placeholder. - Enhanced to emphasize proper result output handling with correct document structure. - - Args: - prompt: The task prompt - outputSpecs: List of expected output specifications - - Returns: - Tuple of (code, requirements) - """ - # Create a string with output specifications to be included in the prompt - outputSpecsStr = "" - if outputSpecs: - outputSpecsStr = "\nEXPECTED OUTPUT DOCUMENTS:\n" - for i, spec in enumerate(outputSpecs, 1): - label = spec.get("label", f"output{i}.txt") - description = spec.get("description", "") - outputSpecsStr += f"{i}. {label} - {description}\n" - - # Create improved prompt for code generation - aiPrompt = f""" -Generate Python code to solve the following task: - -TASK: -{prompt} -{outputSpecsStr} -INPUT FILES: -- 'inputFiles' variable is provided as [[filename, data, isBase64], ...] -- For text files (isBase64=False): use data directly as string -- For binary files (isBase64=True): use base64.b64decode(data) - -OUTPUT REQUIREMENTS (VERY IMPORTANT): -- Your code MUST define a 'result' variable as a dictionary to store ALL outputs -- The key for each entry MUST be the full filename with extension (e.g., "output.txt") -- The value for each entry MUST be a dictionary with the following structure: -{{ - "content": string, # The actual content (text or base64-encoded string) - "base64Encoded": boolean, # Set to true for binary data, false for text data - "contentType": string # MIME type of the content (e.g., "text/plain", "application/json") -}} -- Example result dictionary: -result = {{ - "output.txt": {{ - "content": "This is text content", - "base64Encoded": False, - "contentType": "text/plain" - }}, - "chart.png": {{ - "content": "base64encodedstring...", - "base64Encoded": True, - "contentType": "image/png" - }} -}} -- NEVER write files to disk using open() or similar methods - use the result dictionary instead -- If you generate any charts, reports, or visualizations, ensure they are properly encoded and included - -IMPORTANT - USE EXACT OUTPUT FILENAMES: -- You MUST use the EXACT filenames specified in EXPECTED OUTPUT DOCUMENTS section -- The key in the result dictionary must match these filenames precisely -- If no output documents are specified, use appropriate descriptive filenames - -JSON OUTPUT (CRITICAL): -- After creating the result dictionary, you MUST print it as JSON to stdout using json.dumps() -- Add these lines at the end of your code: - import json # if not already imported - print(json.dumps(result)) -- This printed JSON is how the system captures your result -- Make sure this is the last thing your code prints - -BINARY DATA HANDLING: -- For binary content (images, PDFs, etc.), convert to base64 string and set base64Encoded=True -- For text content (text, JSON, HTML, etc.), use plain string and set base64Encoded=False -- Use appropriate MIME types for different content types - -CODE QUALITY: -- Use explicit type conversions where needed (int/float/str) -- Implement feature detection, not version checks -- Handle errors gracefully with appropriate fallbacks -- Follow latest API conventions for libraries -- Validate inputs before processing - -Your code must start with: -inputFiles = "=== JSONLOAD ===" # DO NOT CHANGE THIS LINE - -REQUIREMENTS: -Required packages should be specified as: -# REQUIREMENTS: library==version,library2>=version -- Specify exact versions for critical libraries -- Use constraint operators (==,>=,<=) as needed - -Return ONLY Python code without explanations or markdown. -""" - - # Call AI service - messages = [ - {"role": "system", "content": "You are a Python code generator. Provide only valid Python code without explanations or formatting. Always output the result dictionary as JSON using print(json.dumps(result)) at the end of your code."}, - {"role": "user", "content": aiPrompt} - ] - - generatedContent = await self.service.base.callAi(messages, temperature=0.1) - - # Extract code and requirements - code = self._cleanCode(generatedContent) - - # Extract requirements - requirements = [] - for line in code.split('\n'): - if line.strip().startswith("# REQUIREMENTS:"): - reqStr = line.replace("# REQUIREMENTS:", "").strip() - requirements = [r.strip() for r in reqStr.split(',') if r.strip()] - break - - return code, requirements - - def _executeCodeProd(self, code: str, requirements: List[str] = None) -> Dict[str, Any]: - """ - Execute Python code in Azure environment using the antenv interpreter. - Optimized for production use in Azure Web App environment where venv creation fails. - - Args: - code: Python code to execute - requirements: List of required packages - - Returns: - Execution result dictionary - """ - try: - # 1. Create temp directory for code files - self.tempDir = tempfile.mkdtemp(prefix="code_exec_") - - # Try different possible paths to find the antenv Python interpreter - possible_python_paths = [ - "/home/site/wwwroot/antenv/bin/python", - "/antenv/bin/python", - "/tmp/8dd8c226509f116/antenv/bin/python", # Path from your error logs - sys.executable # Fallback to system Python - ] - - pythonExe = None - for path in possible_python_paths: - if os.path.exists(path): - pythonExe = path - logger.info(f"Found Python interpreter at: {pythonExe}") - break - - if not pythonExe: - logger.error("Could not find a valid Python interpreter in Azure environment") - return { - "success": False, - "output": "", - "error": "Could not find a valid Python interpreter in Azure environment", - "result": None, - "exitCode": -1 - } - - # 2. Install requirements to a temporary user directory if provided - if requirements: - logger.info(f"Installing requirements in Azure environment: {requirements}") - - # Create requirements.txt - reqFile = os.path.join(self.tempDir, "requirements.txt") - with open(reqFile, "w") as f: - f.write("\n".join(requirements)) - - # Set up a custom PYTHONUSERBASE to isolate package installations - custom_user_base = os.path.join(self.tempDir, "pip_packages") - os.makedirs(custom_user_base, exist_ok=True) - - env = os.environ.copy() - env["PYTHONUSERBASE"] = custom_user_base - - # Install requirements to the custom user directory - try: - pipResult = subprocess.run( - [pythonExe, "-m", "pip", "install", "--user", "-r", reqFile], - capture_output=True, - text=True, - env=env, - timeout=int(APP_CONFIG.get("Agent_Coder_INSTALL_TIMEOUT")) - ) - - if pipResult.returncode != 0: - logger.warning(f"Error installing requirements in Azure: {pipResult.stderr}") - else: - logger.info(f"Requirements installed successfully to {custom_user_base}") - - # Try to find the site-packages directory - import glob - site_packages = os.path.join(custom_user_base, "lib", "python*", "site-packages") - site_packages_paths = glob.glob(site_packages) - - if site_packages_paths: - env["PYTHONPATH"] = os.pathsep.join([site_packages_paths[0], env.get("PYTHONPATH", "")]) - logger.info(f"Added {site_packages_paths[0]} to PYTHONPATH") - else: - # Alternative paths for different Python versions - alt_site_packages = os.path.join(custom_user_base, "site-packages") - if os.path.exists(alt_site_packages): - env["PYTHONPATH"] = os.pathsep.join([alt_site_packages, env.get("PYTHONPATH", "")]) - logger.info(f"Added {alt_site_packages} to PYTHONPATH") - except Exception as e: - logger.warning(f"Exception during requirements installation in Azure: {str(e)}") - else: - env = os.environ.copy() - - # 3. Write code to file - codeFile = os.path.join(self.tempDir, "code.py") - with open(codeFile, "w", encoding="utf-8") as f: - f.write(code) - - # 4. Execute code with the modified environment - logger.debug(f"Executing code in Azure environment with timeout of {self.executorTimeout} seconds") - process = subprocess.run( - [pythonExe, codeFile], - timeout=self.executorTimeout, - capture_output=True, - text=True, - env=env - ) - - # 5. Process results - stdout = process.stdout - stderr = process.stderr - - # Try to extract result from stdout - resultData = None - if process.returncode == 0: - try: - # Find the last line that might be JSON - jsonLines = [] - for line in stdout.strip().split('\n'): - line = line.strip() - if line and line[0] in '{[' and line[-1] in '}]': - try: - parsed = json.loads(line) - jsonLines.append((line, parsed)) - except json.JSONDecodeError: - continue - - # Use the last valid JSON that appears to be a dictionary - if jsonLines: - for line, parsed in reversed(jsonLines): - if isinstance(parsed, dict): - resultData = parsed - logger.debug(f"Extracted result data from stdout: {type(resultData)}") - break - except Exception as e: - logger.debug(f"Error extracting result from stdout: {str(e)}") - - # Enhanced logging of what was found - if resultData: - logger.info(f"Found result dictionary with {len(resultData)} entries: {list(resultData.keys())}") - else: - logger.warning("No result dictionary found in output") - - # Create result dictionary - return { - "success": process.returncode == 0, - "output": stdout, - "error": stderr if process.returncode != 0 else "", - "result": resultData, - "exitCode": process.returncode - } - - except subprocess.TimeoutExpired: - logger.error(f"Execution in Azure timed out after {self.executorTimeout} seconds") - return { - "success": False, - "output": "", - "error": f"Execution timed out after {self.executorTimeout} seconds", - "result": None, - "exitCode": -1 - } - except Exception as e: - logger.error(f"Execution error in Azure environment: {str(e)}") - return { - "success": False, - "output": "", - "error": f"Execution error in Azure environment: {str(e)}", - "result": None, - "exitCode": -1 - } - finally: - # Clean up resources - self._cleanupExecution() - - def _executeCodeVenv(self, code: str, requirements: List[str] = None) -> Dict[str, Any]: - """ - Execute Python code in a virtual environment. - Original implementation with venv creation for non-Azure environments. - - Args: - code: Python code to execute - requirements: List of required packages - - Returns: - Execution result dictionary - """ - try: - # 1. Create temp directory and virtual environment - self.tempDir = tempfile.mkdtemp(prefix="code_exec_") - venvPath = os.path.join(self.tempDir, "venv") - - # Create venv - logger.debug(f"Creating virtual environment at {venvPath}") - - try: - # First try with sys.executable - the standard approach - subprocess.run([sys.executable, "-m", "venv", venvPath], - check=True, capture_output=True, timeout=60) - logger.debug("Virtual environment created successfully with sys.executable") - except (subprocess.SubprocessError, subprocess.CalledProcessError) as e: - logger.warning(f"Failed to create venv with sys.executable: {str(e)}") - - # Fallback method 1: Try with explicit 'python3' command - try: - logger.debug("Trying to create virtual environment with python3 command") - subprocess.run(["python3", "-m", "venv", venvPath], - check=True, capture_output=True, timeout=60) - logger.debug("Virtual environment created successfully with python3") - except (subprocess.SubprocessError, subprocess.CalledProcessError) as e: - logger.warning(f"Failed to create venv with python3: {str(e)}") - - # Fallback method 2: Try with virtualenv instead of venv - try: - logger.debug("Trying to create virtual environment with virtualenv module") - subprocess.run([sys.executable, "-m", "pip", "install", "virtualenv"], - check=False, capture_output=True, timeout=60) - subprocess.run([sys.executable, "-m", "virtualenv", venvPath], - check=True, capture_output=True, timeout=60) - logger.debug("Virtual environment created successfully with virtualenv") - except (subprocess.SubprocessError, subprocess.CalledProcessError) as e: - # If all methods fail, raise an exception - error_msg = f"Failed to create virtual environment with all methods: {str(e)}" - logger.error(error_msg) - raise RuntimeError(error_msg) - - # Get Python executable path - adjusted for OS - if os.name == 'nt': # Windows - pythonExe = os.path.join(venvPath, "Scripts", "python.exe") - else: # Linux/Mac - pythonExe = os.path.join(venvPath, "bin", "python") - - # Verify python executable exists - if not os.path.exists(pythonExe): - # Try to find it - if os.name == 'nt': - possible_paths = [ - os.path.join(venvPath, "Scripts", "python.exe"), - os.path.join(venvPath, "Scripts", "python") - ] - else: - possible_paths = [ - os.path.join(venvPath, "bin", "python"), - os.path.join(venvPath, "bin", "python3") - ] - - for path in possible_paths: - if os.path.exists(path): - pythonExe = path - logger.debug(f"Found Python executable at: {pythonExe}") - break - - if not os.path.exists(pythonExe): - logger.error(f"Python executable not found at expected path: {pythonExe}") - raise FileNotFoundError(f"Python executable not found in virtual environment") - - # 2. Install requirements if provided - if requirements: - logger.info(f"Installing requirements: {requirements}") - - # Create requirements.txt - reqFile = os.path.join(self.tempDir, "requirements.txt") - with open(reqFile, "w") as f: - f.write("\n".join(requirements)) - - x="\n".join(requirements) - logger.info(f"Requirements file: {x}.") - - # Install requirements - try: - pipResult = subprocess.run( - [pythonExe, "-m", "pip", "install", "-r", reqFile], - capture_output=True, - text=True, - timeout=int(APP_CONFIG.get("Agent_Coder_INSTALL_TIMEOUT")) - ) - if pipResult.returncode != 0: - logger.debug(f"Error installing requirements: {pipResult.stderr}") - else: - logger.debug(f"Requirements installed successfully") - # Log installed packages if in debug mode - if logger.isEnabledFor(logging.DEBUG): - pipList = subprocess.run( - [pythonExe, "-m", "pip", "list"], - capture_output=True, - text=True - ) - logger.debug(f"Installed packages:\n{pipList.stdout}") - - except Exception as e: - logger.debug(f"Exception during requirements installation: {str(e)}") - - # 3. Write code to file - codeFile = os.path.join(self.tempDir, "code.py") - with open(codeFile, "w", encoding="utf-8") as f: - f.write(code) - - # 4. Execute code - logger.debug(f"Executing code with timeout of {self.executorTimeout} seconds. Code: {code}") - process = subprocess.run( - [pythonExe, codeFile], - timeout=self.executorTimeout, - capture_output=True, - text=True - ) - - # 5. Process results - stdout = process.stdout - stderr = process.stderr - - # Try to extract result from stdout - resultData = None - if process.returncode == 0: - try: - # Find the last line that might be JSON - jsonLines = [] - for line in stdout.strip().split('\n'): - line = line.strip() - if line and line[0] in '{[' and line[-1] in '}]': - try: - parsed = json.loads(line) - jsonLines.append((line, parsed)) - except json.JSONDecodeError: - continue - - # Use the last valid JSON that appears to be a dictionary - if jsonLines: - for line, parsed in reversed(jsonLines): - if isinstance(parsed, dict): - resultData = parsed - logger.debug(f"Extracted result data from stdout: {type(resultData)}") - break - except Exception as e: - logger.debug(f"Error extracting result from stdout: {str(e)}") - - # Enhanced logging of what was found - if resultData: - logger.info(f"Found result dictionary with {len(resultData)} entries: {list(resultData.keys())}") - else: - logger.warning("No result dictionary found in output") - - # Create result dictionary - return { - "success": process.returncode == 0, - "output": stdout, - "error": stderr if process.returncode != 0 else "", - "result": resultData, - "exitCode": process.returncode - } - - except subprocess.TimeoutExpired: - logger.error(f"Execution timed out after {self.executorTimeout} seconds") - return { - "success": False, - "output": "", - "error": f"Execution timed out after {self.executorTimeout} seconds", - "result": None, - "exitCode": -1 - } - except Exception as e: - logger.error(f"Execution error: {str(e)}") - return { - "success": False, - "output": "", - "error": f"Execution error: {str(e)}", - "result": None, - "exitCode": -1 - } - finally: - # Clean up resources - self._cleanupExecution() - - def _executeCode(self, code: str, requirements: List[str] = None) -> Dict[str, Any]: - """ - Execute Python code in the appropriate environment based on configuration. - - Args: - code: Python code to execute - requirements: List of required packages - - Returns: - Execution result dictionary - """ - # Check if we're in a production Azure environment - env_type = APP_CONFIG.get("APP_ENV_TYPE", "dev").lower() - - logger.info(f"Executing code in environment type: {env_type}") - - if env_type == "prod": - # Use the Azure-optimized execution method - logger.info("Using Azure-optimized code execution method") - return self._executeCodeProd(code, requirements) - else: - # Use the standard virtual environment execution method - logger.info("Using standard virtual environment execution method") - return self._executeCodeVenv(code, requirements) - - - def _cleanupExecution(self): - """Clean up temporary resources from code execution.""" - if self.tempDir and os.path.exists(self.tempDir): - try: - logger.debug(f"Cleaning up temporary directory: {self.tempDir}") - shutil.rmtree(self.tempDir) - self.tempDir = None - except Exception as e: - logger.warning(f"Error cleaning up temp directory: {str(e)}") - - def _cleanCode(self, code: str) -> str: - """Remove any markdown formatting or explanations.""" - # Remove code block markers - code = code.replace("```python", "").replace("```", "") - - # Remove explanations before or after code - lines = code.strip().split('\n') - startIndex = 0 - endIndex = len(lines) - - # Find start of actual code - for i, line in enumerate(lines): - if line.strip().startswith("inputFiles =") or line.strip().startswith("# REQUIREMENTS:"): - startIndex = i - break - - # Clean code - cleanedCode = '\n'.join(lines[startIndex:endIndex]) - return cleanedCode.strip() - - def formatAgentDocumentOutput(self, filename: str, content: str, contentType: str) -> ChatDocument: - """ - Format a document for agent output. - - Args: - filename: Output filename - content: Document content - contentType: MIME type of the content - - Returns: - ChatDocument object - """ - # Split filename into name and extension - name, ext = os.path.splitext(filename) - if ext.startswith('.'): - ext = ext[1:] - - # Create document object - return ChatDocument( - id=str(uuid.uuid4()), - name=name, - ext=ext, - data=content, - contents=[ - ChatContent( - name="main", - data=content, - summary=f"Generated {filename}", - metadata={"contentType": contentType} - ) - ] - ) - -# Factory function for the Coder agent -def getAgentCoder(): - """Returns an instance of the Coder agent.""" - return AgentCoder() \ No newline at end of file diff --git a/modules/agents/agentDocumentation.py b/modules/agents/agentDocumentation.py deleted file mode 100644 index 1cf3e3b2..00000000 --- a/modules/agents/agentDocumentation.py +++ /dev/null @@ -1,537 +0,0 @@ -""" -Documentation agent for generating structured documentation. -Provides comprehensive documentation generation capabilities. -""" - -import logging -from typing import Dict, Any, List, Optional -import json -import re -from datetime import datetime -import os -import hashlib -import base64 -import uuid -import shutil -from pathlib import Path -import traceback -import sys -import importlib.util -import inspect -from pydantic import BaseModel - -from modules.workflow.agentBase import AgentBase -from modules.interfaces.serviceChatModel import ChatContent - -logger = logging.getLogger(__name__) - -class AgentDocumentation(AgentBase): - """AI-driven agent for creating documentation and structured content using multi-step generation""" - - def __init__(self): - """Initialize the documentation agent""" - super().__init__() - self.name = "documentation" - self.label = "Documentation" - self.description = "Creates structured documentation, reports, and content using AI with multi-step generation" - self.capabilities = [ - "report_generation", - "documentation", - "content_structuring", - "technical_writing", - "knowledge_organization" - ] - - def setDependencies(self, serviceBase=None): - """Set external dependencies for the agent.""" - self.setService(serviceBase) - - async def processTask(self, task: Dict[str, Any]) -> Dict[str, Any]: - """ - Process a task by focusing on required outputs and using AI to generate them. - - Args: - task: Task dictionary with prompt, inputDocuments, outputSpecifications - - Returns: - Dictionary with feedback and documents - """ - try: - # Extract task information - prompt = task.get("prompt", "") - inputDocuments = task.get("inputDocuments", []) - outputSpecs = task.get("outputSpecifications", []) - - # Check AI service - if not self.service or not self.service.base: - return { - "feedback": "The Documentation agent requires an AI service to function.", - "documents": [] - } - - # Extract context from input documents - focusing only on dataExtracted - documentContext = self._extractDocumentContext(inputDocuments) - - # Create task analysis to understand the requirements - documentationPlan = await self._analyzeTask(prompt, documentContext, outputSpecs) - logger.debug(f"Documentation plan: {documentationPlan}") - - # Generate all required output documents - documents = [] - - # If no output specs provided, create default document - if not outputSpecs: - defaultFormat = documentationPlan.get("recommendedFormat", "markdown") - defaultTitle = documentationPlan.get("title", "Documentation") - safeTitle = self._sanitizeFilename(defaultTitle) - - outputSpecs = [ - {"label": f"{safeTitle}.{defaultFormat}", "description": "Comprehensive documentation"} - ] - - # Process each output specification - for spec in outputSpecs: - outputLabel = spec.get("label", "") - outputDescription = spec.get("description", "") - - # Generate the document using multi-step approach - document = await self._createDocumentMultiStep( - prompt, - documentContext, - outputLabel, - outputDescription, - documentationPlan - ) - - documents.append(document) - - # Generate feedback - feedback = documentationPlan.get("feedback", f"Created {len(documents)} documents based on your requirements.") - - return { - "feedback": feedback, - "documents": documents - } - - except Exception as e: - logger.error(f"Error in documentation generation: {str(e)}", exc_info=True) - return { - "feedback": f"Error during documentation generation: {str(e)}", - "documents": [] - } - - def _extractDocumentContext(self, documents: List[Dict[str, Any]]) -> str: - """ - Extract context from input documents, focusing on dataExtracted. - - Args: - documents: List of document objects - - Returns: - Extracted context as text - """ - contextParts = [] - - for doc in documents: - docName = doc.get("name", "unnamed") - if doc.get("ext"): - docName = f"{docName}.{doc.get('ext')}" - - contextParts.append(f"\n\n--- {docName} ---\n") - - # Process contents for dataExtracted - for content in doc.get("contents", []): - if content.get("dataExtracted"): - contextParts.append(content.get("dataExtracted", "")) - - return "\n".join(contextParts) - - def _sanitizeFilename(self, filename: str) -> str: - """ - Sanitize a filename by removing invalid characters. - - Args: - filename: Filename to sanitize - - Returns: - Sanitized filename - """ - # Replace invalid characters with underscores - invalidChars = r'<>:"/\|?*' - for char in invalidChars: - filename = filename.replace(char, '_') - - # Trim filename if too long - if len(filename) > 100: - filename = filename[:97] + "..." - - return filename - - async def _analyzeTask(self, prompt: str, context: str, outputSpecs: List) -> Dict: - """ - Use AI to analyze the task and create a documentation plan. - - Args: - prompt: The task prompt - context: Document context - outputSpecs: Output specifications - - Returns: - Documentation plan dictionary - """ - analysisPrompt = f""" - Analyze this documentation task and create a detailed plan. - - TASK: {prompt} - - DOCUMENT CONTEXT SAMPLE: - {context[:1000]}... (truncated) - - OUTPUT REQUIREMENTS: - {json.dumps(outputSpecs, indent=2)} - - Create a detailed documentation plan in JSON format with the following structure: - {{ - "title": "Document Title", - "documentType": "report|manual|guide|whitepaper|etc", - "audience": "technical|general|executive|etc", - "detailedStructure": [ - {{ - "title": "Chapter/Section Title", - "keyPoints": ["point1", "point2", ...], - "subsections": ["subsection1", "subsection2", ...], - "importance": "high|medium|low", - "estimatedLength": "short|medium|long" - }}, - ... more sections ... - ], - "keyTopics": ["topic1", "topic2", ...], - "tone": "formal|conversational|instructional|etc", - "recommendedFormat": "markdown|html|text|etc", - "formattingRequirements": ["requirement1", "requirement2", ...], - "executiveSummary": "Brief description of what the document will cover", - "feedback": "Brief message explaining the documentation approach" - }} - - Only return valid JSON. No preamble or explanations. - """ - - try: - response = await self.service.base.callAi([ - {"role": "system", "content": "You are a documentation expert. Respond with valid JSON only."}, - {"role": "user", "content": analysisPrompt} - ]) - - # Extract JSON from response - jsonStart = response.find('{') - jsonEnd = response.rfind('}') + 1 - - if jsonStart >= 0 and jsonEnd > jsonStart: - plan = json.loads(response[jsonStart:jsonEnd]) - return plan - else: - # Fallback if JSON not found - return { - "title": "Documentation (DEFAULT)", - "documentType": "report", - "audience": "general", - "detailedStructure": [ - { - "title": "Introduction", - "keyPoints": ["Purpose", "Scope"], - "subsections": [], - "importance": "high", - "estimatedLength": "short" - }, - { - "title": "Main Content", - "keyPoints": ["Core Information"], - "subsections": ["Key Findings", "Analysis"], - "importance": "high", - "estimatedLength": "long" - }, - { - "title": "Conclusion", - "keyPoints": ["Summary", "Next Steps"], - "subsections": [], - "importance": "medium", - "estimatedLength": "short" - } - ], - "keyTopics": ["General Information"], - "tone": "formal", - "recommendedFormat": "markdown", - "formattingRequirements": ["Clear headings", "Professional formatting"], - "executiveSummary": "A comprehensive documentation covering the requested topics.", - "feedback": "Created documentation based on your requirements." - } - - except Exception as e: - logger.warning(f"Error creating documentation plan: {str(e)}") - return { - "title": "Documentation", - "documentType": "report", - "audience": "general", - "detailedStructure": [ - { - "title": "Introduction", - "keyPoints": ["Purpose", "Scope"], - "subsections": [], - "importance": "high", - "estimatedLength": "short" - }, - { - "title": "Main Content", - "keyPoints": ["Core Information"], - "subsections": ["Key Findings", "Analysis"], - "importance": "high", - "estimatedLength": "long" - }, - { - "title": "Conclusion", - "keyPoints": ["Summary", "Next Steps"], - "subsections": [], - "importance": "medium", - "estimatedLength": "short" - } - ], - "keyTopics": ["General Information"], - "tone": "formal", - "recommendedFormat": "markdown", - "formattingRequirements": ["Clear headings", "Professional formatting"], - "executiveSummary": "A comprehensive documentation covering the requested topics.", - "feedback": "Created documentation based on your requirements." - } - - async def _createDocumentMultiStep(self, prompt: str, context: str, outputLabel: str, - outputDescription: str, documentationPlan: Dict) -> ChatContent: - """ - Create a document using a multi-step approach with separate AI calls for each section. - - Args: - prompt: Original task prompt - context: Document context - outputLabel: Output filename - outputDescription: Description of desired output - documentationPlan: Documentation plan from AI - - Returns: - ChatContent object - """ - try: - # Determine format from filename - formatType = outputLabel.split('.')[-1].lower() if '.' in outputLabel else "md" - - # Map format to contentType - contentTypeMap = { - "md": "text/markdown", - "markdown": "text/markdown", - "html": "text/html", - "txt": "text/plain", - "text": "text/plain", - "json": "application/json", - "csv": "text/csv" - } - - contentType = contentTypeMap.get(formatType, "text/plain") - - # Get document information - title = documentationPlan.get("title", "Documentation") - documentType = documentationPlan.get("documentType", "document") - audience = documentationPlan.get("audience", "general") - tone = documentationPlan.get("tone", "formal") - keyTopics = documentationPlan.get("keyTopics", []) - formattingRequirements = documentationPlan.get("formattingRequirements", []) - - # Get the detailed structure - detailedStructure = documentationPlan.get("detailedStructure", []) - - # Step 1: Generate executive summary - summaryPrompt = f""" - Create an executive summary for a {documentType} titled "{title}". - - DOCUMENT OVERVIEW: - - Type: {documentType} - - Audience: {audience} - - Key Topics: {', '.join(keyTopics)} - - TASK CONTEXT: {prompt} - - The executive summary should: - 1. Provide a concise overview of the document's purpose - 2. Highlight key points and findings - 3. Be clear and engaging for the target audience - 4. Set expectations for the document's content - - Keep the summary brief but comprehensive. - """ - - executiveSummary = await self.service.base.callAi([ - {"role": "system", "content": f"You are a documentation expert creating an executive summary in {formatType} format."}, - {"role": "user", "content": summaryPrompt} - ], produceUserAnswer = True) - - # Step 2: Generate introduction - introPrompt = f""" - Create an introduction for a {documentType} titled "{title}". - - DOCUMENT OVERVIEW: - - Type: {documentType} - - Audience: {audience} - - Key Topics: {', '.join(keyTopics)} - - TASK CONTEXT: {prompt} - - The introduction should: - 1. Set the context and purpose of the document - 2. Outline the scope and objectives - 3. Preview the main topics to be covered - 4. Engage the reader's interest - - Format the introduction according to {formatType} standards. - """ - - introduction = await self.service.base.callAi([ - {"role": "system", "content": f"You are a documentation expert creating an introduction in {formatType} format."}, - {"role": "user", "content": introPrompt} - ], produceUserAnswer = True) - - # Step 3: Generate main sections - sections = [] - for section in detailedStructure: - sectionTitle = section.get("title", "Section") - keyPoints = section.get("keyPoints", []) - subsections = section.get("subsections", []) - importance = section.get("importance", "medium") - estimatedLength = section.get("estimatedLength", "medium") - - sectionPrompt = f""" - Create the {sectionTitle} section for a {documentType} titled "{title}". - - SECTION DETAILS: - - Title: {sectionTitle} - - Key Points: {', '.join(keyPoints)} - - Subsections: {', '.join(subsections)} - - Importance: {importance} - - Estimated Length: {estimatedLength} - - DOCUMENT CONTEXT: - - Type: {documentType} - - Audience: {audience} - - Key Topics: {', '.join(keyTopics)} - - TASK CONTEXT: {prompt} - - The section should: - 1. Cover all key points thoroughly - 2. Include relevant subsections - 3. Maintain appropriate depth based on importance - 4. Follow the document's tone and style - - Format the section according to {formatType} standards. - """ - - sectionContent = await self.service.base.callAi([ - {"role": "system", "content": f"You are a documentation expert creating a section in {formatType} format."}, - {"role": "user", "content": sectionPrompt} - ], produceUserAnswer = True) - - sections.append(sectionContent) - - # Step 4: Generate conclusion - conclusionPrompt = f""" - Create the conclusion for a {documentType} titled "{title}". - - DOCUMENT OVERVIEW: - - Type: {documentType} - - Audience: {audience} - - Key Topics: {', '.join(keyTopics)} - - TASK CONTEXT: {prompt} - - This conclusion should: - 1. Summarize the key points covered in the document - 2. Provide closure to the topics discussed - 3. Include any relevant recommendations or next steps - 4. Leave the reader with a clear understanding of the document's significance - - The conclusion should be professional and impactful, formatted according to {formatType} standards. - """ - - conclusion = await self.service.base.callAi([ - {"role": "system", "content": f"You are a documentation expert creating a conclusion in {formatType} format."}, - {"role": "user", "content": conclusionPrompt} - ], produceUserAnswer = True) - - # Step 5: Assemble the complete document - if formatType in ["md", "markdown"]: - # Markdown format - documentContent = f"# {title}\n\n" - - if executiveSummary: - documentContent += f"## Executive Summary\n\n{executiveSummary}\n\n" - - documentContent += f"{introduction}\n\n" - - for i, sectionContent in enumerate(sections): - # Ensure section starts with heading if not already - sectionTitle = detailedStructure[i].get("title", f"Section {i+1}") - if not sectionContent.strip().startswith("#"): - documentContent += f"## {sectionTitle}\n\n" - documentContent += f"{sectionContent}\n\n" - - documentContent += f"## Conclusion\n\n{conclusion}\n" - - elif formatType == "html": - # HTML format - documentContent = f"\n\nThere was an error generating the documentation: {str(e)}
" - else: - content = f"Error in Documentation\n\nThere was an error generating the documentation: {str(e)}" - - return self.formatAgentDocumentOutput(outputLabel, content, contentType) - - -# Factory function for the Documentation agent -def getAgentDocumentation(): - """Returns an instance of the Documentation agent.""" - return AgentDocumentation() \ No newline at end of file diff --git a/modules/agents/agentEmail.py b/modules/agents/agentEmail.py deleted file mode 100644 index 6c6e2f5f..00000000 --- a/modules/agents/agentEmail.py +++ /dev/null @@ -1,380 +0,0 @@ -""" -Email Agent Module. -Handles email-related tasks using Microsoft Graph API. -""" - -import logging -import json -from typing import Dict, Any, List, Optional, Tuple -import uuid -import os - -from modules.workflow.agentBase import AgentBase -from modules.interfaces.serviceChatModel import Task, ChatDocument, ChatContent - -logger = logging.getLogger(__name__) - -class AgentEmail(AgentBase): - """Agent for handling email-related tasks.""" - - def __init__(self): - """Initialize the email agent.""" - super().__init__() - self.name = "email" - self.label = "Email Agent" - self.description = "Handles email composition and sending using Microsoft Graph API" - self.capabilities = [ - "email_composition", - "email_draft_creation", - "email_template_generation" - ] - self.serviceBase = None - - def setDependencies(self, serviceBase=None): - """Set external dependencies for the agent.""" - self.serviceBase = serviceBase - - async def processTask(self, task: Task) -> Dict[str, Any]: - """ - Process an email-related task. - - Args: - task: Task object containing: - - prompt: Instructions for the agent - - inputDocuments: List of documents to process - - outputSpecifications: List of required output documents - - context: Additional context including workflow info - - Returns: - Dictionary containing: - - feedback: Text response explaining what was done - - documents: List of created documents - """ - try: - # Extract task information - prompt = task.prompt - inputDocuments = task.filesInput - outputSpecs = task.filesOutput - - # Check AI service - if not self.service.base: - return { - "feedback": "The Email agent requires an AI service to function.", - "documents": [] - } - - # Check if Microsoft connector is available - if not hasattr(self.service, 'msft'): - return { - "feedback": "Microsoft connector not available. Please ensure Microsoft integration is properly configured.", - "documents": [] - } - - # Get Microsoft token - token_data = self.service.msft.getMsftToken() - if not token_data: - # Create authentication trigger document - auth_doc = self._createFrontendAuthTriggerDocument() - return { - "feedback": "Microsoft authentication required. Please authenticate to continue.", - "documents": [auth_doc] - } - - # Extract document data from input - documentContents, attachments = self._processInputDocuments(inputDocuments) - - # Generate email subject and body using AI - emailTemplate = await self._generateEmailTemplate(prompt, documentContents) - - # Create HTML preview of the email - htmlPreview = self._createHtmlPreview(emailTemplate) - - # Attempt to create a draft email using Microsoft Graph API - draft_result = self.service.msft.createDraftEmail( - emailTemplate["recipient"], - emailTemplate["subject"], - emailTemplate["htmlBody"], - attachments - ) - - # Prepare output documents - documents = [] - - # Process output specifications - for spec in outputSpecs: - label = spec.get("label", "") - description = spec.get("description", "") - - if label.endswith(".html"): - # Create the HTML template file - templateDoc = self.formatAgentDocumentOutput( - label, - emailTemplate["htmlBody"], # Use the actual HTML body, not the preview - "text/html" - ) - documents.append(templateDoc) - elif label.endswith(".json"): - # Create JSON template if requested - templateJson = json.dumps(emailTemplate, indent=2) - templateDoc = self.formatAgentDocumentOutput( - label, - templateJson, - "application/json" - ) - documents.append(templateDoc) - else: - # Default to preview for other cases - previewDoc = self.formatAgentDocumentOutput( - label, - htmlPreview, - "text/html" - ) - documents.append(previewDoc) - - # Prepare feedback message - if draft_result: - feedback = f"Email draft created successfully for {emailTemplate.get('recipient')}. The subject is: '{emailTemplate['subject']}'" - if attachments: - feedback += f" with {len(attachments)} attachment(s)" - feedback += ". You can open and edit it in your Outlook draft folder." - else: - feedback = "Email template created but could not save as draft. HTML preview and template are available as documents." - - return { - "feedback": feedback, - "documents": documents - } - - except Exception as e: - logger.error(f"Error in email agent: {str(e)}") - return { - "feedback": f"Error processing email task: {str(e)}", - "documents": [] - } - - def _createFrontendAuthTriggerDocument(self) -> ChatDocument: - """Create a document that triggers Microsoft authentication in the frontend.""" - return ChatDocument( - id=str(uuid.uuid4()), - name="microsoft_auth", - ext="html", - data=""" -Please click the button below to authenticate with Microsoft:
- -Please click the button below to authenticate with Microsoft:
- -This email is regarding your request: {prompt}
" - } - - except Exception as e: - logger.warning(f"Error generating email template: {str(e)}") - return { - "recipient": "recipient@example.com", - "subject": "Information Regarding Your Request", - "plainBody": f"This email is regarding your request: {prompt}", - "htmlBody": f"This email is regarding your request: {prompt}
" - } - - def _createHtmlPreview(self, emailTemplate: Dict[str, Any]) -> str: - """ - Create an HTML preview of the email template. - - Args: - emailTemplate: Email template dictionary - - Returns: - HTML string for preview - """ - html = f""" - - - - -No content
')} -Please click the button below to authenticate with Microsoft:
- -An error occurred: {str(e)}
" - else: - content = f"WEB RESEARCH ERROR\n\nAn error occurred: {str(e)}" - - return self.formatAgentDocumentOutput(outputLabel, content, contentType) - - async def _createJsonDocument(self, prompt: str, results: List[Dict[str, Any]], - researchPlan: Dict[str, Any], outputLabel: str) -> Dict[str, Any]: - """ - Create a JSON document from research results. - - Args: - prompt: Original research prompt - results: Research results - researchPlan: Research plan - outputLabel: Output filename - - Returns: - Document object - """ - try: - # Create structured data - sourcesData = [] - for result in results: - sourcesData.append({ - "title": result.get("title", "Untitled"), - "url": result.get("url", ""), - "summary": result.get("summary", ""), - "snippet": result.get("snippet", ""), - "sourceType": result.get("sourceType", "") - }) - - # Create metadata - metadata = { - "query": prompt, - "timestamp": time.strftime("%Y-%m-%d %H:%M:%S"), - "researchQuestions": researchPlan.get("researchQuestions", []), - "searchTerms": researchPlan.get("searchTerms", []) - } - - # Compile complete report object - jsonContent = { - "metadata": metadata, - "summary": researchPlan.get("feedback", "Web research results"), - "sources": sourcesData - } - - # Convert to JSON string - content = json.dumps(jsonContent, indent=2) - - return self.formatAgentDocumentOutput(outputLabel, content, "application/json") - - except Exception as e: - logger.error(f"Error creating JSON document: {str(e)}") - return self.formatAgentDocumentOutput(outputLabel, json.dumps({"error": str(e)}), "application/json") - - async def _createCsvDocument(self, results: List[Dict[str, Any]], outputLabel: str) -> Dict[str, Any]: - """ - Create a CSV document from research results. - - Args: - results: Research results - outputLabel: Output filename - - Returns: - Document object - """ - try: - # Create CSV header - csvLines = ["Title,URL,Source Type,Snippet"] - - # Add results - for result in results: - # Escape CSV fields - title = result.get("title", "").replace('"', '""') - url = result.get("url", "").replace('"', '""') - sourceType = result.get("sourceType", "").replace('"', '""') - snippet = result.get("snippet", "").replace('"', '""') - - csvLines.append(f'"{title}","{url}","{sourceType}","{snippet}"') - - # Combine into CSV content - content = "\n".join(csvLines) - - return self.formatAgentDocumentOutput(outputLabel, content, "text/csv") - - except Exception as e: - logger.error(f"Error creating CSV document: {str(e)}") - return self.formatAgentDocumentOutput(outputLabel, "Error,Error\nFailed to create CSV,{0}".format(str(e)), "text/csv") - - def _determineFormatType(self, outputLabel: str) -> str: - """ - Determine the format type based on the filename. - - Args: - outputLabel: Output filename - - Returns: - Format type (markdown, html, text, json, csv) - """ - outputLabelLower = outputLabel.lower() - - if outputLabelLower.endswith(".md"): - return "markdown" - elif outputLabelLower.endswith(".html"): - return "html" - elif outputLabelLower.endswith(".txt"): - return "text" - elif outputLabelLower.endswith(".json"): - return "json" - elif outputLabelLower.endswith(".csv"): - return "csv" - else: - # Default to markdown - return "markdown" - - def _searchWeb(self, query: str) -> List[Dict[str, str]]: - """ - Conduct a web search using SerpAPI and return the results. - - Args: - query: The search query - - Returns: - List of search results - """ - if not self.srcApikey: - return [] - - # Get user language from serviceBase if available - userLanguage = "en" # Default language - if self.service.base.userLanguage: - userLanguage = self.service.base.userLanguage - - try: - # Format the search request for SerpAPI - params = { - "engine": self.srcEngine, - "q": query, - "api_key": self.srcApikey, - "num": self.maxResults, # Number of results to return - "hl": userLanguage # Identified user language - } - - # Make the API request - response = requests.get("https://serpapi.com/search", params=params, timeout=self.timeout) - response.raise_for_status() - - # Parse JSON response - search_results = response.json() - - # Extract organic results - results = [] - - if "organic_results" in search_results: - for result in search_results["organic_results"][:self.maxResults]: - # Extract title - title = result.get("title", "No title") - - # Extract URL - url = result.get("link", "No URL") - - # Extract snippet - snippet = result.get("snippet", "No description") - - # Get actual page content - try: - targetPageSoup = self._readUrl(url) - content = self._extractMainContent(targetPageSoup) - except Exception as e: - logger.warning(f"Error extracting content from {url}: {str(e)}") - content = f"Error extracting content: {str(e)}" - - results.append({ - 'title': title, - 'url': url, - 'snippet': snippet, - 'data': content - }) - - # Limit number of results - if len(results) >= self.maxResults: - break - else: - logger.warning(f"No organic results found in SerpAPI response for: {query}") - - return results - - except Exception as e: - logger.error(f"Error searching with SerpAPI for {query}: {str(e)}") - return [] - - def _readUrl(self, url: str) -> BeautifulSoup: - """ - Read a URL and return a BeautifulSoup parser for the content. - - Args: - url: The URL to read - - Returns: - BeautifulSoup object with the content or None on errors - """ - if not url or not url.startswith(('http://', 'https://')): - return None - - headers = { - 'User-Agent': self.userAgent, - 'Accept': 'text/html,application/xhtml+xml,application/xml', - 'Accept-Language': 'en-US,en;q=0.9', - } - - try: - # Initial request - response = requests.get(url, headers=headers, timeout=self.timeout) - - # Handling for status 202 - if response.status_code == 202: - # Retry with backoff - backoffTimes = [0.5, 1.0, 2.0, 5.0] - - for waitTime in backoffTimes: - time.sleep(waitTime) - response = requests.get(url, headers=headers, timeout=self.timeout) - - if response.status_code != 202: - break - - # Raise for error status codes - response.raise_for_status() - - # Parse HTML - return BeautifulSoup(response.text, 'html.parser') - - except Exception as e: - logger.error(f"Error reading URL {url}: {str(e)}") - return None - - def _extractTitle(self, soup: BeautifulSoup, url: str) -> str: - """ - Extract the title from a webpage. - - Args: - soup: BeautifulSoup object of the webpage - url: URL of the webpage - - Returns: - Extracted title - """ - if not soup: - return f"Error with {url}" - - # Extract title from title tag - titleTag = soup.find('title') - title = titleTag.text.strip() if titleTag else "No title" - - # Alternative: Also look for h1 tags if title tag is missing - if title == "No title": - h1Tag = soup.find('h1') - if h1Tag: - title = h1Tag.text.strip() - - return title - - def _extractMainContent(self, soup: BeautifulSoup, maxChars: int = 10000) -> str: - """ - Extract the main content from an HTML page. - - Args: - soup: BeautifulSoup object of the webpage - maxChars: Maximum number of characters - - Returns: - Extracted main content as a string - """ - if not soup: - return "" - - # Try to find main content elements in priority order - mainContent = None - for selector in ['main', 'article', '#content', '.content', '#main', '.main']: - content = soup.select_one(selector) - if content: - mainContent = content - break - - # If no main content found, use the body - if not mainContent: - mainContent = soup.find('body') or soup - - # Remove script, style, nav, footer elements that don't contribute to main content - for element in mainContent.select('script, style, nav, footer, header, aside, .sidebar, #sidebar, .comments, #comments, .advertisement, .ads, iframe'): - element.extract() - - # Extract text content - textContent = mainContent.get_text(separator=' ', strip=True) - - # Limit to maxChars - return textContent[:maxChars] - - def _limitText(self, text: str, maxChars: int = 10000) -> str: - """ - Limit text to a maximum number of characters. - - Args: - text: Input text - maxChars: Maximum number of characters - - Returns: - Limited text - """ - if not text: - return "" - - # If text is already under the limit, return unchanged - if len(text) <= maxChars: - return text - - # Otherwise limit text to maxChars - return text[:maxChars] + "... [Content truncated due to length]" - - -# Factory function for the Webcrawler agent -def getAgentWebcrawler(): - """Returns an instance of the Webcrawler agent.""" - return AgentWebcrawler() \ No newline at end of file diff --git a/modules/interfaces/serviceChatClass.py b/modules/interfaces/serviceChatClass.py index 509eeb55..b14ee795 100644 --- a/modules/interfaces/serviceChatClass.py +++ b/modules/interfaces/serviceChatClass.py @@ -12,7 +12,6 @@ from typing import Dict, Any, List, Optional, Union import hashlib import asyncio -from modules.shared.mimeUtils import isTextMimeType from modules.interfaces.serviceChatAccess import ChatAccess from modules.interfaces.serviceChatModel import ( ChatContent, ChatDocument, ChatStat, ChatMessage, @@ -20,6 +19,7 @@ from modules.interfaces.serviceChatModel import ( Task, TaskPlan, UserInputRequest ) from modules.interfaces.serviceAppModel import User +from modules.workflow.managerDocument import DocumentManager # DYNAMIC PART: Connectors to the Interface from modules.connectors.connectorDbJson import DatabaseConnector @@ -46,6 +46,9 @@ class ChatInterface: self.mandateId = currentUser.mandateId if currentUser else None self.access = None # Will be set when user context is provided + # Initialize services + self._initializeServices() + # Initialize database self._initializeDatabase() @@ -53,6 +56,10 @@ class ChatInterface: if currentUser: self.setUserContext(currentUser) + def _initializeServices(self): + """Initialize service dependencies""" + self.documentManager = DocumentManager(self.service) + def setUserContext(self, currentUser: User): """Sets the user context for the interface.""" if not currentUser: @@ -380,23 +387,9 @@ class ChatInterface: messageData["id"] = f"msg_{uuid.uuid4()}" logger.warning(f"Automatically generated ID for workflow message: {messageData['id']}") - # Ensure required fields are present - if "startedAt" not in messageData and "createdAt" not in messageData: - messageData["startedAt"] = self._getCurrentTimestamp() - - if "createdAt" in messageData and "startedAt" not in messageData: - messageData["startedAt"] = messageData["createdAt"] - del messageData["createdAt"] - # Set status if not present if "status" not in messageData: - messageData["status"] = "completed" - - # Set sequence number if not present - if "sequenceNo" not in messageData: - # Get current messages to determine next sequence number - existingMessages = self.getWorkflowMessages(workflowId) - messageData["sequenceNo"] = len(existingMessages) + 1 + messageData["status"] = "step" # Default status for intermediate messages # Ensure role and agentName are present if "role" not in messageData: @@ -427,10 +420,9 @@ class ChatInterface: documents=[ChatDocument(**doc) for doc in createdMessage.get("documents", [])], message=createdMessage.get("message"), role=createdMessage.get("role", "assistant"), - status=createdMessage.get("status", "completed"), - sequenceNr=createdMessage.get("sequenceNo", 0), - startedAt=createdMessage.get("startedAt", self._getCurrentTimestamp()), - finishedAt=createdMessage.get("finishedAt"), + status=createdMessage.get("status", "step"), + sequenceNr=len(messageIds), # Set sequence number based on message position + publishedAt=createdMessage.get("publishedAt", self._getCurrentTimestamp()), stats=ChatStat(**createdMessage.get("stats", {})) if createdMessage.get("stats") else None ) except Exception as e: @@ -848,7 +840,6 @@ class ChatInterface: async def workflowStart(self, userInput: UserInputRequest, workflowId: Optional[str] = None) -> ChatWorkflow: """ Starts a new workflow or continues an existing one. - Corresponds to State 1 in the state machine documentation. Args: userInput: The user input request containing workflow initialization data @@ -861,29 +852,40 @@ class ChatInterface: # Get current timestamp currentTime = self._getCurrentTimestamp() + # Process files if any + documents = [] + if userInput.listFileId: + documents = await self._processFileIds(userInput.listFileId) + + # Create initial message + initialMessage = ChatMessage( + id=str(uuid.uuid4()), + role="user", + content=userInput.prompt, + timestamp=currentTime, + documents=documents + ) + if workflowId: # Continue existing workflow workflow = self.getWorkflow(workflowId) if not workflow: raise ValueError(f"Workflow {workflowId} not found") - # Update workflow status - workflow.status = "running" - workflow.lastActivity = currentTime - - # Update in database - self.updateWorkflow(workflowId, { - "status": "running", - "lastActivity": currentTime + # Add message to workflow + self.createWorkflowMessage({ + "workflowId": workflowId, + "messageId": initialMessage.id, + "role": initialMessage.role, + "content": initialMessage.content, + "timestamp": initialMessage.timestamp, + "documents": [doc.dict() for doc in initialMessage.documents] }) - # Add log entry - self.createWorkflowLog({ - "workflowId": workflowId, - "message": "Workflow continued", - "type": "info", - "status": "running", - "progress": 0 + # Update workflow + self.updateWorkflow(workflowId, { + "lastActivity": currentTime, + "currentRound": workflow.currentRound + 1 }) else: @@ -895,10 +897,10 @@ class ChatInterface: "lastActivity": currentTime, "currentRound": 1, "mandateId": self.mandateId, - "messageIds": [], + "messageIds": [initialMessage.id], "dataStats": { - "totalMessages": 0, - "totalDocuments": 0, + "totalMessages": 1, + "totalDocuments": len(documents), "totalTokens": 0 } } @@ -906,6 +908,16 @@ class ChatInterface: # Create workflow workflow = self.createWorkflow(workflowData) + # Add initial message + self.createWorkflowMessage({ + "workflowId": workflow.id, + "messageId": initialMessage.id, + "role": initialMessage.role, + "content": initialMessage.content, + "timestamp": initialMessage.timestamp, + "documents": [doc.dict() for doc in initialMessage.documents] + }) + # Add log entry self.createWorkflowLog({ "workflowId": workflow.id, @@ -916,8 +928,8 @@ class ChatInterface: }) # Start workflow processing - from modules.workflow.workflowManager import getWorkflowManager - workflowManager = await getWorkflowManager(self) + from modules.workflow.managerWorkflow import WorkflowManager + workflowManager = WorkflowManager(self) asyncio.create_task(workflowManager.workflowProcess(userInput, workflow)) return workflow @@ -979,30 +991,22 @@ class ChatInterface: """ documents = [] for fileId in fileIds: - try: - # Get file content - fileContent = self.service.functions.getFileData(fileId) - if not fileContent: - continue - - # Get file metadata - fileMetadata = self.service.functions.getFile(fileId) - if not fileMetadata: - continue - - # Create ChatDocument - document = ChatDocument( - id=str(uuid.uuid4()), - fileId=fileId, - filename=fileMetadata.get("name", "Unknown"), - fileSize=fileMetadata.get("size", 0), - content=fileContent.decode('utf-8', errors='ignore'), - mimeType=fileMetadata.get("mimeType", "text/plain") - ) - documents.append(document) - except Exception as e: - logger.error(f"Error processing file {fileId}: {str(e)}") + # Get file metadata + fileMetadata = self.service.functions.getFile(fileId) + if not fileMetadata: + logger.warning(f"File metadata not found for {fileId}") continue + + # Create ChatDocument + document = ChatDocument( + id=str(uuid.uuid4()), + fileId=fileId, + filename=fileMetadata.get("name", "Unknown"), + fileSize=fileMetadata.get("size", 0), + mimeType=fileMetadata.get("mimeType", "text/plain") + ) + + documents.append(document) return documents diff --git a/modules/interfaces/serviceChatModel.py b/modules/interfaces/serviceChatModel.py index a64bb7cd..22a1f81f 100644 --- a/modules/interfaces/serviceChatModel.py +++ b/modules/interfaces/serviceChatModel.py @@ -4,12 +4,38 @@ Chat model classes for the chat system. from pydantic import BaseModel, Field from typing import List, Dict, Any, Optional, Union -from datetime import datetime +from datetime import datetime, UTC import uuid +from enum import Enum from modules.shared.attributeUtils import register_model_labels, ModelMixin +# ENUMS + +class TaskStatus(str, Enum): + """Task status enumeration""" + PENDING = "pending" + RUNNING = "running" + COMPLETED = "completed" + FAILED = "failed" + CANCELLED = "cancelled" + ROLLED_BACK = "rolled_back" + +# Register labels for TaskStatus +register_model_labels( + "TaskStatus", + {"en": "Task Status", "fr": "Statut de la tâche"}, + { + "PENDING": {"en": "Pending", "fr": "En attente"}, + "RUNNING": {"en": "Running", "fr": "En cours"}, + "COMPLETED": {"en": "Completed", "fr": "Terminé"}, + "FAILED": {"en": "Failed", "fr": "Échec"}, + "CANCELLED": {"en": "Cancelled", "fr": "Annulé"}, + "ROLLED_BACK": {"en": "Rolled Back", "fr": "Annulé"} + } +) + # USER MODELS class UserInputRequest(BaseModel, ModelMixin): @@ -28,24 +54,49 @@ register_model_labels( } ) -# WORKFLOW MODELS +# DOCUMENT MODELS -class ChatContent(BaseModel, ModelMixin): - """Data model for chat content""" - sequenceNr: int = Field(description="Sequence number of the content") - name: str = Field(description="Name of the content") - data: str = Field(description="The actual content data") - mimeType: str = Field(description="MIME type of the content") - metadata: Dict[str, Any] = Field(default_factory=dict, description="Additional metadata") -# Register labels for ChatContent +class ContentMetadata(BaseModel, ModelMixin): + """Metadata for content items""" + size: int = Field(description="Content size in bytes") + pages: Optional[int] = Field(None, description="Number of pages for multi-page content") + error: Optional[str] = Field(None, description="Processing error if any") + # Media-specific attributes + width: Optional[int] = Field(None, description="Width in pixels for images/videos") + height: Optional[int] = Field(None, description="Height in pixels for images/videos") + colorMode: Optional[str] = Field(None, description="Color mode (e.g., RGB, CMYK, grayscale)") + fps: Optional[float] = Field(None, description="Frames per second for videos") + durationSec: Optional[float] = Field(None, description="Duration in seconds for videos/audio") + +# Register labels for ContentMetadata register_model_labels( - "ChatContent", - {"en": "Chat Content", "fr": "Contenu de chat"}, + "ContentMetadata", + {"en": "Content Metadata", "fr": "Métadonnées du contenu"}, { - "sequenceNr": {"en": "Sequence Number", "fr": "Numéro de séquence"}, - "name": {"en": "Name", "fr": "Nom"}, + "size": {"en": "Size", "fr": "Taille"}, + "pages": {"en": "Pages", "fr": "Pages"}, + "error": {"en": "Error", "fr": "Erreur"}, + "width": {"en": "Width", "fr": "Largeur"}, + "height": {"en": "Height", "fr": "Hauteur"}, + "colorMode": {"en": "Color Mode", "fr": "Mode de couleur"}, + "fps": {"en": "FPS", "fr": "IPS"}, + "durationSec": {"en": "Duration", "fr": "Durée"} + } +) + +class ContentItem(BaseModel, ModelMixin): + """Individual content item from a document""" + label: str = Field(description="Content label (e.g., tab name, tag name)") + data: str = Field(description="Extracted text content") + metadata: ContentMetadata = Field(description="Content metadata") + +# Register labels for ContentItem +register_model_labels( + "ContentItem", + {"en": "Content Item", "fr": "Élément de contenu"}, + { + "label": {"en": "Label", "fr": "Étiquette"}, "data": {"en": "Data", "fr": "Données"}, - "mimeType": {"en": "MIME Type", "fr": "Type MIME"}, "metadata": {"en": "Metadata", "fr": "Métadonnées"} } ) @@ -57,7 +108,7 @@ class ChatDocument(BaseModel, ModelMixin): filename: str = Field(description="Name of the file") fileSize: int = Field(description="Size of the file") mimeType: str = Field(description="MIME type of the file") - contents: List[ChatContent] = Field(default_factory=list, description="List of chat contents") + # Register labels for ChatDocument register_model_labels( "ChatDocument", @@ -67,11 +118,50 @@ register_model_labels( "fileId": {"en": "File ID", "fr": "ID du fichier"}, "filename": {"en": "Filename", "fr": "Nom de fichier"}, "fileSize": {"en": "File Size", "fr": "Taille du fichier"}, + "mimeType": {"en": "MIME Type", "fr": "Type MIME"} + } +) + +class TaskDocument(BaseModel, ModelMixin): + """Data model for a task document""" + id: str = Field(default_factory=lambda: str(uuid.uuid4()), description="Primary key") + data: str = Field(description="Base64 encoded file data") + filename: str = Field(description="Name of the file") + fileSize: int = Field(description="Size of the file") + mimeType: str = Field(description="MIME type of the file") + +# Register labels for TaskDocument +register_model_labels( + "TaskDocument", + {"en": "Task Document", "fr": "Document de tâche"}, + { + "id": {"en": "ID", "fr": "ID"}, + "filename": {"en": "Filename", "fr": "Nom de fichier"}, + "fileSize": {"en": "File Size", "fr": "Taille du fichier"}, "mimeType": {"en": "MIME Type", "fr": "Type MIME"}, + "data": {"en": "Data", "fr": "Données"} + } +) + +class ExtractedContent(BaseModel, ModelMixin): + """Data model for extracted content""" + objectId: str = Field(description="Reference to source document") + objectType: str = Field(description="Type of source object ('ChatDocument' or 'TaskDocument')") + contents: List[ContentItem] = Field(default_factory=list, description="List of content items") + +# Register labels for ExtractedContent +register_model_labels( + "ExtractedContent", + {"en": "Extracted Content", "fr": "Contenu extrait"}, + { + "objectId": {"en": "Object ID", "fr": "ID de l'objet"}, + "objectType": {"en": "Object Type", "fr": "Type d'objet"}, "contents": {"en": "Contents", "fr": "Contenus"} } ) +# WORKFLOW MODELS + class ChatStat(BaseModel, ModelMixin): """Data model for chat statistics""" id: str = Field(default_factory=lambda: str(uuid.uuid4()), description="Primary key") @@ -133,10 +223,9 @@ class ChatMessage(BaseModel, ModelMixin): documents: List[ChatDocument] = Field(default_factory=list, description="Associated documents") message: Optional[str] = Field(None, description="Message content") role: str = Field(description="Role of the message sender") - status: str = Field(description="Status of the message") - sequenceNr: int = Field(description="Sequence number of the message") - startedAt: str = Field(description="When the message processing started") - finishedAt: Optional[str] = Field(None, description="When the message processing finished") + status: str = Field(description="Status of the message (first, step, last)") + sequenceNr: int = Field(description="Sequence number of the message (set automatically)") + publishedAt: str = Field(description="When the message was published") stats: Optional[ChatStat] = Field(None, description="Statistics for this message") success: Optional[bool] = Field(None, description="Whether the message processing was successful") # Register labels for ChatMessage @@ -153,29 +242,92 @@ register_model_labels( "role": {"en": "Role", "fr": "Rôle"}, "status": {"en": "Status", "fr": "Statut"}, "sequenceNr": {"en": "Sequence Number", "fr": "Numéro de séquence"}, - "startedAt": {"en": "Started At", "fr": "Démarré le"}, - "finishedAt": {"en": "Finished At", "fr": "Terminé le"}, + "publishedAt": {"en": "Published At", "fr": "Publié le"}, "stats": {"en": "Statistics", "fr": "Statistiques"}, "success": {"en": "Success", "fr": "Succès"} } ) class AgentTask(BaseModel, ModelMixin): - """Data model for a task""" - id: str = Field(default_factory=lambda: str(uuid.uuid4()), description="Primary key") - workflowId: str = Field(description="Foreign key to workflow") - agentName: str = Field(description="Name of the agent assigned to this task") - status: str = Field(description="Current status of the task") - progress: float = Field(description="Task progress (0-100)") - prompt: str = Field(description="Prompt for the task") - userLanguage: str = Field(description="User's preferred language") - filesInput: List[str] = Field(default_factory=list, description="Input files") - filesOutput: List[str] = Field(default_factory=list, description="Output files") - result: Optional[ChatMessage] = Field(None, description="Task result message") - error: Optional[str] = Field(None, description="Error message if failed") - startedAt: str = Field(description="When the task started") - finishedAt: Optional[str] = Field(None, description="When the task finished") - performance: Optional[Dict[str, Any]] = Field(None, description="Performance metrics") + """Model for agent tasks""" + id: str = Field(..., description="Unique task identifier") + workflowId: str = Field(..., description="Associated workflow ID") + status: TaskStatus = Field(default=TaskStatus.PENDING, description="Current task status") + error: Optional[str] = Field(None, description="Error message if task failed") + startedAt: Optional[datetime] = Field(None, description="Task start timestamp") + finishedAt: Optional[datetime] = Field(None, description="Task completion timestamp") + actionList: List[Dict[str, Any]] = Field(default_factory=list, description="List of actions to execute") + documentsOutput: List[Dict[str, Any]] = Field(default_factory=list, description="Output documents") + retryCount: int = Field(default=0, description="Number of retry attempts") + retryMax: int = Field(default=3, description="Maximum number of retry attempts") + rollbackOnFailure: bool = Field(default=True, description="Whether to rollback on failure") + dependencies: List[str] = Field(default_factory=list, description="List of dependent task IDs") + thisTaskFeedback: Optional[Dict[str, Any]] = Field(None, description="Task feedback data") + + def isCompleted(self) -> bool: + """Check if task is completed""" + return self.status == TaskStatus.COMPLETED + + def isFailed(self) -> bool: + """Check if task has failed""" + return self.status == TaskStatus.FAILED + + def canRetry(self) -> bool: + """Check if task can be retried""" + return self.retryCount < self.retryMax + + def start(self) -> None: + """Start the task""" + self.status = TaskStatus.RUNNING + self.startedAt = datetime.now(UTC) + + def complete(self) -> None: + """Mark task as completed""" + self.status = TaskStatus.COMPLETED + self.finishedAt = datetime.now(UTC) + + def fail(self, error: str) -> None: + """Mark task as failed""" + self.status = TaskStatus.FAILED + self.error = error + self.finishedAt = datetime.now(UTC) + + def cancel(self) -> None: + """Cancel the task""" + self.status = TaskStatus.CANCELLED + self.finishedAt = datetime.now(UTC) + + def rollback(self) -> None: + """Mark task as rolled back""" + self.status = TaskStatus.ROLLED_BACK + self.finishedAt = datetime.now(UTC) + + def incrementRetry(self) -> None: + """Increment retry count""" + self.retryCount += 1 + + def addDependency(self, taskId: str) -> None: + """Add a task dependency""" + if taskId not in self.dependencies: + self.dependencies.append(taskId) + + def removeDependency(self, taskId: str) -> None: + """Remove a task dependency""" + if taskId in self.dependencies: + self.dependencies.remove(taskId) + + def addAction(self, action: Dict[str, Any]) -> None: + """Add an action to the task""" + self.actionList.append(action) + + def addDocumentOutput(self, document: Dict[str, Any]) -> None: + """Add an output document""" + self.documentsOutput.append(document) + + def setFeedback(self, feedback: Dict[str, Any]) -> None: + """Set task feedback""" + self.thisTaskFeedback = feedback + # Register labels for AgentTask register_model_labels( "AgentTask", @@ -183,42 +335,21 @@ register_model_labels( { "id": {"en": "ID", "fr": "ID"}, "workflowId": {"en": "Workflow ID", "fr": "ID du flux de travail"}, - "agentName": {"en": "Agent Name", "fr": "Nom de l'agent"}, "status": {"en": "Status", "fr": "Statut"}, - "progress": {"en": "Progress", "fr": "Progression"}, - "prompt": {"en": "Prompt", "fr": "Invite"}, - "userLanguage": {"en": "User Language", "fr": "Langue de l'utilisateur"}, - "filesInput": {"en": "Input Files", "fr": "Fichiers d'entrée"}, - "filesOutput": {"en": "Output Files", "fr": "Fichiers de sortie"}, - "result": {"en": "Result", "fr": "Résultat"}, "error": {"en": "Error", "fr": "Erreur"}, "startedAt": {"en": "Started At", "fr": "Démarré le"}, "finishedAt": {"en": "Finished At", "fr": "Terminé le"}, - "performance": {"en": "Performance", "fr": "Performance"} + "actionList": {"en": "Action List", "fr": "Liste d'actions"}, + "documentsOutput": {"en": "Output Documents", "fr": "Documents de sortie"}, + "retryCount": {"en": "Retry Count", "fr": "Nombre de tentatives"}, + "retryMax": {"en": "Max Retries", "fr": "Tentatives maximales"}, + "rollbackOnFailure": {"en": "Rollback on Failure", "fr": "Annulation en cas d'échec"}, + "dependencies": {"en": "Dependencies", "fr": "Dépendances"}, + "thisTaskFeedback": {"en": "Task Feedback", "fr": "Retour sur la tâche"} } ) -class Agent(BaseModel, ModelMixin): - """Data model for an agent""" - id: str = Field(default_factory=lambda: str(uuid.uuid4()), description="Primary key") - name: str = Field(description="Name of the agent") - description: str = Field(description="Description of the agent") - capabilities: List[str] = Field(default_factory=list, description="List of agent capabilities") - performance: Optional[Dict[str, Any]] = Field(None, description="Performance metrics") -# Register labels for Agent -register_model_labels( - "Agent", - {"en": "Agent", "fr": "Agent"}, - { - "id": {"en": "ID", "fr": "ID"}, - "name": {"en": "Name", "fr": "Nom"}, - "description": {"en": "Description", "fr": "Description"}, - "capabilities": {"en": "Capabilities", "fr": "Capacités"}, - "performance": {"en": "Performance", "fr": "Performance"} - } -) - -# WORKFLOW MODELS +# WORKFLOW MODEL class ChatWorkflow(BaseModel, ModelMixin): """Data model for a chat workflow""" @@ -251,125 +382,3 @@ register_model_labels( "tasks": {"en": "Tasks", "fr": "Tâches"} } ) - -# DOCUMENT MODELS - -class DocumentExtraction(BaseModel, ModelMixin): - """Data model for document extraction history""" - timestamp: str = Field(description="Timestamp of extraction") - type: str = Field(description="Type of document") - sections: List[str] = Field(default_factory=list, description="Extracted sections") - metadata: Dict[str, Any] = Field(default_factory=dict, description="Extraction metadata") - -# Register labels for DocumentExtraction -register_model_labels( - "DocumentExtraction", - {"en": "Document Extraction", "fr": "Extraction de document"}, - { - "timestamp": {"en": "Timestamp", "fr": "Horodatage"}, - "type": {"en": "Type", "fr": "Type"}, - "sections": {"en": "Sections", "fr": "Sections"}, - "metadata": {"en": "Metadata", "fr": "Métadonnées"} - } -) - -class DocumentContext(BaseModel, ModelMixin): - """Data model for document context""" - id: str = Field(description="Document ID") - extractionHistory: List[DocumentExtraction] = Field(default_factory=list, description="History of extractions") - relevantSections: List[str] = Field(default_factory=list, description="Relevant sections") - processingStatus: Dict[str, str] = Field(default_factory=dict, description="Processing status") - -# Register labels for DocumentContext -register_model_labels( - "DocumentContext", - {"en": "Document Context", "fr": "Contexte de document"}, - { - "id": {"en": "ID", "fr": "ID"}, - "extractionHistory": {"en": "Extraction History", "fr": "Historique d'extraction"}, - "relevantSections": {"en": "Relevant Sections", "fr": "Sections pertinentes"}, - "processingStatus": {"en": "Processing Status", "fr": "Statut de traitement"} - } -) - -class DocumentMetadata(BaseModel, ModelMixin): - """Data model for document metadata""" - type: str = Field(description="Document type") - format: str = Field(description="Document format") - size: int = Field(description="Document size in bytes") - pages: Optional[int] = Field(None, description="Number of pages") - sections: Optional[List[str]] = Field(None, description="Document sections") - error: Optional[str] = Field(None, description="Processing error if any") - -# Register labels for DocumentMetadata -register_model_labels( - "DocumentMetadata", - {"en": "Document Metadata", "fr": "Métadonnées de document"}, - { - "type": {"en": "Type", "fr": "Type"}, - "format": {"en": "Format", "fr": "Format"}, - "size": {"en": "Size", "fr": "Taille"}, - "pages": {"en": "Pages", "fr": "Pages"}, - "sections": {"en": "Sections", "fr": "Sections"}, - "error": {"en": "Error", "fr": "Erreur"} - } -) - -class ImageData(BaseModel, ModelMixin): - """Data model for image data""" - data: str = Field(description="Base64 encoded image data") - format: str = Field(description="Image format") - page: Optional[int] = Field(None, description="Page number if from a multi-page document") - index: Optional[int] = Field(None, description="Image index in the document") - -# Register labels for ImageData -register_model_labels( - "ImageData", - {"en": "Image Data", "fr": "Données d'image"}, - { - "data": {"en": "Image Data", "fr": "Données d'image"}, - "format": {"en": "Format", "fr": "Format"}, - "page": {"en": "Page", "fr": "Page"}, - "index": {"en": "Index", "fr": "Index"} - } -) - -class DocumentContent(BaseModel, ModelMixin): - """Data model for document content""" - text: Optional[str] = Field(None, description="Extracted text content") - data: Optional[Dict[str, Any]] = Field(None, description="Structured data content") - images: Optional[List[ImageData]] = Field(None, description="Extracted images") - metadata: DocumentMetadata = Field(description="Document metadata") - -# Register labels for DocumentContent -register_model_labels( - "DocumentContent", - {"en": "Document Content", "fr": "Contenu de document"}, - { - "text": {"en": "Text", "fr": "Texte"}, - "data": {"en": "Data", "fr": "Données"}, - "images": {"en": "Images", "fr": "Images"}, - "metadata": {"en": "Metadata", "fr": "Métadonnées"} - } -) - -class ProcessedDocument(BaseModel, ModelMixin): - """Data model for processed document""" - id: str = Field(default_factory=lambda: str(uuid.uuid4()), description="Document ID") - name: str = Field(description="Document name") - contentType: str = Field(description="Content type") - content: DocumentContent = Field(description="Document content") - context: Optional[DocumentContext] = Field(None, description="Document context") - -# Register labels for ProcessedDocument -register_model_labels( - "ProcessedDocument", - {"en": "Processed Document", "fr": "Document traité"}, - { - "id": {"en": "ID", "fr": "ID"}, - "name": {"en": "Name", "fr": "Nom"}, - "contentType": {"en": "Content Type", "fr": "Type de contenu"}, - "content": {"en": "Content", "fr": "Contenu"}, - "context": {"en": "Context", "fr": "Contexte"} - } -) diff --git a/modules/interfaces/serviceManagementClass.py b/modules/interfaces/serviceManagementClass.py index 4d398eeb..bdead684 100644 --- a/modules/interfaces/serviceManagementClass.py +++ b/modules/interfaces/serviceManagementClass.py @@ -11,7 +11,6 @@ from typing import Dict, Any, List, Optional, Union import hashlib -from modules.shared.mimeUtils import isTextMimeType from modules.interfaces.serviceManagementAccess import ManagementAccess from modules.interfaces.serviceManagementModel import ( Prompt, FileItem, FileData diff --git a/modules/methods/methodBase.py b/modules/methods/methodBase.py index 7618b5a8..2eaa845f 100644 --- a/modules/methods/methodBase.py +++ b/modules/methods/methodBase.py @@ -2,8 +2,12 @@ from enum import Enum from typing import Dict, List, Optional, Any, Literal from datetime import datetime, UTC from pydantic import BaseModel, Field +import logging + +logger = logging.getLogger(__name__) class AuthSource(str, Enum): + """Authentication source enumeration""" LOCAL = "local" MSFT = "msft" GOOGLE = "google" @@ -23,52 +27,122 @@ class MethodResult(BaseModel): data: Dict[str, Any] metadata: Dict[str, Any] = Field(default_factory=dict) validation: List[str] = Field(default_factory=list) + error: Optional[str] = Field(None, description="Error message if any") class MethodBase: """Base class for all methods""" - def __init__(self): + def __init__(self, serviceContainer: Any): + """Initialize method with service container""" + self.service = serviceContainer self.name: str self.description: str - self.auth_source: AuthSource = AuthSource.LOCAL # Default to local auth + self.authSource: AuthSource = AuthSource.LOCAL # Default to local auth + self.logger = logging.getLogger(f"{__name__}.{self.__class__.__name__}") @property def actions(self) -> Dict[str, Dict[str, Any]]: """Available actions and their parameters""" raise NotImplementedError - async def execute(self, action: str, parameters: Dict[str, Any], auth_data: Optional[Dict[str, Any]] = None) -> MethodResult: - """Execute method action with authentication data""" + async def execute(self, action: str, parameters: Dict[str, Any], authData: Optional[Dict[str, Any]] = None) -> MethodResult: + """ + Execute method action with authentication data + + Args: + action: The action to execute + parameters: Action parameters + authData: Authentication data + + Returns: + MethodResult containing execution results + + Raises: + ValueError: If action is not supported + RuntimeError: If authentication fails + """ + try: + # Validate action + if action not in self.actions: + raise ValueError(f"Unsupported action: {action}") + + # Validate parameters + if not await self.validateParameters(action, parameters): + return self._createResult( + success=False, + data={}, + error="Invalid parameters" + ) + + # Validate authentication + if not self._validateAuth(authData): + return self._createResult( + success=False, + data={}, + error="Authentication failed" + ) + + # Execute action + return await self._executeAction(action, parameters, authData) + + except Exception as e: + self.logger.error(f"Error executing action {action}: {str(e)}") + return self._createResult( + success=False, + data={}, + error=str(e) + ) + + async def _executeAction(self, action: str, parameters: Dict[str, Any], authData: Optional[Dict[str, Any]] = None) -> MethodResult: + """Execute specific action - to be implemented by subclasses""" raise NotImplementedError - async def validate_parameters(self, action: str, parameters: Dict[str, Any]) -> bool: + async def validateParameters(self, action: str, parameters: Dict[str, Any]) -> bool: """Validate action parameters""" - if action not in self.actions: - return False + try: + if action not in self.actions: + return False + + actionDef = self.actions[action] + requiredParams = {k for k, v in actionDef['parameters'].items() if v['required']} + return all(param in parameters for param in requiredParams) - action_def = self.actions[action] - required_params = {k for k, v in action_def['parameters'].items() if v['required']} - return all(param in parameters for param in required_params) + except Exception as e: + self.logger.error(f"Error validating parameters: {str(e)}") + return False - async def rollback(self, action: str, parameters: Dict[str, Any], auth_data: Optional[Dict[str, Any]] = None) -> None: + async def rollback(self, action: str, parameters: Dict[str, Any], authData: Optional[Dict[str, Any]] = None) -> None: """Rollback action if needed""" + try: + await self._rollbackAction(action, parameters, authData) + except Exception as e: + self.logger.error(f"Error rolling back action {action}: {str(e)}") + raise + + async def _rollbackAction(self, action: str, parameters: Dict[str, Any], authData: Optional[Dict[str, Any]] = None) -> None: + """Rollback specific action - to be implemented by subclasses""" pass - def _validate_auth(self, auth_data: Optional[Dict[str, Any]] = None) -> bool: + def _validateAuth(self, authData: Optional[Dict[str, Any]] = None) -> bool: """Validate authentication data""" - if self.auth_source == AuthSource.LOCAL: - return True - return bool(auth_data and auth_data.get('source') == self.auth_source) + try: + if self.authSource == AuthSource.LOCAL: + return True + return bool(authData and authData.get('source') == self.authSource) + except Exception as e: + self.logger.error(f"Error validating auth: {str(e)}") + return False - def _create_result(self, success: bool, data: Dict[str, Any], metadata: Optional[Dict[str, Any]] = None) -> MethodResult: + def _createResult(self, success: bool, data: Dict[str, Any], metadata: Optional[Dict[str, Any]] = None, error: Optional[str] = None) -> MethodResult: """Create a method result""" return MethodResult( success=success, data=data, metadata=metadata or {}, - validation=[] + validation=[], + error=error ) - def _add_validation_message(self, result: MethodResult, message: str) -> None: + def _addValidationMessage(self, result: MethodResult, message: str) -> None: """Add a validation message to the result""" result.validation.append(message) \ No newline at end of file diff --git a/modules/methods/methodCoder.py b/modules/methods/methodCoder.py index d39c1b08..f9b06f21 100644 --- a/modules/methods/methodCoder.py +++ b/modules/methods/methodCoder.py @@ -14,7 +14,7 @@ class MethodCoder(MethodBase): super().__init__() self.name = "coder" self.description = "Handle code operations like analysis, generation, and refactoring" - self.auth_source = AuthSource.LOCAL # Code operations typically don't need auth + self.authSource = AuthSource.LOCAL # Code operations typically don't need auth @property def actions(self) -> Dict[str, Dict[str, Any]]: @@ -52,37 +52,37 @@ class MethodCoder(MethodBase): } } - async def execute(self, action: str, parameters: Dict[str, Any], auth_data: Optional[Dict[str, Any]] = None) -> MethodResult: + async def execute(self, action: str, parameters: Dict[str, Any], authData: Optional[Dict[str, Any]] = None) -> MethodResult: """Execute coder method""" try: # Validate parameters - if not await self.validate_parameters(action, parameters): - return self._create_result( + if not await self.validateParameters(action, parameters): + return self._createResult( success=False, data={"error": f"Invalid parameters for {action}"} ) # Execute action if action == "analyze": - return await self._analyze_code(parameters) + return await self._analyzeCode(parameters) elif action == "generate": - return await self._generate_code(parameters) + return await self._generateCode(parameters) elif action == "refactor": - return await self._refactor_code(parameters) + return await self._refactorCode(parameters) else: - return self._create_result( + return self._createResult( success=False, data={"error": f"Unknown action: {action}"} ) except Exception as e: logger.error(f"Error executing coder {action}: {e}") - return self._create_result( + return self._createResult( success=False, data={"error": str(e)} ) - async def _analyze_code(self, parameters: Dict[str, Any]) -> MethodResult: + async def _analyzeCode(self, parameters: Dict[str, Any]) -> MethodResult: """Analyze code structure and quality""" try: code = parameters["code"] @@ -121,13 +121,13 @@ class MethodCoder(MethodBase): if "complexity" in metrics: for node in ast.walk(tree): if isinstance(node, ast.FunctionDef): - body_lines = len(node.body) - if body_lines > 20: # Arbitrary threshold + bodyLines = len(node.body) + if bodyLines > 20: # Arbitrary threshold analysis["issues"].append({ "type": "long_function", "line": node.lineno, "name": node.name, - "lines": body_lines + "lines": bodyLines }) # Check for style issues @@ -149,18 +149,18 @@ class MethodCoder(MethodBase): }) except SyntaxError as e: - return self._create_result( + return self._createResult( success=False, data={"error": f"Syntax error: {str(e)}"} ) else: # TODO: Implement analysis for other languages - return self._create_result( + return self._createResult( success=False, data={"error": f"Unsupported language: {language}"} ) - return self._create_result( + return self._createResult( success=True, data={ "language": language, @@ -169,12 +169,12 @@ class MethodCoder(MethodBase): ) except Exception as e: logger.error(f"Error analyzing code: {e}") - return self._create_result( + return self._createResult( success=False, data={"error": f"Analysis failed: {str(e)}"} ) - async def _generate_code(self, parameters: Dict[str, Any]) -> MethodResult: + async def _generateCode(self, parameters: Dict[str, Any]) -> MethodResult: """Generate code based on requirements""" try: requirements = parameters["requirements"] @@ -185,8 +185,8 @@ class MethodCoder(MethodBase): # This is a placeholder implementation if language.lower() == "python": # Generate a simple Python class based on requirements - class_name = re.sub(r'[^a-zA-Z0-9]', '', requirements.split()[0].title()) - code = f"""class {class_name}: + className = re.sub(r'[^a-zA-Z0-9]', '', requirements.split()[0].title()) + code = f"""class {className}: \"\"\" {requirements} \"\"\" @@ -198,12 +198,12 @@ class MethodCoder(MethodBase): pass """ else: - return self._create_result( + return self._createResult( success=False, data={"error": f"Unsupported language: {language}"} ) - return self._create_result( + return self._createResult( success=True, data={ "language": language, @@ -212,12 +212,12 @@ class MethodCoder(MethodBase): ) except Exception as e: logger.error(f"Error generating code: {e}") - return self._create_result( + return self._createResult( success=False, data={"error": f"Generation failed: {str(e)}"} ) - async def _refactor_code(self, parameters: Dict[str, Any]) -> MethodResult: + async def _refactorCode(self, parameters: Dict[str, Any]) -> MethodResult: """Refactor code for better quality""" try: code = parameters["code"] @@ -246,17 +246,17 @@ class MethodCoder(MethodBase): pass except SyntaxError as e: - return self._create_result( + return self._createResult( success=False, data={"error": f"Syntax error: {str(e)}"} ) else: - return self._create_result( + return self._createResult( success=False, data={"error": f"Unsupported language: {language}"} ) - return self._create_result( + return self._createResult( success=True, data={ "language": language, @@ -266,7 +266,7 @@ class MethodCoder(MethodBase): ) except Exception as e: logger.error(f"Error refactoring code: {e}") - return self._create_result( + return self._createResult( success=False, data={"error": f"Refactoring failed: {str(e)}"} ) \ No newline at end of file diff --git a/modules/methods/methodDocument.py b/modules/methods/methodDocument.py index 26aab156..1f669d21 100644 --- a/modules/methods/methodDocument.py +++ b/modules/methods/methodDocument.py @@ -1,287 +1,215 @@ -from typing import Dict, Any, Optional -import logging -import os -from pathlib import Path -import docx -import PyPDF2 -import json -import yaml -import xml.etree.ElementTree as ET -from datetime import datetime, UTC +""" +Document processing method module. +Handles document operations using the document service. +""" -from modules.methods.methodBase import MethodBase, AuthSource, MethodResult +import logging +from typing import Dict, Any, List, Optional +from datetime import datetime + +from modules.interfaces.serviceChatModel import ( + ChatDocument, + TaskDocument, + ExtractedContent, + ContentItem +) +from modules.workflow.managerDocument import DocumentManager +from modules.methods.methodBase import MethodBase logger = logging.getLogger(__name__) class MethodDocument(MethodBase): - """Document method implementation for document operations""" + """Document processing method implementation""" - def __init__(self): - super().__init__() - self.name = "document" - self.description = "Handle document operations like reading, writing, and converting documents" - self.auth_source = AuthSource.LOCAL # Document operations typically don't need auth + def __init__(self, serviceContainer): + """Initialize the document method""" + super().__init__(serviceContainer) + self.documentManager = DocumentManager(serviceContainer) + + async def process(self, action: str, parameters: Dict[str, Any]) -> Dict[str, Any]: + """ + Process document operations - @property - def actions(self) -> Dict[str, Dict[str, Any]]: - """Available actions and their parameters""" - return { - "read": { - "description": "Read document content", - "retryMax": 2, - "timeout": 30, - "parameters": { - "path": {"type": "string", "required": True}, - "format": {"type": "string", "required": False}, - "encoding": {"type": "string", "required": False}, - "includeMetadata": {"type": "boolean", "required": False} - } - }, - "write": { - "description": "Write content to document", - "retryMax": 2, - "timeout": 30, - "parameters": { - "path": {"type": "string", "required": True}, - "content": {"type": "string", "required": True}, - "format": {"type": "string", "required": False}, - "encoding": {"type": "string", "required": False}, - "template": {"type": "string", "required": False} - } - }, - "convert": { - "description": "Convert document between formats", - "retryMax": 2, - "timeout": 60, - "parameters": { - "sourcePath": {"type": "string", "required": True}, - "targetPath": {"type": "string", "required": True}, - "sourceFormat": {"type": "string", "required": False}, - "targetFormat": {"type": "string", "required": False}, - "options": {"type": "object", "required": False} - } - } - } - - async def execute(self, action: str, parameters: Dict[str, Any], auth_data: Optional[Dict[str, Any]] = None) -> MethodResult: - """Execute document method""" - try: - # Validate parameters - if not await self.validate_parameters(action, parameters): - return self._create_result( - success=False, - data={"error": f"Invalid parameters for {action}"} - ) + Args: + action: The action to perform + parameters: Action parameters - # Execute action - if action == "read": - return await self._read_document(parameters) - elif action == "write": - return await self._write_document(parameters) - elif action == "convert": - return await self._convert_document(parameters) + Returns: + Dictionary containing the operation result + + Raises: + ValueError: If action is not supported + """ + try: + if action == "extract": + return await self._extractContent(parameters) + elif action == "analyze": + return await self._analyzeDocument(parameters) + elif action == "summarize": + return await self._summarizeDocument(parameters) else: - return self._create_result( - success=False, - data={"error": f"Unknown action: {action}"} - ) + raise ValueError(f"Unsupported action: {action}") + except Exception as e: + logger.error(f"Error processing document action {action}: {str(e)}") + raise + + async def _extractContent(self, parameters: Dict[str, Any]) -> Dict[str, Any]: + """ + Extract content from a document + + Args: + parameters: Dictionary containing: + - documentId: ID of the document to process + - documentType: Type of document ('ChatDocument' or 'TaskDocument') + + Returns: + Dictionary containing extracted content + """ + try: + documentId = parameters.get("documentId") + documentType = parameters.get("documentType", "ChatDocument") + + if not documentId: + raise ValueError("documentId is required") + + # Get document from database + if documentType == "ChatDocument": + document = await self._getChatDocument(documentId) + if not document: + raise ValueError(f"ChatDocument {documentId} not found") + extracted = await self.documentManager.extractFromChatDocument(document) + else: + document = await self._getTaskDocument(documentId) + if not document: + raise ValueError(f"TaskDocument {documentId} not found") + extracted = await self.documentManager.extractFromTaskDocument(document) + + return { + "success": True, + "content": extracted.dict(), + "metadata": await self.documentManager.getDocumentMetadata(document) + } + + except Exception as e: + logger.error(f"Error extracting content: {str(e)}") + return { + "success": False, + "error": str(e) + } + + async def _analyzeDocument(self, parameters: Dict[str, Any]) -> Dict[str, Any]: + """ + Analyze document content + + Args: + parameters: Dictionary containing: + - documentId: ID of the document to analyze + - documentType: Type of document + - analysisType: Type of analysis to perform + + Returns: + Dictionary containing analysis results + """ + try: + # Extract content first + contentResult = await self._extractContent(parameters) + if not contentResult["success"]: + return contentResult + + # Perform analysis based on type + analysisType = parameters.get("analysisType", "basic") + content = ExtractedContent(**contentResult["content"]) + + if analysisType == "basic": + # Basic analysis: count items, calculate statistics + stats = { + "totalItems": len(content.contents), + "totalSize": sum(item.metadata.size for item in content.contents), + "itemTypes": {} + } + + for item in content.contents: + itemType = item.label + if itemType not in stats["itemTypes"]: + stats["itemTypes"][itemType] = 0 + stats["itemTypes"][itemType] += 1 + + return { + "success": True, + "analysis": stats + } + else: + raise ValueError(f"Unsupported analysis type: {analysisType}") except Exception as e: - logger.error(f"Error executing document {action}: {e}") - return self._create_result( - success=False, - data={"error": str(e)} - ) - - async def _read_document(self, parameters: Dict[str, Any]) -> MethodResult: - """Read document content""" - try: - path = Path(parameters["path"]) - if not path.exists(): - return self._create_result( - success=False, - data={"error": f"File not found: {path}"} - ) - - # Determine format if not specified - format = parameters.get("format") - if not format: - format = path.suffix[1:] if path.suffix else "txt" - - # Read content based on format - content = "" - encoding = parameters.get("encoding", "utf-8") - include_metadata = parameters.get("includeMetadata", False) - - if format.lower() in ["txt", "md"]: - with open(path, "r", encoding=encoding) as f: - content = f.read() - elif format.lower() == "docx": - doc = docx.Document(path) - content = "\n".join([paragraph.text for paragraph in doc.paragraphs]) - elif format.lower() == "pdf": - with open(path, "rb") as f: - pdf = PyPDF2.PdfReader(f) - content = "\n".join([page.extract_text() for page in pdf.pages]) - elif format.lower() == "json": - with open(path, "r", encoding=encoding) as f: - content = json.load(f) - elif format.lower() == "yaml": - with open(path, "r", encoding=encoding) as f: - content = yaml.safe_load(f) - elif format.lower() == "xml": - tree = ET.parse(path) - root = tree.getroot() - content = ET.tostring(root, encoding=encoding).decode(encoding) - else: - return self._create_result( - success=False, - data={"error": f"Unsupported format: {format}"} - ) - - result = { - "path": str(path), - "format": format, - "content": content + logger.error(f"Error analyzing document: {str(e)}") + return { + "success": False, + "error": str(e) } - - if include_metadata: - result["metadata"] = { - "size": path.stat().st_size, - "modified": datetime.fromtimestamp(path.stat().st_mtime, UTC).isoformat(), - "created": datetime.fromtimestamp(path.stat().st_ctime, UTC).isoformat() - } - - return self._create_result( - success=True, - data=result - ) - except Exception as e: - logger.error(f"Error reading document: {e}") - return self._create_result( - success=False, - data={"error": f"Read failed: {str(e)}"} - ) - async def _write_document(self, parameters: Dict[str, Any]) -> MethodResult: - """Write content to document""" + async def _summarizeDocument(self, parameters: Dict[str, Any]) -> Dict[str, Any]: + """ + Generate document summary + + Args: + parameters: Dictionary containing: + - documentId: ID of the document to summarize + - documentType: Type of document + - summaryType: Type of summary to generate + + Returns: + Dictionary containing summary + """ try: - path = Path(parameters["path"]) + # Extract content first + contentResult = await self._extractContent(parameters) + if not contentResult["success"]: + return contentResult - # Create directory if it doesn't exist - path.parent.mkdir(parents=True, exist_ok=True) + # Generate summary based on type + summaryType = parameters.get("summaryType", "basic") + content = ExtractedContent(**contentResult["content"]) - # Determine format if not specified - format = parameters.get("format") - if not format: - format = path.suffix[1:] if path.suffix else "txt" - - # Write content based on format - encoding = parameters.get("encoding", "utf-8") - content = parameters["content"] - template = parameters.get("template") - - if format.lower() in ["txt", "md"]: - with open(path, "w", encoding=encoding) as f: - f.write(content) - elif format.lower() == "docx": - if template: - doc = docx.Document(template) - else: - doc = docx.Document() - doc.add_paragraph(content) - doc.save(path) - elif format.lower() == "pdf": - # TODO: Implement PDF writing - return self._create_result( - success=False, - data={"error": "PDF writing not implemented yet"} + if summaryType == "basic": + # Basic summary: concatenate all text content + summary = "\n".join( + item.data for item in content.contents + if item.label == "main" ) - elif format.lower() == "json": - with open(path, "w", encoding=encoding) as f: - json.dump(content, f, indent=2) - elif format.lower() == "yaml": - with open(path, "w", encoding=encoding) as f: - yaml.dump(content, f) - elif format.lower() == "xml": - with open(path, "w", encoding=encoding) as f: - f.write(content) + + return { + "success": True, + "summary": summary + } else: - return self._create_result( - success=False, - data={"error": f"Unsupported format: {format}"} - ) - - return self._create_result( - success=True, - data={ - "path": str(path), - "format": format, - "size": path.stat().st_size, - "modified": datetime.now(UTC).isoformat() - } - ) + raise ValueError(f"Unsupported summary type: {summaryType}") + except Exception as e: - logger.error(f"Error writing document: {e}") - return self._create_result( - success=False, - data={"error": f"Write failed: {str(e)}"} - ) + logger.error(f"Error summarizing document: {str(e)}") + return { + "success": False, + "error": str(e) + } - async def _convert_document(self, parameters: Dict[str, Any]) -> MethodResult: - """Convert document between formats""" + async def _getChatDocument(self, documentId: str) -> Optional[ChatDocument]: + """Get ChatDocument from database""" try: - source_path = Path(parameters["sourcePath"]) - target_path = Path(parameters["targetPath"]) - - if not source_path.exists(): - return self._create_result( - success=False, - data={"error": f"Source file not found: {source_path}"} - ) - - # Determine formats if not specified - source_format = parameters.get("sourceFormat") - if not source_format: - source_format = source_path.suffix[1:] if source_path.suffix else "txt" - - target_format = parameters.get("targetFormat") - if not target_format: - target_format = target_path.suffix[1:] if target_path.suffix else "txt" - - # Read source content - source_content = await self._read_document({ - "path": str(source_path), - "format": source_format - }) - - if not source_content.success: - return source_content - - # Write target content - target_content = await self._write_document({ - "path": str(target_path), - "content": source_content.data["content"], - "format": target_format - }) - - if not target_content.success: - return target_content - - return self._create_result( - success=True, - data={ - "sourcePath": str(source_path), - "targetPath": str(target_path), - "sourceFormat": source_format, - "targetFormat": target_format, - "size": target_path.stat().st_size, - "modified": datetime.now(UTC).isoformat() - } - ) + documentData = self.service.db.getRecord("chatDocuments", documentId) + if documentData: + return ChatDocument(**documentData) + return None except Exception as e: - logger.error(f"Error converting document: {e}") - return self._create_result( - success=False, - data={"error": f"Conversion failed: {str(e)}"} - ) \ No newline at end of file + logger.error(f"Error getting ChatDocument {documentId}: {str(e)}") + return None + + async def _getTaskDocument(self, documentId: str) -> Optional[TaskDocument]: + """Get TaskDocument from database""" + try: + documentData = self.service.db.getRecord("taskDocuments", documentId) + if documentData: + return TaskDocument(**documentData) + return None + except Exception as e: + logger.error(f"Error getting TaskDocument {documentId}: {str(e)}") + return None \ No newline at end of file diff --git a/modules/methods/methodOutlook.py b/modules/methods/methodOutlook.py index 908cbb3f..8391ec96 100644 --- a/modules/methods/methodOutlook.py +++ b/modules/methods/methodOutlook.py @@ -15,7 +15,7 @@ class MethodOutlook(MethodBase): super().__init__() self.name = "outlook" self.description = "Handle Outlook email operations like reading and sending emails" - self.auth_source = AuthSource.MICROSOFT + self.authSource = AuthSource.MICROSOFT @property def actions(self) -> Dict[str, Dict[str, Any]]: @@ -47,54 +47,54 @@ class MethodOutlook(MethodBase): } } - async def execute(self, action: str, parameters: Dict[str, Any], auth_data: Optional[Dict[str, Any]] = None) -> MethodResult: + async def execute(self, action: str, parameters: Dict[str, Any], authData: Optional[Dict[str, Any]] = None) -> MethodResult: """Execute Outlook method""" try: # Validate parameters - if not await self.validate_parameters(action, parameters): - return self._create_result( + if not await self.validateParameters(action, parameters): + return self._createResult( success=False, data={"error": f"Invalid parameters for {action}"} ) # Get UserConnection from auth_data - if not auth_data or "userConnection" not in auth_data: - return self._create_result( + if not authData or "userConnection" not in authData: + return self._createResult( success=False, data={"error": "UserConnection required for Outlook operations"} ) - user_connection: UserConnection = auth_data["userConnection"] + userConnection: UserConnection = authData["userConnection"] # Execute action if action == "readMails": - return await self._read_mails(parameters, user_connection) + return await self._readMails(parameters, userConnection) elif action == "sendMail": - return await self._send_mail(parameters, user_connection) + return await self._sendMail(parameters, userConnection) else: - return self._create_result( + return self._createResult( success=False, data={"error": f"Unknown action: {action}"} ) except Exception as e: logger.error(f"Error executing Outlook {action}: {e}") - return self._create_result( + return self._createResult( success=False, data={"error": str(e)} ) - async def _read_mails(self, parameters: Dict[str, Any], user_connection: UserConnection) -> MethodResult: + async def _readMails(self, parameters: Dict[str, Any], userConnection: UserConnection) -> MethodResult: """Read emails from Outlook""" try: folder = parameters.get("folder", "inbox") query = parameters.get("query") - max_results = parameters.get("maxResults", 10) - include_attachments = parameters.get("includeAttachments", False) + maxResults = parameters.get("maxResults", 10) + includeAttachments = parameters.get("includeAttachments", False) # Create Outlook account account = Account( - credentials=(user_connection.authToken, user_connection.refreshToken), + credentials=(userConnection.authToken, userConnection.refreshToken), protocol=MSGraphProtocol() ) @@ -102,18 +102,18 @@ class MethodOutlook(MethodBase): mailbox = account.mailbox() # Get folder - target_folder = mailbox.folder(folder_name=folder) + targetFolder = mailbox.folder(folder_name=folder) # Get messages if query: - messages = target_folder.get_messages(query=query, limit=max_results) + messages = targetFolder.get_messages(query=query, limit=maxResults) else: - messages = target_folder.get_messages(limit=max_results) + messages = targetFolder.get_messages(limit=maxResults) # Process messages results = [] for message in messages: - msg_data = { + msgData = { "id": message.object_id, "subject": message.subject, "from": message.sender.address, @@ -124,7 +124,7 @@ class MethodOutlook(MethodBase): "hasAttachments": message.has_attachments } - if include_attachments and message.has_attachments: + if includeAttachments and message.has_attachments: attachments = [] for attachment in message.attachments: attachments.append({ @@ -132,11 +132,11 @@ class MethodOutlook(MethodBase): "contentType": attachment.content_type, "size": attachment.size }) - msg_data["attachments"] = attachments + msgData["attachments"] = attachments - results.append(msg_data) + results.append(msgData) - return self._create_result( + return self._createResult( success=True, data={ "folder": folder, @@ -146,24 +146,24 @@ class MethodOutlook(MethodBase): ) except Exception as e: logger.error(f"Error reading Outlook emails: {e}") - return self._create_result( + return self._createResult( success=False, data={"error": f"Read failed: {str(e)}"} ) - async def _send_mail(self, parameters: Dict[str, Any], user_connection: UserConnection) -> MethodResult: + async def _sendMail(self, parameters: Dict[str, Any], userConnection: UserConnection) -> MethodResult: """Send email through Outlook""" try: - to_addresses = parameters["to"] + toAddresses = parameters["to"] subject = parameters["subject"] body = parameters["body"] - cc_addresses = parameters.get("cc", []) - bcc_addresses = parameters.get("bcc", []) + ccAddresses = parameters.get("cc", []) + bccAddresses = parameters.get("bcc", []) attachments = parameters.get("attachments", []) # Create Outlook account account = Account( - credentials=(user_connection.authToken, user_connection.refreshToken), + credentials=(userConnection.authToken, userConnection.refreshToken), protocol=MSGraphProtocol() ) @@ -172,32 +172,32 @@ class MethodOutlook(MethodBase): # Create new message message = mailbox.new_message() - message.to.add(to_addresses) - if cc_addresses: - message.cc.add(cc_addresses) - if bcc_addresses: - message.bcc.add(bcc_addresses) + message.to.add(toAddresses) + if ccAddresses: + message.cc.add(ccAddresses) + if bccAddresses: + message.bcc.add(bccAddresses) message.subject = subject message.body = body # Add attachments - for attachment_path in attachments: - message.attachments.add(attachment_path) + for attachmentPath in attachments: + message.attachments.add(attachmentPath) # Send message message.send() - return self._create_result( + return self._createResult( success=True, data={ - "to": to_addresses, + "to": toAddresses, "subject": subject, "sent": datetime.now(UTC).isoformat() } ) except Exception as e: logger.error(f"Error sending Outlook email: {e}") - return self._create_result( + return self._createResult( success=False, data={"error": f"Send failed: {str(e)}"} ) \ No newline at end of file diff --git a/modules/methods/methodPowerpoint.py b/modules/methods/methodPowerpoint.py index bed7abc9..f85915fa 100644 --- a/modules/methods/methodPowerpoint.py +++ b/modules/methods/methodPowerpoint.py @@ -4,6 +4,9 @@ import os from pathlib import Path from modules.methods.methodBase import MethodBase, AuthSource, MethodResult +from modules.models.userConnection import UserConnection +from modules.models.account import Account +from modules.protocols.msGraphProtocol import MSGraphProtocol logger = logging.getLogger(__name__) @@ -14,7 +17,7 @@ class MethodPowerpoint(MethodBase): super().__init__() self.name = "powerpoint" self.description = "Handle PowerPoint operations like reading, writing, and converting presentations" - self.auth_source = AuthSource.MICROSOFT # PowerPoint operations need Microsoft auth + self.authSource = AuthSource.MICROSOFT # PowerPoint operations need Microsoft auth @property def actions(self) -> Dict[str, Dict[str, Any]]: @@ -50,52 +53,85 @@ class MethodPowerpoint(MethodBase): "sourceFormat": {"type": "string", "required": False}, "targetFormat": {"type": "string", "required": False} } + }, + "createPresentation": { + "description": "Create a new PowerPoint presentation", + "retryMax": 2, + "timeout": 60, + "parameters": { + "title": {"type": "string", "required": True}, + "template": {"type": "string", "required": False} + } + }, + "addSlide": { + "description": "Add a new slide to presentation", + "retryMax": 2, + "timeout": 60, + "parameters": { + "presentationId": {"type": "string", "required": True}, + "layout": {"type": "string", "required": False}, + "title": {"type": "string", "required": False} + } + }, + "addContent": { + "description": "Add content to a slide", + "retryMax": 2, + "timeout": 60, + "parameters": { + "presentationId": {"type": "string", "required": True}, + "slideId": {"type": "string", "required": True}, + "contentType": {"type": "string", "required": True}, + "content": {"type": "object", "required": True}, + "position": {"type": "object", "required": False} + } } } - async def execute(self, action: str, parameters: Dict[str, Any], auth_data: Optional[Dict[str, Any]] = None) -> MethodResult: - """Execute powerpoint method""" + async def execute(self, action: str, parameters: Dict[str, Any], authData: Optional[Dict[str, Any]] = None) -> MethodResult: + """Execute PowerPoint method""" try: # Validate parameters - if not await self.validate_parameters(action, parameters): - return self._create_result( + if not await self.validateParameters(action, parameters): + return self._createResult( success=False, data={"error": f"Invalid parameters for {action}"} ) - # Validate authentication - if not await self.validate_auth(auth_data): - return self._create_result( + # Get UserConnection from auth_data + if not authData or "userConnection" not in authData: + return self._createResult( success=False, - data={"error": "Authentication required for PowerPoint operations"} + data={"error": "UserConnection required for PowerPoint operations"} ) + userConnection: UserConnection = authData["userConnection"] + # Execute action - if action == "read": - return await self._read_presentation(parameters, auth_data) - elif action == "write": - return await self._write_presentation(parameters, auth_data) - elif action == "convert": - return await self._convert_presentation(parameters, auth_data) + if action == "createPresentation": + return await self._createPresentation(parameters, userConnection) + elif action == "addSlide": + return await self._addSlide(parameters, userConnection) + elif action == "addContent": + return await self._addContent(parameters, userConnection) else: - return self._create_result( + return self._createResult( success=False, data={"error": f"Unknown action: {action}"} ) except Exception as e: - logger.error(f"Error executing powerpoint {action}: {e}") - return self._create_result( + logger.error(f"Error executing PowerPoint {action}: {e}") + return self._createResult( success=False, data={"error": str(e)} ) - async def _read_presentation(self, parameters: Dict[str, Any], auth_data: Dict[str, Any]) -> MethodResult: + async def _read_presentation(self, parameters: Dict[str, Any], authData: Dict[str, Any]) -> MethodResult: """Read PowerPoint presentation content""" try: path = Path(parameters["path"]) if not path.exists(): - return self._create_result( + return self._createResult( success=False, data={"error": f"File not found: {path}"} ) @@ -107,7 +143,7 @@ class MethodPowerpoint(MethodBase): # TODO: Implement PowerPoint reading using Microsoft Graph API # This is a placeholder implementation - return self._create_result( + return self._createResult( success=True, data={ "path": str(path), @@ -124,12 +160,12 @@ class MethodPowerpoint(MethodBase): ) except Exception as e: logger.error(f"Error reading presentation: {e}") - return self._create_result( + return self._createResult( success=False, data={"error": f"Read failed: {str(e)}"} ) - async def _write_presentation(self, parameters: Dict[str, Any], auth_data: Dict[str, Any]) -> MethodResult: + async def _write_presentation(self, parameters: Dict[str, Any], authData: Dict[str, Any]) -> MethodResult: """Write content to PowerPoint presentation""" try: path = Path(parameters["path"]) @@ -144,7 +180,7 @@ class MethodPowerpoint(MethodBase): # TODO: Implement PowerPoint writing using Microsoft Graph API # This is a placeholder implementation - return self._create_result( + return self._createResult( success=True, data={ "path": str(path), @@ -154,19 +190,19 @@ class MethodPowerpoint(MethodBase): ) except Exception as e: logger.error(f"Error writing presentation: {e}") - return self._create_result( + return self._createResult( success=False, data={"error": f"Write failed: {str(e)}"} ) - async def _convert_presentation(self, parameters: Dict[str, Any], auth_data: Dict[str, Any]) -> MethodResult: + async def _convert_presentation(self, parameters: Dict[str, Any], authData: Dict[str, Any]) -> MethodResult: """Convert PowerPoint presentation between formats""" try: source_path = Path(parameters["sourcePath"]) target_path = Path(parameters["targetPath"]) if not source_path.exists(): - return self._create_result( + return self._createResult( success=False, data={"error": f"Source file not found: {source_path}"} ) @@ -182,7 +218,7 @@ class MethodPowerpoint(MethodBase): # TODO: Implement PowerPoint conversion using Microsoft Graph API # This is a placeholder implementation - return self._create_result( + return self._createResult( success=True, data={ "sourcePath": str(source_path), @@ -193,7 +229,148 @@ class MethodPowerpoint(MethodBase): ) except Exception as e: logger.error(f"Error converting presentation: {e}") - return self._create_result( + return self._createResult( success=False, data={"error": f"Conversion failed: {str(e)}"} + ) + + async def _createPresentation(self, parameters: Dict[str, Any], userConnection: UserConnection) -> MethodResult: + """Create a new PowerPoint presentation""" + try: + title = parameters["title"] + template = parameters.get("template") + + # Create PowerPoint account + account = Account( + credentials=(userConnection.authToken, userConnection.refreshToken), + protocol=MSGraphProtocol() + ) + + # Get drive + drive = account.drive() + + # Create presentation + if template: + # Copy template + templateFile = drive.get_item_by_path(template) + newFile = templateFile.copy(f"{title}.pptx") + else: + # Create blank presentation + newFile = drive.create_file( + name=f"{title}.pptx", + content_type="application/vnd.openxmlformats-officedocument.presentationml.presentation" + ) + + return self._createResult( + success=True, + data={ + "id": newFile.object_id, + "name": newFile.name, + "webUrl": newFile.web_url + } + ) + except Exception as e: + logger.error(f"Error creating PowerPoint presentation: {e}") + return self._createResult( + success=False, + data={"error": f"Create failed: {str(e)}"} + ) + + async def _addSlide(self, parameters: Dict[str, Any], userConnection: UserConnection) -> MethodResult: + """Add a new slide to presentation""" + try: + presentationId = parameters["presentationId"] + layout = parameters.get("layout", "title") + title = parameters.get("title") + + # Create PowerPoint account + account = Account( + credentials=(userConnection.authToken, userConnection.refreshToken), + protocol=MSGraphProtocol() + ) + + # Get drive + drive = account.drive() + + # Get presentation + presentation = drive.get_item_by_id(presentationId) + + # Add slide + slide = presentation.add_slide(layout=layout) + if title: + slide.title = title + + return self._createResult( + success=True, + data={ + "slideId": slide.object_id, + "layout": layout, + "title": title + } + ) + except Exception as e: + logger.error(f"Error adding PowerPoint slide: {e}") + return self._createResult( + success=False, + data={"error": f"Add slide failed: {str(e)}"} + ) + + async def _addContent(self, parameters: Dict[str, Any], userConnection: UserConnection) -> MethodResult: + """Add content to a slide""" + try: + presentationId = parameters["presentationId"] + slideId = parameters["slideId"] + contentType = parameters["contentType"] + content = parameters["content"] + position = parameters.get("position", {"x": 0, "y": 0}) + + # Create PowerPoint account + account = Account( + credentials=(userConnection.authToken, userConnection.refreshToken), + protocol=MSGraphProtocol() + ) + + # Get drive + drive = account.drive() + + # Get presentation and slide + presentation = drive.get_item_by_id(presentationId) + slide = presentation.get_slide(slideId) + + # Add content based on type + if contentType == "text": + shape = slide.add_text_box( + text=content, + left=position["x"], + top=position["y"] + ) + elif contentType == "image": + shape = slide.add_picture( + image_path=content, + left=position["x"], + top=position["y"] + ) + elif contentType == "table": + shape = slide.add_table( + rows=content["rows"], + cols=content["cols"], + left=position["x"], + top=position["y"] + ) + else: + raise ValueError(f"Unsupported content type: {contentType}") + + return self._createResult( + success=True, + data={ + "shapeId": shape.object_id, + "contentType": contentType, + "position": position + } + ) + except Exception as e: + logger.error(f"Error adding PowerPoint content: {e}") + return self._createResult( + success=False, + data={"error": f"Add content failed: {str(e)}"} ) \ No newline at end of file diff --git a/modules/methods/methodSharepoint.py b/modules/methods/methodSharepoint.py index 893dccfa..06cd4ee8 100644 --- a/modules/methods/methodSharepoint.py +++ b/modules/methods/methodSharepoint.py @@ -19,7 +19,7 @@ class MethodSharepoint(MethodBase): super().__init__() self.name = "sharepoint" self.description = "Handle SharePoint document operations like search, read, and write" - self.auth_source = AuthSource.MICROSOFT + self.authSource = AuthSource.MICROSOFT @property def actions(self) -> Dict[str, Dict[str, Any]]: @@ -55,65 +55,104 @@ class MethodSharepoint(MethodBase): "content": {"type": "string", "required": True}, "contentType": {"type": "string", "required": False} } + }, + "readList": { + "description": "Read items from SharePoint list", + "retryMax": 2, + "timeout": 30, + "parameters": { + "siteUrl": {"type": "string", "required": True}, + "listName": {"type": "string", "required": True}, + "query": {"type": "string", "required": False}, + "fields": {"type": "array", "required": False} + } + }, + "writeList": { + "description": "Write items to SharePoint list", + "retryMax": 2, + "timeout": 30, + "parameters": { + "siteUrl": {"type": "string", "required": True}, + "listName": {"type": "string", "required": True}, + "items": {"type": "array", "required": True} + } + }, + "createList": { + "description": "Create a new SharePoint list", + "retryMax": 2, + "timeout": 30, + "parameters": { + "siteUrl": {"type": "string", "required": True}, + "listName": {"type": "string", "required": True}, + "description": {"type": "string", "required": False}, + "template": {"type": "string", "required": False}, + "fields": {"type": "array", "required": False} + } } } - async def execute(self, action: str, parameters: Dict[str, Any], auth_data: Optional[Dict[str, Any]] = None) -> MethodResult: + async def execute(self, action: str, parameters: Dict[str, Any], authData: Optional[Dict[str, Any]] = None) -> MethodResult: """Execute SharePoint method""" try: # Validate parameters - if not await self.validate_parameters(action, parameters): - return self._create_result( + if not await self.validateParameters(action, parameters): + return self._createResult( success=False, data={"error": f"Invalid parameters for {action}"} ) # Get UserConnection from auth_data - if not auth_data or "userConnection" not in auth_data: - return self._create_result( + if not authData or "userConnection" not in authData: + return self._createResult( success=False, data={"error": "UserConnection required for SharePoint operations"} ) - user_connection: UserConnection = auth_data["userConnection"] + userConnection: UserConnection = authData["userConnection"] # Execute action if action == "search": - return await self._search_documents(parameters, user_connection) + return await self._search_documents(parameters, userConnection) elif action == "read": - return await self._read_document(parameters, user_connection) + return await self._read_document(parameters, userConnection) elif action == "write": - return await self._write_document(parameters, user_connection) + return await self._write_document(parameters, userConnection) + elif action == "readList": + return await self._readList(parameters, userConnection) + elif action == "writeList": + return await self._writeList(parameters, userConnection) + elif action == "createList": + return await self._createList(parameters, userConnection) else: - return self._create_result( + return self._createResult( success=False, data={"error": f"Unknown action: {action}"} ) except Exception as e: logger.error(f"Error executing SharePoint {action}: {e}") - return self._create_result( + return self._createResult( success=False, data={"error": str(e)} ) - async def _search_documents(self, parameters: Dict[str, Any], user_connection: UserConnection) -> MethodResult: + async def _search_documents(self, parameters: Dict[str, Any], userConnection: UserConnection) -> MethodResult: """Search SharePoint documents""" try: - site_url = parameters["siteUrl"] + siteUrl = parameters["siteUrl"] query = parameters["query"] - list_name = parameters.get("listName") - max_results = parameters.get("maxResults", 10) + listName = parameters.get("listName") + maxResults = parameters.get("maxResults", 10) # Create SharePoint context - ctx = ClientContext(site_url).with_credentials( - UserCredential(user_connection.authToken, user_connection.refreshToken) + ctx = ClientContext(siteUrl).with_credentials( + UserCredential(userConnection.authToken, userConnection.refreshToken) ) # Search in specific list or entire site - if list_name: - target_list = ctx.web.lists.get_by_title(list_name) - items = target_list.items.filter(f"Title eq '{query}'").top(max_results).get().execute_query() + if listName: + targetList = ctx.web.lists.get_by_title(listName) + items = targetList.items.filter(f"Title eq '{query}'").top(maxResults).get().execute_query() results = [{ "title": item.properties["Title"], "url": item.properties["FileRef"], @@ -128,9 +167,9 @@ class MethodSharepoint(MethodBase): "url": result.properties["Path"], "modified": result.properties["LastModifiedTime"], "created": result.properties["Created"] - } for result in search_results[:max_results]] + } for result in search_results[:maxResults]] - return self._create_result( + return self._createResult( success=True, data={ "query": query, @@ -139,30 +178,30 @@ class MethodSharepoint(MethodBase): ) except Exception as e: logger.error(f"Error searching SharePoint documents: {e}") - return self._create_result( + return self._createResult( success=False, data={"error": f"Search failed: {str(e)}"} ) - async def _read_document(self, parameters: Dict[str, Any], user_connection: UserConnection) -> MethodResult: + async def _read_document(self, parameters: Dict[str, Any], userConnection: UserConnection) -> MethodResult: """Read SharePoint document content""" try: - site_url = parameters["siteUrl"] - file_url = parameters["fileUrl"] + siteUrl = parameters["siteUrl"] + fileUrl = parameters["fileUrl"] # Create SharePoint context - ctx = ClientContext(site_url).with_credentials( - UserCredential(user_connection.authToken, user_connection.refreshToken) + ctx = ClientContext(siteUrl).with_credentials( + UserCredential(userConnection.authToken, userConnection.refreshToken) ) # Get file - file = ctx.web.get_file_by_server_relative_url(file_url) + file = ctx.web.get_file_by_server_relative_url(fileUrl) file_content = file.read().execute_query() - return self._create_result( + return self._createResult( success=True, data={ - "url": file_url, + "url": fileUrl, "content": file_content.content.decode('utf-8'), "modified": file.properties["TimeLastModified"], "size": file.properties["Length"] @@ -170,48 +209,182 @@ class MethodSharepoint(MethodBase): ) except Exception as e: logger.error(f"Error reading SharePoint document: {e}") - return self._create_result( + return self._createResult( success=False, data={"error": f"Read failed: {str(e)}"} ) - async def _write_document(self, parameters: Dict[str, Any], user_connection: UserConnection) -> MethodResult: + async def _write_document(self, parameters: Dict[str, Any], userConnection: UserConnection) -> MethodResult: """Write content to SharePoint document""" try: - site_url = parameters["siteUrl"] - file_url = parameters["fileUrl"] + siteUrl = parameters["siteUrl"] + fileUrl = parameters["fileUrl"] content = parameters["content"] - content_type = parameters.get("contentType", "text/plain") + contentType = parameters.get("contentType", "text/plain") # Create SharePoint context - ctx = ClientContext(site_url).with_credentials( - UserCredential(user_connection.authToken, user_connection.refreshToken) + ctx = ClientContext(siteUrl).with_credentials( + UserCredential(userConnection.authToken, userConnection.refreshToken) ) # Get or create file try: - file = ctx.web.get_file_by_server_relative_url(file_url) + file = ctx.web.get_file_by_server_relative_url(fileUrl) except: # Create new file - folder_url = "/".join(file_url.split("/")[:-1]) - file_name = file_url.split("/")[-1] - folder = ctx.web.get_folder_by_server_relative_url(folder_url) - file = folder.upload_file(file_name, content.encode('utf-8')).execute_query() + folderUrl = "/".join(fileUrl.split("/")[:-1]) + fileName = fileUrl.split("/")[-1] + folder = ctx.web.get_folder_by_server_relative_url(folderUrl) + file = folder.upload_file(fileName, content.encode('utf-8')).execute_query() # Update file content file.write(content.encode('utf-8')).execute_query() - return self._create_result( + return self._createResult( success=True, data={ - "url": file_url, + "url": fileUrl, "modified": datetime.now(UTC).isoformat(), "size": len(content.encode('utf-8')) } ) except Exception as e: logger.error(f"Error writing SharePoint document: {e}") - return self._create_result( + return self._createResult( success=False, data={"error": f"Write failed: {str(e)}"} + ) + + async def _readList(self, parameters: Dict[str, Any], userConnection: UserConnection) -> MethodResult: + """Read items from SharePoint list""" + try: + siteUrl = parameters["siteUrl"] + listName = parameters["listName"] + query = parameters.get("query") + fields = parameters.get("fields", ["*"]) + + # Create SharePoint account + account = Account( + credentials=(userConnection.authToken, userConnection.refreshToken), + protocol=MSGraphProtocol() + ) + + # Get site + site = account.get_site(siteUrl) + + # Get list + list = site.get_list(listName) + + # Get items + if query: + items = list.get_items(query=query, fields=fields) + else: + items = list.get_items(fields=fields) + + return self._createResult( + success=True, + data={ + "siteUrl": siteUrl, + "listName": listName, + "items": items + } + ) + except Exception as e: + logger.error(f"Error reading SharePoint list: {e}") + return self._createResult( + success=False, + data={"error": f"Read failed: {str(e)}"} + ) + + async def _writeList(self, parameters: Dict[str, Any], userConnection: UserConnection) -> MethodResult: + """Write items to SharePoint list""" + try: + siteUrl = parameters["siteUrl"] + listName = parameters["listName"] + items = parameters["items"] + + # Create SharePoint account + account = Account( + credentials=(userConnection.authToken, userConnection.refreshToken), + protocol=MSGraphProtocol() + ) + + # Get site + site = account.get_site(siteUrl) + + # Get list + list = site.get_list(listName) + + # Add items + results = [] + for item in items: + result = list.add_item(item) + results.append({ + "id": result.id, + "status": "success" + }) + + return self._createResult( + success=True, + data={ + "siteUrl": siteUrl, + "listName": listName, + "results": results + } + ) + except Exception as e: + logger.error(f"Error writing to SharePoint list: {e}") + return self._createResult( + success=False, + data={"error": f"Write failed: {str(e)}"} + ) + + async def _createList(self, parameters: Dict[str, Any], userConnection: UserConnection) -> MethodResult: + """Create a new SharePoint list""" + try: + siteUrl = parameters["siteUrl"] + listName = parameters["listName"] + description = parameters.get("description") + template = parameters.get("template", "generic") + fields = parameters.get("fields", []) + + # Create SharePoint account + account = Account( + credentials=(userConnection.authToken, userConnection.refreshToken), + protocol=MSGraphProtocol() + ) + + # Get site + site = account.get_site(siteUrl) + + # Create list + list = site.create_list( + name=listName, + description=description, + template=template + ) + + # Add fields + for field in fields: + list.add_field( + name=field["name"], + field_type=field["type"], + required=field.get("required", False), + description=field.get("description") + ) + + return self._createResult( + success=True, + data={ + "siteUrl": siteUrl, + "listName": listName, + "id": list.id, + "webUrl": list.web_url + } + ) + except Exception as e: + logger.error(f"Error creating SharePoint list: {e}") + return self._createResult( + success=False, + data={"error": f"Create failed: {str(e)}"} ) \ No newline at end of file diff --git a/modules/methods/methodWeb.py b/modules/methods/methodWeb.py index cf968ed4..437bca37 100644 --- a/modules/methods/methodWeb.py +++ b/modules/methods/methodWeb.py @@ -8,6 +8,7 @@ import re from datetime import datetime, UTC import requests import time +import json from modules.methods.methodBase import MethodBase, AuthSource, MethodResult from modules.shared.configuration import APP_CONFIG @@ -74,70 +75,197 @@ class MethodWeb(MethodBase): } } - async def execute(self, action: str, parameters: Dict[str, Any], auth_data: Optional[Dict[str, Any]] = None) -> MethodResult: + async def execute(self, action: str, parameters: Dict[str, Any], authData: Optional[Dict[str, Any]] = None) -> MethodResult: """Execute web method""" try: # Validate parameters - if not await self.validate_parameters(action, parameters): - return self._create_result( + if not await self.validateParameters(action, parameters): + return self._createResult( success=False, data={"error": f"Invalid parameters for {action}"} ) # Execute action - if action == "search": - return await self._search_web(parameters) - elif action == "crawl": - return await self._crawl_page(parameters) - elif action == "extract": - return await self._extract_content(parameters) + if action == "fetchUrl": + return await self._fetchUrl(parameters) + elif action == "parseContent": + return await self._parseContent(parameters) + elif action == "extractData": + return await self._extractData(parameters) else: - return self._create_result( + return self._createResult( success=False, data={"error": f"Unknown action: {action}"} ) except Exception as e: logger.error(f"Error executing web {action}: {e}") - return self._create_result( + return self._createResult( success=False, data={"error": str(e)} ) + async def _fetchUrl(self, parameters: Dict[str, Any]) -> MethodResult: + """Fetch content from URL""" + try: + url = parameters["url"] + method = parameters.get("method", "GET") + headers = parameters.get("headers", {}) + data = parameters.get("data") + timeout = parameters.get("timeout", 30) + + async with aiohttp.ClientSession() as session: + async with session.request( + method=method, + url=url, + headers=headers, + data=data, + timeout=timeout + ) as response: + content = await response.text() + return self._createResult( + success=True, + data={ + "url": url, + "status": response.status, + "headers": dict(response.headers), + "content": content + } + ) + except Exception as e: + logger.error(f"Error fetching URL: {e}") + return self._createResult( + success=False, + data={"error": f"Fetch failed: {str(e)}"} + ) + + async def _parseContent(self, parameters: Dict[str, Any]) -> MethodResult: + """Parse web content""" + try: + content = parameters["content"] + contentType = parameters.get("contentType", "html") + + if contentType == "html": + soup = BeautifulSoup(content, "html.parser") + return self._createResult( + success=True, + data={ + "type": "html", + "title": soup.title.string if soup.title else None, + "text": soup.get_text(), + "links": [a.get("href") for a in soup.find_all("a", href=True)], + "images": [img.get("src") for img in soup.find_all("img", src=True)] + } + ) + elif contentType == "json": + data = json.loads(content) + return self._createResult( + success=True, + data={ + "type": "json", + "data": data + } + ) + else: + raise ValueError(f"Unsupported content type: {contentType}") + except Exception as e: + logger.error(f"Error parsing content: {e}") + return self._createResult( + success=False, + data={"error": f"Parse failed: {str(e)}"} + ) + + async def _extractData(self, parameters: Dict[str, Any]) -> MethodResult: + """Extract data from web content""" + try: + content = parameters["content"] + contentType = parameters.get("contentType", "html") + selectors = parameters["selectors"] + + if contentType == "html": + soup = BeautifulSoup(content, "html.parser") + results = {} + + for key, selector in selectors.items(): + elements = soup.select(selector) + if len(elements) == 1: + results[key] = elements[0].get_text().strip() + else: + results[key] = [el.get_text().strip() for el in elements] + + return self._createResult( + success=True, + data={ + "type": "html", + "results": results + } + ) + elif contentType == "json": + data = json.loads(content) + results = {} + + for key, path in selectors.items(): + value = data + for part in path.split("."): + if isinstance(value, dict): + value = value.get(part) + elif isinstance(value, list) and part.isdigit(): + value = value[int(part)] + else: + value = None + break + results[key] = value + + return self._createResult( + success=True, + data={ + "type": "json", + "results": results + } + ) + else: + raise ValueError(f"Unsupported content type: {contentType}") + except Exception as e: + logger.error(f"Error extracting data: {e}") + return self._createResult( + success=False, + data={"error": f"Extract failed: {str(e)}"} + ) + async def _search_web(self, parameters: Dict[str, Any]) -> MethodResult: """Search web content""" try: query = parameters["query"] - max_results = parameters.get("maxResults", 10) + maxResults = parameters.get("maxResults", 10) filters = parameters.get("filters", {}) - search_engine = parameters.get("searchEngine", "google") + searchEngine = parameters.get("searchEngine", "google") # Implement search using different engines - if search_engine.lower() == "google": + if searchEngine.lower() == "google": # Use Google Custom Search API # TODO: Implement Google Custom Search API integration - results = await self._google_search(query, max_results, filters) - elif search_engine.lower() == "bing": + results = await self._google_search(query, maxResults, filters) + elif searchEngine.lower() == "bing": # Use Bing Web Search API # TODO: Implement Bing Web Search API integration - results = await self._bing_search(query, max_results, filters) + results = await self._bing_search(query, maxResults, filters) else: - return self._create_result( + return self._createResult( success=False, - data={"error": f"Unsupported search engine: {search_engine}"} + data={"error": f"Unsupported search engine: {searchEngine}"} ) - return self._create_result( + return self._createResult( success=True, data={ "query": query, - "engine": search_engine, + "engine": searchEngine, "results": results } ) except Exception as e: logger.error(f"Error searching web: {e}") - return self._create_result( + return self._createResult( success=False, data={"error": f"Search failed: {str(e)}"} ) @@ -173,14 +301,14 @@ class MethodWeb(MethodBase): try: url = parameters["url"] depth = parameters.get("depth", 1) - follow_links = parameters.get("followLinks", False) - include_images = parameters.get("includeImages", False) - respect_robots = parameters.get("respectRobots", True) + followLinks = parameters.get("followLinks", False) + includeImages = parameters.get("includeImages", False) + respectRobots = parameters.get("respectRobots", True) # Check robots.txt if required - if respect_robots: + if respectRobots: if not await self._check_robots_txt(url): - return self._create_result( + return self._createResult( success=False, data={"error": "Crawling not allowed by robots.txt"} ) @@ -198,114 +326,57 @@ class MethodWeb(MethodBase): "title": soup.title.string if soup.title else None, "description": self._get_meta_description(soup), "links": [], - "images": [] if include_images else None, + "images": [] if includeImages else None, "text": soup.get_text(strip=True), "crawled": datetime.now(UTC).isoformat() } # Extract links if followLinks is True - if follow_links: - base_url = url + if followLinks: + baseUrl = url for link in soup.find_all('a'): href = link.get('href') if href: - absolute_url = urljoin(base_url, href) - if self._is_valid_url(absolute_url): + absoluteUrl = urljoin(baseUrl, href) + if self._is_valid_url(absoluteUrl): result["links"].append({ - "url": absolute_url, + "url": absoluteUrl, "text": link.get_text(strip=True) }) # Extract images if includeImages is True - if include_images: + if includeImages: for img in soup.find_all('img'): src = img.get('src') if src: - absolute_src = urljoin(url, src) + absoluteSrc = urljoin(url, src) result["images"].append({ - "url": absolute_src, + "url": absoluteSrc, "alt": img.get('alt', ''), "title": img.get('title', '') }) - return self._create_result( + return self._createResult( success=True, data=result ) else: - return self._create_result( + return self._createResult( success=False, data={"error": f"Failed to fetch URL: {response.status}"} ) except Exception as e: logger.error(f"Error crawling page: {e}") - return self._create_result( + return self._createResult( success=False, data={"error": f"Crawl failed: {str(e)}"} ) - async def _extract_content(self, parameters: Dict[str, Any]) -> MethodResult: - """Extract content from web page""" - try: - url = parameters["url"] - selectors = parameters.get("selectors") - format = parameters.get("format", "text") - include_metadata = parameters.get("includeMetadata", False) - - async with aiohttp.ClientSession() as session: - async with session.get(url) as response: - if response.status == 200: - html = await response.text() - soup = BeautifulSoup(html, 'html.parser') - - # Extract content based on selectors - content = {} - if selectors: - for selector in selectors: - elements = soup.select(selector) - content[selector] = [elem.get_text() for elem in elements] - else: - # Default extraction - content = { - "title": soup.title.string if soup.title else None, - "text": soup.get_text(strip=True), - "links": [a.get('href') for a in soup.find_all('a')] - } - - # Add metadata if requested - if include_metadata: - content["metadata"] = { - "url": url, - "crawled": datetime.now(UTC).isoformat(), - "language": self._detect_language(soup), - "wordCount": len(content["text"].split()), - "linksCount": len(content["links"]) - } - - return self._create_result( - success=True, - data={ - "url": url, - "content": content - } - ) - else: - return self._create_result( - success=False, - data={"error": f"Failed to fetch URL: {response.status}"} - ) - except Exception as e: - logger.error(f"Error extracting content: {e}") - return self._create_result( - success=False, - data={"error": f"Extraction failed: {str(e)}"} - ) - def _get_meta_description(self, soup: BeautifulSoup) -> Optional[str]: """Extract meta description from HTML""" - meta_desc = soup.find('meta', attrs={'name': 'description'}) - if meta_desc: - return meta_desc.get('content') + metaDesc = soup.find('meta', attrs={'name': 'description'}) + if metaDesc: + return metaDesc.get('content') return None def _is_valid_url(self, url: str) -> bool: @@ -319,31 +390,31 @@ class MethodWeb(MethodBase): async def _check_robots_txt(self, url: str) -> bool: """Check if URL is allowed by robots.txt""" try: - parsed_url = urlparse(url) - robots_url = f"{parsed_url.scheme}://{parsed_url.netloc}/robots.txt" + parsedUrl = urlparse(url) + robotsUrl = f"{parsedUrl.scheme}://{parsedUrl.netloc}/robots.txt" async with aiohttp.ClientSession() as session: - async with session.get(robots_url, headers={"User-Agent": self.userAgent}, timeout=self.timeout) as response: + async with session.get(robotsUrl, headers={"User-Agent": self.userAgent}, timeout=self.timeout) as response: if response.status == 200: - robots_content = await response.text() + robotsContent = await response.text() # Parse robots.txt content - user_agent = "*" # Default to all user agents - disallow_paths = [] + userAgent = "*" # Default to all user agents + disallowPaths = [] - for line in robots_content.splitlines(): + for line in robotsContent.splitlines(): line = line.strip().lower() if line.startswith("user-agent:"): - user_agent = line[11:].strip() - elif line.startswith("disallow:") and user_agent in ["*", self.userAgent.lower()]: + userAgent = line[11:].strip() + elif line.startswith("disallow:") and userAgent in ["*", self.userAgent.lower()]: path = line[9:].strip() if path: - disallow_paths.append(path) + disallowPaths.append(path) # Check if URL path is disallowed - url_path = parsed_url.path - for disallow_path in disallow_paths: - if url_path.startswith(disallow_path): + urlPath = parsedUrl.path + for disallowPath in disallowPaths: + if urlPath.startswith(disallowPath): return False return True @@ -364,32 +435,32 @@ class MethodWeb(MethodBase): return soup.html.get('lang') # Try to get language from meta tag - meta_lang = soup.find('meta', attrs={'http-equiv': 'content-language'}) - if meta_lang: - return meta_lang.get('content', 'en') + metaLang = soup.find('meta', attrs={'http-equiv': 'content-language'}) + if metaLang: + return metaLang.get('content', 'en') # Try to get language from meta charset - meta_charset = soup.find('meta', attrs={'charset': True}) - if meta_charset: - charset = meta_charset.get('charset', '').lower() + metaCharset = soup.find('meta', attrs={'charset': True}) + if metaCharset: + charset = metaCharset.get('charset', '').lower() if 'utf-8' in charset: return 'en' # Default to English for UTF-8 # Try to detect language from content # This is a simple heuristic based on common words text = soup.get_text().lower() - common_words = { + commonWords = { 'en': ['the', 'and', 'of', 'to', 'in', 'is', 'that', 'for', 'it', 'with'], 'es': ['el', 'la', 'los', 'las', 'de', 'y', 'en', 'que', 'por', 'con'], 'fr': ['le', 'la', 'les', 'de', 'et', 'en', 'que', 'pour', 'avec', 'dans'], 'de': ['der', 'die', 'das', 'und', 'in', 'den', 'von', 'zu', 'für', 'mit'] } - word_counts = {lang: sum(1 for word in words if f' {word} ' in f' {text} ') - for lang, words in common_words.items()} + wordCounts = {lang: sum(1 for word in words if f' {word} ' in f' {text} ') + for lang, words in commonWords.items()} - if word_counts: - return max(word_counts.items(), key=lambda x: x[1])[0] + if wordCounts: + return max(wordCounts.items(), key=lambda x: x[1])[0] return 'en' # Default to English if no language detected diff --git a/modules/neutralizer/neutralizer.py b/modules/neutralizer/neutralizer.py new file mode 100644 index 00000000..18648211 --- /dev/null +++ b/modules/neutralizer/neutralizer.py @@ -0,0 +1,368 @@ +""" +DSGVO-konformer Daten-Neutralisierer für KI-Agentensysteme +Unterstützt TXT, JSON, CSV, Excel und Word-Dateien +Mehrsprachig: DE, EN, FR, IT +""" + +import re +import json +import pandas as pd +import docx +from pathlib import Path +from typing import Dict, List, Tuple, Any, Union, Optional +from dataclasses import dataclass +import uuid +import logging +import traceback +import csv +from datetime import datetime +import xml.etree.ElementTree as ET +import os +import random +from io import StringIO +from patterns import Pattern, HeaderPatterns, DataPatterns, get_pattern_for_header, find_patterns_in_text, TextTablePatterns +import base64 + +# Configure logging +logger = logging.getLogger(__name__) + +@dataclass +class TableData: + """Repräsentiert Tabellendaten""" + headers: List[str] + rows: List[List[str]] + source_type: str # 'csv', 'json', 'xml', 'text_table' + +@dataclass +class PlainText: + """Repräsentiert normalen Text""" + content: str + source_type: str # 'txt', 'docx', 'text_plain' + +@dataclass +class ProcessResult: + """Result of content processing""" + data: Any + mapping: Dict[str, str] + replaced_fields: List[str] + processed_info: Dict[str, Any] # Additional processing information + +class DataAnonymizer: + """Hauptklasse für die Datenanonymisierung""" + + def __init__(self): + """Initialize the anonymizer with patterns""" + self.header_patterns = HeaderPatterns.patterns + self.data_patterns = DataPatterns.patterns + self.replaced_fields = set() + self.mapping = {} + self.processing_info = [] + + def _normalize_whitespace(self, text: str) -> str: + """Normalize whitespace in text""" + text = re.sub(r'\s+', ' ', text) + text = text.replace('\r\n', '\n').replace('\r', '\n') + return text.strip() + + def _is_table_line(self, line: str) -> bool: + """Check if a line represents a table row""" + return bool(re.match(r'^\s*[^:]+:\s*[^:]+$', line) or + re.match(r'^\s*[^\t]+\t[^\t]+$', line)) + + def _extract_tables_from_text(self, content: str) -> Tuple[List[TableData], List[PlainText]]: + """ + Extract tables and plain text from content + + Args: + content: Content to process + + Returns: + Tuple of (list of tables, list of plain text sections) + """ + tables = [] + plain_texts = [] + + # Process the entire content as plain text + plain_texts.append(PlainText(content=content, source_type='text_plain')) + + return tables, plain_texts + + def _anonymize_table(self, table: TableData) -> TableData: + """Anonymize table data""" + try: + anonymized_table = TableData( + headers=table.headers.copy(), + rows=[row.copy() for row in table.rows], + source_type=table.source_type + ) + + for i, header in enumerate(anonymized_table.headers): + pattern = get_pattern_for_header(header, self.header_patterns) + if pattern: + for row in anonymized_table.rows: + if row[i] is not None: + original = str(row[i]) + if original not in self.mapping: + self.mapping[original] = pattern.replacement_template.format(len(self.mapping) + 1) + row[i] = self.mapping[original] + + return anonymized_table + + except Exception as e: + logger.error(f"Error anonymizing table: {str(e)}") + logger.debug(traceback.format_exc()) + raise + + def _anonymize_plain_text(self, text: PlainText) -> PlainText: + """Anonymize plain text content""" + try: + # Process the entire text at once instead of line by line + current_text = text.content + + # Find all matches in the entire text + matches = find_patterns_in_text(current_text, self.data_patterns) + + # Process matches in reverse order to avoid position shifting + for match in sorted(matches, key=lambda x: x[2], reverse=True): + pattern_name, matched_text, start, end = match + + # Skip if the matched text is already a placeholder + if re.match(r'\[[A-Z_]+\d+\]', matched_text): + continue + + # Find the pattern that matched + pattern = next((p for p in self.data_patterns if p.name == pattern_name), None) + if pattern: + # Use the pattern's replacement template + if matched_text not in self.mapping: + self.mapping[matched_text] = pattern.replacement_template.format(len(self.mapping) + 1) + replacement = self.mapping[matched_text] + + if pattern_name == 'email': + print(f"DEBUG: Replacing email '{matched_text}' with '{replacement}'") + print(f"DEBUG: Text after replacement: {current_text[:start] + replacement + current_text[end:]}") + + # Replace the matched text while preserving surrounding whitespace + current_text = current_text[:start] + replacement + current_text[end:] + + return PlainText(content=current_text, source_type=text.source_type) + + except Exception as e: + logger.error(f"Error anonymizing plain text: {str(e)}") + logger.debug(traceback.format_exc()) + raise + + def _anonymize_json_value(self, value: Any, key: str = None) -> Any: + """ + Recursively anonymize JSON values based on their keys and content + + Args: + value: Value to anonymize + key: Key name (if part of a key-value pair) + + Returns: + Anonymized value + """ + if isinstance(value, dict): + return {k: self._anonymize_json_value(v, k) for k, v in value.items()} + elif isinstance(value, list): + return [self._anonymize_json_value(item) for item in value] + elif isinstance(value, str): + # Check if this is a key we should process + if key: + pattern = get_pattern_for_header(key, self.header_patterns) + if pattern: + if value not in self.mapping: + self.mapping[value] = pattern.replacement_template.format(len(self.mapping) + 1) + return self.mapping[value] + + # Check if the value itself matches any patterns + matches = find_patterns_in_text(value, self.data_patterns) + if matches: + # Use the first match's pattern + pattern_name = matches[0][0] + if value not in self.mapping: + self.mapping[value] = f"{pattern_name.upper()}_{len(self.mapping) + 1}" + return self.mapping[value] + + return value + else: + return value + + def _anonymize_xml_element(self, element: ET.Element, indent: str = '') -> str: + """ + Recursively process XML element and return formatted string + + Args: + element: XML element to process + indent: Current indentation level + + Returns: + Formatted XML string + """ + # Process attributes + processed_attrs = {} + for attr_name, attr_value in element.attrib.items(): + # Check if attribute name matches any header patterns + pattern = get_pattern_for_header(attr_name, self.header_patterns) + if pattern: + if attr_value not in self.mapping: + self.mapping[attr_value] = pattern.replacement_template.format(len(self.mapping) + 1) + processed_attrs[attr_name] = self.mapping[attr_value] + else: + # Check if attribute value matches any data patterns + matches = find_patterns_in_text(attr_value, self.data_patterns) + if matches: + pattern_name = matches[0][0] + pattern = next((p for p in self.data_patterns if p.name == pattern_name), None) + if pattern: + if attr_value not in self.mapping: + self.mapping[attr_value] = pattern.replacement_template.format(len(self.mapping) + 1) + processed_attrs[attr_name] = self.mapping[attr_value] + else: + processed_attrs[attr_name] = attr_value + else: + processed_attrs[attr_name] = attr_value + + attrs = ' '.join(f'{k}="{v}"' for k, v in processed_attrs.items()) + attrs = f' {attrs}' if attrs else '' + + # Process text content + text = element.text.strip() if element.text and element.text.strip() else '' + if text: + # Check if text matches any patterns + matches = find_patterns_in_text(text, self.data_patterns) + if matches: + pattern_name = matches[0][0] + pattern = next((p for p in self.data_patterns if p.name == pattern_name), None) + if pattern: + if text not in self.mapping: + self.mapping[text] = pattern.replacement_template.format(len(self.mapping) + 1) + text = self.mapping[text] + + # Process child elements + children = [] + for child in element: + child_str = self._anonymize_xml_element(child, indent + ' ') + children.append(child_str) + + # Build element string + if not children and not text: + return f"{indent}<{element.tag}{attrs}/>" + elif not children: + return f"{indent}<{element.tag}{attrs}>{text}{element.tag}>" + else: + result = [f"{indent}<{element.tag}{attrs}>"] + if text: + result.append(f"{indent} {text}") + result.extend(children) + result.append(f"{indent}{element.tag}>") + return '\n'.join(result) + + def process_content(self, content: str, content_type: str) -> ProcessResult: + """ + Process content and return anonymized data + + Args: + content: Content to process + content_type: Type of content ('csv', 'json', 'xml', 'text') + + Returns: + ProcessResult: Contains anonymized data, mapping, replaced fields and processing info + """ + try: + # Check if content is binary data + is_binary = False + try: + # Try to decode base64 if it's a string + try: + decoded = base64.b64decode(content) + # If it's not valid text, consider it binary + decoded.decode('utf-8') + except (base64.binascii.Error, UnicodeDecodeError): + is_binary = True + except Exception: + is_binary = True + + if is_binary: + # TODO: Implement binary data neutralization + # This would require: + # 1. Detecting binary data types (images, audio, video, etc.) + # 2. Implementing specific neutralization for each type + # 3. Handling metadata and embedded content + # 4. Preserving binary integrity while removing sensitive data + return ProcessResult(content, self.mapping, [], {'type': 'binary', 'status': 'not_implemented'}) + + replaced_fields = [] + processed_info = {} + + if content_type in ['csv', 'json', 'xml']: + # Handle as table + if content_type == 'csv': + df = pd.read_csv(StringIO(content), encoding='utf-8') + table = TableData( + headers=df.columns.tolist(), + rows=df.values.tolist(), + source_type='csv' + ) + processed_info['type'] = 'table' + processed_info['headers'] = table.headers + processed_info['row_count'] = len(table.rows) + elif content_type == 'json': + data = json.loads(content) + # Process JSON recursively + result = self._anonymize_json_value(data) + processed_info['type'] = 'json' + return ProcessResult(result, self.mapping, replaced_fields, processed_info) + else: # xml + root = ET.fromstring(content) + # Process XML recursively with proper formatting + result = self._anonymize_xml_element(root) + processed_info['type'] = 'xml' + return ProcessResult(result, self.mapping, replaced_fields, processed_info) + + if not table.rows: + return ProcessResult(None, self.mapping, [], processed_info) + + anonymized_table = self._anonymize_table(table) + + # Track replaced fields + for i, header in enumerate(anonymized_table.headers): + for orig_row, anon_row in zip(table.rows, anonymized_table.rows): + if anon_row[i] != orig_row[i]: + replaced_fields.append(header) + + # Convert back to original format + if content_type == 'csv': + result = pd.DataFrame(anonymized_table.rows, columns=anonymized_table.headers) + elif content_type == 'json': + if len(anonymized_table.headers) == 1 and anonymized_table.headers[0] == 'value': + result = anonymized_table.rows[0][0] + else: + result = dict(zip(anonymized_table.headers, anonymized_table.rows[0])) + else: # xml + result = ET.tostring(root, encoding='unicode') + + return ProcessResult(result, self.mapping, replaced_fields, processed_info) + else: + # Handle as text + # First, identify what needs to be replaced using table detection + tables, plain_texts = self._extract_tables_from_text(content) + processed_info['type'] = 'text' + processed_info['tables'] = [{'headers': t.headers, 'row_count': len(t.rows)} for t in tables] + + # Process plain text sections + anonymized_texts = [self._anonymize_plain_text(text) for text in plain_texts] + + # Combine all processed content + result = content + for text, anonymized_text in zip(plain_texts, anonymized_texts): + if text.content != anonymized_text.content: + result = result.replace(text.content, anonymized_text.content) + + return ProcessResult(result, self.mapping, replaced_fields, processed_info) + + except Exception as e: + logger.error(f"Error processing content: {str(e)}") + logger.debug(traceback.format_exc()) + return ProcessResult(None, self.mapping, [], {'type': 'error', 'error': str(e)}) \ No newline at end of file diff --git a/modules/neutralizer/patterns.py b/modules/neutralizer/patterns.py new file mode 100644 index 00000000..4cfbed93 --- /dev/null +++ b/modules/neutralizer/patterns.py @@ -0,0 +1,402 @@ +""" +Pattern definitions for data anonymization +Separates header patterns from data patterns +""" + +from dataclasses import dataclass +from typing import List, Optional, Tuple +import re + +@dataclass +class Pattern: + """Base class for patterns""" + name: str + patterns: List[str] + replacement_template: str + +class HeaderPatterns: + """Patterns for identifying sensitive data in headers""" + patterns = [ + # Name patterns + Pattern( + name="name", + patterns=[ + # Simple variations + r'\b(?:name|first[-_\s]*name|last[-_\s]*name|full[-_\s]*name)\b', + r'\b(?:customer[-_\s]*name|client[-_\s]*name|user[-_\s]*name)\b', + r'\b(?:given[-_\s]*name|family[-_\s]*name|surname)\b', + # German variations + r'\b(?:vorname|nachname|vollständiger[-_\s]*name|name)\b', + r'\b(?:kunden[-_\s]*name|kunde[-_\s]*name|benutzer[-_\s]*name)\b', + # French variations + r'\b(?:prénom|nom|nom[-_\s]*complet)\b', + r'\b(?:nom[-_\s]*du[-_\s]*client|nom[-_\s]*d\'utilisateur)\b', + # Italian variations + r'\b(?:nome|cognome|nome[-_\s]*completo)\b', + r'\b(?:nome[-_\s]*cliente|nome[-_\s]*utente)\b', + # Common variations + r'\b(?:nom|name|nome|naam)\b' + ], + replacement_template="[NAME_{}]" + ), + + # Email patterns + Pattern( + name="email", + patterns=[ + # Simple variations - only labels + r'\b(?:email|e[-_\s]*mail|mail)\s*:?\b', + r'\b(?:contact[-_\s]*email|user[-_\s]*email|client[-_\s]*email)\s*:?\b', + r'\b(?:customer[-_\s]*email|customer[-_\s]*mail|customer[-_\s]*e[-_\s]*mail)\s*:?\b', + # German variations - only labels + r'\b(?:e[-_\s]*mail|e[-_\s]*post|mail[-_\s]*adresse)\s*:?\b', + r'\b(?:kontakt[-_\s]*email|benutzer[-_\s]*email|kunden[-_\s]*email)\s*:?\b', + r'\b(?:kunden[-_\s]*mail|kunden[-_\s]*e[-_\s]*mail|kunden[-_\s]*e[-_\s]*post)\s*:?\b', + # French variations - only labels + r'\b(?:courriel|e[-_\s]*mail|adresse[-_\s]*e[-_\s]*mail)\s*:?\b', + r'\b(?:courriel[-_\s]*de[-_\s]*contact|e[-_\s]*mail[-_\s]*client)\s*:?\b', + r'\b(?:courriel[-_\s]*client|courriel[-_\s]*utilisateur|mail[-_\s]*client)\s*:?\b', + # Italian variations - only labels + r'\b(?:posta[-_\s]*elettronica|e[-_\s]*mail|indirizzo[-_\s]*e[-_\s]*mail)\s*:?\b', + r'\b(?:email[-_\s]*cliente|email[-_\s]*utente)\s*:?\b', + r'\b(?:mail[-_\s]*cliente|mail[-_\s]*utente|posta[-_\s]*cliente)\s*:?\b' + ], + replacement_template="[EMAIL_{}]" + ), + + # Phone patterns + Pattern( + name="phone", + patterns=[ + # Simple variations + r'\b(?:phone|tel|telephone|mobile)\b', + r'\b(?:contact[-_\s]*number|phone[-_\s]*number|tel[-_\s]*number)\b', + # German variations + r'\b(?:telefon|mobil|handy|telefon[-_\s]*nummer)\b', + r'\b(?:kontakt[-_\s]*nummer|telefon[-_\s]*nummer|tel[-_\s]*nummer)\b', + # French variations + r'\b(?:téléphone|portable|mobile|numéro[-_\s]*de[-_\s]*téléphone)\b', + r'\b(?:numéro[-_\s]*de[-_\s]*contact|tél[-_\s]*fixe|tél[-_\s]*mobile)\b', + # Italian variations + r'\b(?:telefono|cellulare|mobile|numero[-_\s]*di[-_\s]*telefono)\b', + r'\b(?:numero[-_\s]*di[-_\s]*contatto|tel[-_\s]*fisso|tel[-_\s]*mobile)\b' + ], + replacement_template="[PHONE_{}]" + ), + + # IBAN patterns + Pattern( + name="iban", + patterns=[ + # Simple variations + r'\b(?:iban|bank[-_\s]*account|account[-_\s]*number)\b', + r'\b(?:bank[-_\s]*details|account[-_\s]*details|banking[-_\s]*info)\b', + # German variations + r'\b(?:iban|bank[-_\s]*konto|konto[-_\s]*nummer)\b', + r'\b(?:bank[-_\s]*verbindung|konto[-_\s]*verbindung|bank[-_\s]*daten)\b', + # French variations + r'\b(?:iban|compte[-_\s]*bancaire|numéro[-_\s]*de[-_\s]*compte)\b', + r'\b(?:coordonnées[-_\s]*bancaires|détails[-_\s]*bancaires)\b', + # Credit card variations in French + r'\b(?:carte[-_\s]*de[-_\s]*credit|carte[-_\s]*credit|numero[-_\s]*carte[-_\s]*credit)\b', + r'\b(?:carte[-_\s]*bancaire|carte[-_\s]*de[-_\s]*paiement)\b', + r'\b(?:carte[-_\s]*de[-_\s]*crédit|carte[-_\s]*crédit|numéro[-_\s]*carte[-_\s]*crédit)\b', + r'\b(?:carte[-_\s]*de[-_\s]*débit|carte[-_\s]*débit|numéro[-_\s]*carte[-_\s]*débit)\b', + # Italian variations + r'\b(?:iban|conto[-_\s]*bancario|numero[-_\s]*di[-_\s]*conto)\b', + r'\b(?:coordinate[-_\s]*bancarie|dettagli[-_\s]*bancari)\b', + # Common variations + r'\b(?:bankkonto|bank[-_\s]*konto|conto[-_\s]*di[-_\s]*banca)\b', + # Credit card variations + r'\b(?:credit[-_\s]*card|credit[-_\s]*card[-_\s]*number|credit[-_\s]*card[-_\s]*no)\b', + r'\b(?:credit[-_\s]*card[-_\s]*nr|credit[-_\s]*card[-_\s]*num)\b', + r'\b(?:credit[-_\s]*card[-_\s]*id|credit[-_\s]*card[-_\s]*code)\b', + r'\b(?:credit[-_\s]*card[-_\s]*reference|credit[-_\s]*card[-_\s]*ref)\b', + r'\b(?:credit[-_\s]*card[-_\s]*details|credit[-_\s]*card[-_\s]*info)\b', + r'\b(?:credit[-_\s]*card[-_\s]*data|credit[-_\s]*card[-_\s]*account)\b', + # Credit card variations in other languages + r'\b(?:kredit[-_\s]*karte|kreditkarte|kredit[-_\s]*karten[-_\s]*nummer)\b', + r'\b(?:carta[-_\s]*di[-_\s]*credito|carta[-_\s]*credito|numero[-_\s]*carta[-_\s]*credito)\b', + # Payment variations + r'\b(?:payment[-_\s]*details|payment[-_\s]*info|payment[-_\s]*data)\b', + r'\b(?:zahlungs[-_\s]*details|zahlungs[-_\s]*informationen|zahlungs[-_\s]*daten)\b', + r'\b(?:détails[-_\s]*de[-_\s]*paiement|informations[-_\s]*de[-_\s]*paiement)\b', + r'\b(?:dettagli[-_\s]*di[-_\s]*pagamento|informazioni[-_\s]*di[-_\s]*pagamento)\b', + # Common credit card abbreviations + r'\b(?:cc[-_\s]*number|cc[-_\s]*no|cc[-_\s]*nr)\b', + r'\b(?:cc[-_\s]*num|cc[-_\s]*id|cc[-_\s]*code)\b', + r'\b(?:cc[-_\s]*ref|cc[-_\s]*details|cc[-_\s]*info)\b', + r'\b(?:cc[-_\s]*data|cc[-_\s]*account)\b', + # Simple credit card + r'\b(?:credit[-_\s]*card|credit[-_\s]*card[-_\s]*number)\b', + # Additional credit card variations + r'\b(?:card[-_\s]*number|card[-_\s]*no|card[-_\s]*nr)\b', + r'\b(?:card[-_\s]*num|card[-_\s]*id|card[-_\s]*code)\b', + r'\b(?:card[-_\s]*ref|card[-_\s]*details|card[-_\s]*info)\b', + r'\b(?:card[-_\s]*data|card[-_\s]*account)\b' + ], + replacement_template="[IBAN_{}]" + ), + + # Address patterns + Pattern( + name="address", + patterns=[ + # English variations + r'\b(?:address|street[-_\s]*address|mailing[-_\s]*address)\b', + r'\b(?:home[-_\s]*address|work[-_\s]*address|billing[-_\s]*address)\b', + r'\b(?:.*address.*)\b', # Match any text containing "address" + # German variations + r'\b(?:adresse|strassen[-_\s]*adresse|post[-_\s]*adresse)\b', + r'\b(?:wohn[-_\s]*adresse|geschäfts[-_\s]*adresse|rechnungs[-_\s]*adresse)\b', + r'\b(?:.*adresse.*)\b', # Match any text containing "adresse" + # French variations + r'\b(?:adresse|adresse[-_\s]*postale|adresse[-_\s]*de[-_\s]*livraison)\b', + r'\b(?:adresse[-_\s]*personnelle|adresse[-_\s]*professionnelle)\b', + r'\b(?:.*adresse.*)\b', # Match any text containing "adresse" + # Italian variations + r'\b(?:indirizzo|indirizzo[-_\s]*postale|indirizzo[-_\s]*di[-_\s]*consegna)\b', + r'\b(?:indirizzo[-_\s]*personale|indirizzo[-_\s]*professionale)\b', + r'\b(?:.*indirizzo.*)\b', # Match any text containing "indirizzo" + # Common variations + r'\b(?:location|place|residence|domicile)\b', + r'\b(?:standort|ort|wohnort|domizil)\b', + r'\b(?:lieu|emplacement|résidence|domicile)\b', + r'\b(?:luogo|posizione|residenza|domicilio)\b' + ], + replacement_template="[ADDRESS_{}]" + ), + + # Date patterns + Pattern( + name="date", + patterns=[ + # English variations + r'\b(?:date|birth[-_\s]*date|date[-_\s]*of[-_\s]*birth)\b', + r'\b(?:dob|birthday|anniversary)\b', + # German variations + r'\b(?:datum|geburt[-_\s]*datum|geboren[-_\s]*am)\b', + r'\b(?:geburtstag|jubiläum|feier[-_\s]*tag)\b', + r'\b(?:geboren|geb\.|geboren[-_\s]*am)\b', + # French variations + r'\b(?:date|date[-_\s]*de[-_\s]*naissance|né[-_\s]*le)\b', + r'\b(?:anniversaire|date[-_\s]*anniversaire)\b', + r'\b(?:né|née|né[-_\s]*le)\b', + # Italian variations + r'\b(?:data|data[-_\s]*di[-_\s]*nascita|nato[-_\s]*il)\b', + r'\b(?:compleanno|anniversario)\b', + r'\b(?:nato|nata|nato[-_\s]*il)\b', + # Common variations + r'\b(?:birth|born|geboren|né|nato)\b' + ], + replacement_template="[DATE_{}]" + ), + + # SSN patterns + Pattern( + name="ssn", + patterns=[ + # English variations + r'\b(?:ssn|social[-_\s]*security[-_\s]*number|tax[-_\s]*id)\b', + r'\b(?:tax[-_\s]*identification|national[-_\s]*id)\b', + # German variations + r'\b(?:ahv[-_\s]*nummer|sozial[-_\s]*versicherungs[-_\s]*nummer)\b', + r'\b(?:steuer[-_\s]*nummer|steuer[-_\s]*id|svn)\b', + r'\b(?:ahv[-_\s]*nr|ahv[-_\s]*no|ahv[-_\s]*num)\b', + # French variations + r'\b(?:numéro[-_\s]*avs|numéro[-_\s]*de[-_\s]*sécurité[-_\s]*sociale)\b', + r'\b(?:numéro[-_\s]*fiscal|numéro[-_\s]*d\'identification)\b', + # Italian variations + r'\b(?:numero[-_\s]*avs|numero[-_\s]*di[-_\s]*sicurezza[-_\s]*sociale)\b', + r'\b(?:numero[-_\s]*fiscale|codice[-_\s]*fiscale)\b', + # Common variations + r'\b(?:ahv|svn|nss|avs)\b', + # Additional AHV variations + r'\b(?:ahv_nummer|ahvnummer|ahv-nummer|ahv_number)\b', + r'\b(?:ahv[-_\s]*nr|ahv[-_\s]*no|ahv[-_\s]*num)\b', + r'\b(?:ahv[-_\s]*number|ahv[-_\s]*number)\b', + r'\b(?:ahv[-_\s]*id|ahv[-_\s]*id)\b', + r'\b(?:ahv[-_\s]*code|ahv[-_\s]*code)\b', + r'\b(?:ahv[-_\s]*reference|ahv[-_\s]*reference)\b', + r'\b(?:ahv[-_\s]*reference[-_\s]*number|ahv[-_\s]*reference[-_\s]*number)\b', + r'\b(?:ahv[-_\s]*reference[-_\s]*no|ahv[-_\s]*reference[-_\s]*no)\b', + r'\b(?:ahv[-_\s]*reference[-_\s]*nr|ahv[-_\s]*reference[-_\s]*nr)\b', + r'\b(?:ahv[-_\s]*reference[-_\s]*num|ahv[-_\s]*reference[-_\s]*num)\b', + r'\b(?:ahv[-_\s]*reference[-_\s]*id|ahv[-_\s]*reference[-_\s]*id)\b', + r'\b(?:ahv[-_\s]*reference[-_\s]*code|ahv[-_\s]*reference[-_\s]*code)\b' + ], + replacement_template="[SSN_{}]" + ) + ] + +class DataPatterns: + """Patterns for identifying sensitive data in content""" + patterns = [ + # Name patterns + Pattern( + name="name", + patterns=[ + # Person names with titles and academic degrees + r'\b(?:Dr\.|Prof\.|PhD\.?|MD\.?|Herr|Frau|Mr\.|Mrs\.|Ms\.|Monsieur|Madame|Signore|Signora)\s+[A-Z][a-z]{2,}(?:\s+[A-Za-z]{2,}){1,2}\b' + ], + replacement_template="[NAME_{}]" + ), + + # Email pattern for plain text + Pattern( + name="email", + patterns=[ + # Basic email pattern + r'[A-Za-z0-9._%+-]+@[A-Za-z0-9-]+(?:\.[A-Za-z0-9-]+)*' + ], + replacement_template="[EMAIL_{}]" + ), + + # Phone patterns + Pattern( + name="phone", + patterns=[ + # International format + r'\+\d{1,3}[-.\s]?\d{1,4}[-.\s]?\d{1,4}[-.\s]?\d{1,9}\b', + # Swiss format + r'\b(?:0\d{1,2}|0041\d{1,2})[-.\s]?\d{3}[-.\s]?\d{2}[-.\s]?\d{2}\b', + # German format + r'\b(?:0\d{1,4}|0049\d{1,4})[-.\s]?\d{3,}[-.\s]?\d{3,}\b', + # French format + r'\b(?:0\d{1,2}|0033\d{1,2})[-.\s]?\d{1,2}[-.\s]?\d{2}[-.\s]?\d{2}[-.\s]?\d{2}\b', + # Italian format + r'\b(?:0\d{1,3}|0039\d{1,3})[-.\s]?\d{3,}[-.\s]?\d{3,}\b', + # Mobile numbers + r'\b(?:07|00417|004917|00337|00397)\d{8,9}\b', + # Emergency numbers + r'\b(?:112|911|118|117|144|1414)\b' + ], + replacement_template="[PHONE_{}]" + ), + + # IBAN patterns + Pattern( + name="iban", + patterns=[ + r'\b(?:CH|DE|FR|IT)\d{2}\s?(?:\d{4}\s?){5}\d{2}\b', + r'\b(?:CH|DE|FR|IT)\d{2}(?:\d{4}){5}\d{2}\b' + ], + replacement_template="[IBAN_{}]" + ), + + # Address patterns + Pattern( + name="address", + patterns=[ + r'\b(?:[A-Za-zäöüßÄÖÜ]+(?:strasse|str\.|gasse|weg|platz|allee|boulevard|avenue|via|strada|rue|chemin|route))\s+\d{1,4}(?:[a-z])?\b', + r'\b\d{4}\s+[A-Za-zäöüßÄÖÜ]+\b' + ], + replacement_template="[ADDRESS_{}]" + ), + + # Date patterns + Pattern( + name="date", + patterns=[ + # Specific date formats with context + r'\b(?:geboren|birth|né|nato)\s+am\s+[0-9]{2}[./-][0-9]{2}[./-][0-9]{4}\b', # Birth dates + r'\b(?:geboren|birth|né|nato)\s+am\s+[0-9]{4}[./-][0-9]{2}[./-][0-9]{2}\b', # Birth dates + r'\b(?:vertrag|contract|contrat|contratto)\s+vom\s+[0-9]{2}[./-][0-9]{2}[./-][0-9]{4}\b', # Contract dates + r'\b(?:vertrag|contract|contrat|contratto)\s+vom\s+[0-9]{4}[./-][0-9]{2}[./-][0-9]{2}\b', # Contract dates + # Specific date formats with month names + r'\b(?:geboren|birth|né|nato)\s+am\s+(?:jan|feb|mar|apr|mai|jun|jul|aug|sep|okt|nov|dez|januar|februar|märz|april|mai|juni|juli|august|september|oktober|november|dezember)[a-z]*\s+\d{4}\b', # Birth dates with month + r'\b(?:vertrag|contract|contrat|contratto)\s+vom\s+(?:jan|feb|mar|apr|mai|jun|jul|aug|sep|okt|nov|dez|januar|februar|märz|april|mai|juni|juli|august|september|oktober|november|dezember)[a-z]*\s+\d{4}\b' # Contract dates with month + ], + replacement_template="[DATE_{}]" + ), + + # SSN patterns + Pattern( + name="ssn", + patterns=[ + r'\b(?:756|757|758|759)\.\d{4}\.\d{4}\.\d{2}\b', # Swiss AHV + r'\b(?:CHE|DE|FR|IT)-\d{3}\.\d{3}\.\d{3}\b', # Company IDs + r'\b\d{3}\.\d{3}\.\d{3}\b' # Generic SSN format + ], + replacement_template="[SSN_{}]" + ) + ] + +class TextTablePatterns: + """Patterns for identifying table-like structures in text""" + + @staticmethod + def get_patterns() -> List[Tuple[str, str]]: + return [ + # key: value pattern (with optional whitespace) + (r'^([^:]+):\s*(.+)$', ':'), + # key = value pattern (with optional whitespace) + (r'^([^=]+)=\s*(.+)$', '='), + # key = value pattern (with required whitespace) + (r'^([^=]+)\s+=\s+(.+)$', '='), + # key: value pattern (with required whitespace) + (r'^([^:]+)\s+:\s+(.+)$', ':'), + ] + + @staticmethod + def is_table_line(line: str) -> bool: + """Check if a line matches any table pattern""" + patterns = TextTablePatterns.get_patterns() + return any(re.match(pattern[0], line.strip()) for pattern in patterns) + + @staticmethod + def extract_key_value(line: str) -> Optional[Tuple[str, str]]: + """Extract key and value from a table line""" + patterns = TextTablePatterns.get_patterns() + for pattern, separator in patterns: + match = re.match(pattern, line.strip()) + if match: + key = match.group(1).strip() + value = match.group(2).strip() + return key, value + return None + +def get_pattern_for_header(header: str, patterns: List[Pattern]) -> Optional[Pattern]: + """ + Find matching pattern for a header + + Args: + header: The header to check + patterns: List of patterns to check against + + Returns: + Optional[Pattern]: Matching pattern or None + """ + if not header: + return None + + header = header.lower().strip() + + for pattern in patterns: + for p in pattern.patterns: + if re.search(p, header, re.IGNORECASE): + return pattern + return None + +def find_patterns_in_text(text: str, patterns: List[Pattern]) -> List[tuple]: + """ + Find all pattern matches in text + + Args: + text: Text to search + patterns: List of patterns to check + + Returns: + List[tuple]: List of (pattern_name, match, start, end) + """ + matches = [] + for pattern in patterns: + for p in pattern.patterns: + if pattern.name == 'email': + print(f"\nDEBUG: Checking email pattern '{p}'") + for match in re.finditer(p, text, re.IGNORECASE): + if pattern.name == 'email': + print(f"DEBUG: Found email match: '{match.group(0)}' at position {match.start()}-{match.end()}") + print(f"DEBUG: Context: '{text[max(0, match.start()-20):match.end()+20]}'") + matches.append((pattern.name, match.group(0), match.start(), match.end())) + return sorted(matches, key=lambda x: x[2]) # Sort by start position \ No newline at end of file diff --git a/modules/shared/mimeUtils.py b/modules/shared/mimeUtils.py deleted file mode 100644 index 9c70cd60..00000000 --- a/modules/shared/mimeUtils.py +++ /dev/null @@ -1,67 +0,0 @@ -""" -Utility functions for MIME type handling and file format determination. -""" - -def isTextMimeType(mimeType: str) -> bool: - """ - Determines if a MIME type represents a text format that should not be base64 encoded. - - Args: - mimeType: The MIME type to check - - Returns: - True if the content is a text format, False otherwise - """ - return ( - mimeType.startswith("text/") or - mimeType in [ - "application/json", - "application/xml", - "application/javascript", - "application/x-python", - "image/svg+xml" - ] - ) - -def determineContentEncoding(fileName: str, content: any, mimeType: str = None) -> bool: - """ - Determines if content should be base64 encoded based on file type and MIME type. - - Args: - fileName: Name of the file including extension - content: The content of the file - mimeType: Optional MIME type of the content - - Returns: - True if content should be base64 encoded, False otherwise - """ - # If MIME type is provided, use it for determination - if mimeType: - if isTextMimeType(mimeType): - return False if isinstance(content, str) else True - - # Import here to avoid circular imports - import os - - # Extract file extension - _, extension = os.path.splitext(fileName) - extension = extension.lower().lstrip('.') - - # Determine if we should base64 encode based on file type - text_extensions = {'txt', 'csv', 'json', 'xml', 'html', 'md', 'svg', 'js', 'css', 'py'} - - # If it's a text format and content is a string, don't base64 encode - if extension in text_extensions and isinstance(content, str): - return False - - # For binary formats, always base64 encode - binary_extensions = {'jpg', 'jpeg', 'png', 'gif', 'pdf', 'doc', 'docx', 'xls', 'xlsx', 'zip', 'rar'} - if extension in binary_extensions: - return True - - # If content is bytes, base64 encode regardless of extension - if isinstance(content, bytes): - return True - - # Default for unknown types - return not isinstance(content, str) \ No newline at end of file diff --git a/modules/workflow/documentService.py b/modules/workflow/documentService.py new file mode 100644 index 00000000..79e215c9 --- /dev/null +++ b/modules/workflow/documentService.py @@ -0,0 +1,106 @@ +""" +Document Manager Module for handling document operations and content extraction. +""" + +import base64 +import logging +from typing import List, Optional, Dict, Any, Union +from pathlib import Path +import uuid + +from modules.interfaces.serviceChatModel import ( + ChatDocument, + TaskDocument, + ExtractedContent, + ContentItem, + ContentMetadata +) +from modules.workflow.serviceContainer import ServiceContainer +from modules.workflow.processorDocument import DocumentProcessor + +logger = logging.getLogger(__name__) + +class DocumentManager: + """Manager for document operations and content extraction""" + + def __init__(self, serviceContainer: ServiceContainer): + self.service = serviceContainer + self._processor = DocumentProcessor() + + async def extractFromChatDocument(self, prompt: str, document: ChatDocument) -> ExtractedContent: + """ + Extract content from a ChatDocument with AI processing. + + Args: + prompt: Prompt for AI content extraction + document: The ChatDocument to process + + Returns: + ExtractedContent containing the processed content + """ + # Convert ChatDocument to TaskDocument + taskDoc = await self._convertToTaskDocument(document) + + # Process document using processor + extractedContent = await self._processor.processDocument(taskDoc, prompt) + + # Update the objectId and objectType to reference the original ChatDocument + extractedContent.objectId = document.id + extractedContent.objectType = "ChatDocument" + + return extractedContent + + async def extractFromTaskDocument(self, prompt: str, document: TaskDocument) -> ExtractedContent: + """ + Extract content directly from a task document. + + Args: + prompt: The prompt to use for content extraction + document: The task document to extract content from + + Returns: + ExtractedContent containing the processed content + + Raises: + ValueError: If document is invalid + IOError: If file cannot be read + """ + try: + return await self._processor.processDocument(document, prompt) + except Exception as e: + logger.error(f"Error extracting from task document: {str(e)}") + raise + + async def _convertToTaskDocument(self, chatDoc: ChatDocument) -> TaskDocument: + """ + Convert a ChatDocument to a TaskDocument. + + Args: + chatDoc: The chat document to convert + + Returns: + TaskDocument containing the converted data + + Raises: + ValueError: If document is invalid + IOError: If file cannot be read + """ + try: + # Get file content + fileContent = await self.service.functions.getFileData(chatDoc.fileId) + if not fileContent: + raise ValueError(f"Could not get content for file {chatDoc.fileId}") + + # Convert to base64 + base64Data = base64.b64encode(fileContent).decode('utf-8') + + return TaskDocument( + id=str(uuid.uuid4()), + filename=chatDoc.filename, + fileSize=chatDoc.fileSize, + mimeType=chatDoc.mimeType, + data=base64Data + ) + except Exception as e: + logger.error(f"Error converting chat document to task document: {str(e)}") + raise diff --git a/modules/workflow/managerChat.py b/modules/workflow/managerChat.py index 9518ef2e..1620d76e 100644 --- a/modules/workflow/managerChat.py +++ b/modules/workflow/managerChat.py @@ -2,15 +2,18 @@ import logging import importlib import pkgutil import inspect -from typing import Dict, Any, Optional, List, Type +from typing import Dict, Any, Optional, List, Type, Callable, Awaitable from datetime import datetime, UTC import json import asyncio +import base64 from modules.methods.methodBase import MethodBase, AuthSource, MethodResult from modules.workflow.serviceContainer import ServiceContainer -from modules.interfaces.serviceChatModel import AgentTask, AgentAction, AgentResult, Action, TaskStatus -from modules.workflow.managerPrompt import AIPromptManager +from modules.interfaces.serviceChatModel import ( + AgentTask, AgentAction, AgentResult, Action, TaskStatus, ChatWorkflow, + ChatMessage, ChatDocument, ChatStat, ExtractedContent, ContentItem +) from modules.workflow.processorDocument import DocumentProcessor from modules.shared.configuration import APP_CONFIG @@ -21,33 +24,34 @@ class ChatManager: def __init__(self): self.service = ServiceContainer() - self._discover_methods() - self.workflow = None - self.current_task = None - self.workflow_history = [] + self._discoverMethods() + self.workflow: Optional[ChatWorkflow] = None + self.currentTask: Optional[AgentTask] = None + self.workflowHistory: List[ChatMessage] = [] + self.documentProcessor = DocumentProcessor() - def _discover_methods(self): + def _discoverMethods(self): """Dynamically discover all method classes in modules.methods package""" try: # Import the methods package - methods_package = importlib.import_module('modules.methods') + methodsPackage = importlib.import_module('modules.methods') # Discover all modules in the package - for _, name, is_pkg in pkgutil.iter_modules(methods_package.__path__): - if not is_pkg and name.startswith('method'): + for _, name, isPkg in pkgutil.iter_modules(methodsPackage.__path__): + if not isPkg and name.startswith('method'): try: # Import the module module = importlib.import_module(f'modules.methods.{name}') # Find all classes in the module that inherit from MethodBase - for item_name, item in inspect.getmembers(module): + for itemName, item in inspect.getmembers(module): if (inspect.isclass(item) and issubclass(item, MethodBase) and item != MethodBase): # Instantiate the method and add to service - method_instance = item() - self.service.methods[method_instance.name] = method_instance - logger.info(f"Discovered method: {method_instance.name}") + methodInstance = item() + self.service.methods[methodInstance.name] = methodInstance + logger.info(f"Discovered method: {methodInstance.name}") except Exception as e: logger.error(f"Error loading method module {name}: {str(e)}") @@ -55,37 +59,58 @@ class ChatManager: except Exception as e: logger.error(f"Error discovering methods: {str(e)}") - async def initialize(self, workflow: Any, context: Dict[str, Any]) -> None: - """Initialize chat manager with workflow and context""" + async def initialize(self, workflow: ChatWorkflow) -> None: + """Initialize chat manager with workflow""" self.service.workflow = workflow - self.service.context = context # Initialize AI model self.service.model = { - 'callAiBasic': self._call_ai_basic, - 'callAiAdvanced': self._call_ai_advanced + 'callAiBasic': self._callAiBasic, + 'callAiAdvanced': self._callAiAdvanced } # Initialize document processor - self.service.document_processor.initialize(context) + self.service.documentProcessor.initialize() - async def create_initial_task(self, user_input: Dict[str, Any]) -> AgentTask: + def _generatePrompt(self, task: str, document: ChatDocument, examples: List[Dict[str, str]] = None) -> str: + """Generate a prompt based on task and document""" + try: + # Create base prompt + prompt = f"""Task: {task} +Document: {document.filename} ({document.mimeType}) + +""" + + # Add examples if provided + if examples: + prompt += "\nExamples:\n" + for example in examples: + prompt += f"Input: {example.get('input', '')}\n" + prompt += f"Output: {example.get('output', '')}\n\n" + + return prompt + + except Exception as e: + logger.error(f"Error generating prompt: {str(e)}") + return "" + + async def createInitialTask(self, userInput: Dict[str, Any]) -> AgentTask: """Create initial task from user input""" # Get available methods and their actions - method_catalog = self.service.get_available_methods() + methodCatalog = self.service.getAvailableMethods() # Process user input with AI - processed_input = await self._process_user_input(user_input, method_catalog) + processedInput = await self._processUserInput(userInput, methodCatalog) # Create actions from processed input - actions = await self._create_actions(processed_input['actions']) + actions = await self._createActions(processedInput['actions']) # Create task task = AgentTask( id=f"task_{datetime.now(UTC).timestamp()}", workflowId=self.workflow.id, - userInput=processed_input['objective'], - dataList=user_input.get('connections', []), + userInput=processedInput['objective'], + dataList=userInput.get('connections', []), actionList=actions, status=TaskStatus.PENDING, createdAt=datetime.now(UTC), @@ -96,15 +121,15 @@ class ChatManager: self.service.tasks['current'] = task return task - async def execute_current_task(self) -> None: + async def executeCurrentTask(self) -> None: """Execute current task""" task = self.service.tasks.get('current') if not task: raise ValueError("No current task to execute") - await self.service.execute_task(task) + await self.service.executeTask(task) - async def define_next_task(self) -> Optional[AgentTask]: + async def defineNextTask(self) -> Optional[AgentTask]: """Define next task based on current task results""" current_task = self.service.tasks.get('current') if not current_task: @@ -112,7 +137,7 @@ class ChatManager: try: # Analyze task results - analysis = await self._analyze_task_results(current_task) + analysis = await self._analyzeTaskResults(current_task) # If workflow is complete, update task status if analysis['isComplete']: @@ -122,7 +147,7 @@ class ChatManager: # If more actions needed, create next task if not analysis['isComplete']: - next_task = self._create_next_task(current_task, analysis) + next_task = self._createNextTask(current_task, analysis) self.service.tasks['previous'] = current_task self.service.tasks['current'] = next_task return next_task @@ -133,15 +158,15 @@ class ChatManager: current_task.updatedAt = datetime.now(UTC) return None - async def _process_user_input(self, user_input: Dict[str, Any], method_catalog: Dict[str, Any]) -> Dict[str, Any]: + async def _processUserInput(self, userInput: Dict[str, Any], methodCatalog: Dict[str, Any]) -> Dict[str, Any]: """Process user input with AI to extract objectives and actions""" # Create prompt with available methods and actions prompt = f"""Given the following user input and available methods/actions, extract the objective and required actions: -User Input: {user_input.get('message', '')} +User Input: {userInput.get('message', '')} Available Methods and Actions: -{json.dumps(method_catalog, indent=2)} +{json.dumps(methodCatalog, indent=2)} Please provide a JSON response with: 1. objective: The main goal or task to accomplish @@ -164,22 +189,22 @@ Example format: """ # Call AI service - response = await self.service.model['callAiBasic'](prompt) + response = await self._callAiBasic(prompt) return json.loads(response) - async def _create_actions(self, actions_data: List[Dict[str, Any]]) -> List[AgentAction]: + async def _createActions(self, actionsData: List[Dict[str, Any]]) -> List[AgentAction]: """Create action objects from processed input""" actions = [] - for action_data in actions_data: - method = self.service.get_method(action_data['method']) + for actionData in actionsData: + method = self.service.getMethod(actionData['method']) if not method: continue action = AgentAction( id=f"action_{datetime.now(UTC).timestamp()}", - method=action_data['method'], - action=action_data['action'], - parameters=action_data.get('parameters', {}), + method=actionData['method'], + action=actionData['action'], + parameters=actionData.get('parameters', {}), status=TaskStatus.PENDING, createdAt=datetime.now(UTC), updatedAt=datetime.now(UTC) @@ -188,7 +213,7 @@ Example format: return actions - async def _summarize_workflow(self) -> str: + async def _summarizeWorkflow(self) -> str: """Summarize workflow history""" if not self.workflow.messages: return "" @@ -203,12 +228,12 @@ Example format: 4. Any issues or blockers """ - return await self.service.model['callAiBasic'](prompt) + return await self._callAiBasic(prompt) - async def _analyze_task_results(self, task: AgentTask) -> Dict[str, Any]: + async def _analyzeTaskResults(self, task: AgentTask) -> Dict[str, Any]: """Analyze task results to determine next steps""" # Get workflow summary - summary = await self._summarize_workflow() + summary = await self._summarizeWorkflow() # Create prompt for analysis prompt = f"""Analyze the following task results and workflow history to determine next steps: @@ -240,10 +265,10 @@ Example format: }} """ - response = await self.service.model['callAiBasic'](prompt) + response = await self._callAiBasic(prompt) return json.loads(response) - def _create_next_task(self, current_task: AgentTask, analysis: Dict[str, Any]) -> AgentTask: + def _createNextTask(self, current_task: AgentTask, analysis: Dict[str, Any]) -> AgentTask: """Create next task based on analysis""" # Create actions for next task actions = [] @@ -271,20 +296,20 @@ Example format: updatedAt=datetime.now(UTC) ) - async def process_task(self, task: Any) -> Dict[str, Any]: + async def processTask(self, task: AgentTask) -> Dict[str, Any]: """Process a task with improved error handling and AI integration""" try: # Execute task - await self.service.execute_task(task) + await self.service.executeTask(task) # Process results - if task.status == 'success': + if task.status == TaskStatus.COMPLETED: # Generate feedback using AI - feedback = await self._process_task_results(task) + feedback = await self._processTaskResults(task) task.thisTaskFeedback = feedback # Create output documents - documents = await self._create_output_documents(task) + documents = await self._createOutputDocuments(task) task.documentsOutput = documents return { @@ -307,89 +332,168 @@ Example format: "feedback": f"Error processing task: {str(e)}" } - async def _process_task_results(self, task: Any) -> str: - """Process task results and generate feedback using AI""" + def _generateDocumentPrompt(self, task: str) -> str: + """Generate a prompt for document generation""" + return f"""Generate output documents for the following task: + +Task: {task} + +For each document you need to generate, provide a TaskDocument object with the following structure: +{{ + "filename": "string", # Filename with extension + "mimeType": "string", # MIME type of the file + "data": "string", # File content as text or base64 + "base64Encoded": boolean # True if data is base64 encoded +}} + +Rules: +1. For text files (txt, json, xml, etc.), provide content directly in the data field +2. For binary files (images, videos, etc.), encode content in base64 and set base64Encoded to true +3. Use appropriate MIME types (e.g., text/plain, image/jpeg, application/pdf) +4. Include file extensions in filenames + +Return a JSON array of TaskDocument objects. +""" + + async def _processTaskResults(self, task: AgentTask) -> str: + """Process task results and generate feedback""" try: - # Create context for AI - context = { - "task": "Process task results", - "document": {"name": "Task Results", "type": "json"} - } + # Generate document prompt + docPrompt = self._generateDocumentPrompt(task.userInput) - # Generate prompt - prompt = self.service.prompt_manager.generate_prompt( - context, - [ - {"input": "Task results", "output": "Generate summary"} - ] - ) + # Get AI response for document generation + docResponse = await self._callAiBasic(docPrompt) - # Call AI - response = await self.service.model['callAiBasic']( - f"""Process task results and generate feedback: - Task Input: {task.userInput} - Method Results: {task.result} - Generated Documents: {task.documentsOutput} + # Parse response into TaskDocument objects + try: + taskDocs = json.loads(docResponse) + task.documentsOutput = taskDocs + except json.JSONDecodeError as e: + logger.error(f"Error parsing document response: {str(e)}") + return f"Error processing results: {str(e)}" + + # Generate feedback + feedback = await self._callAiBasic( + f"""Generate feedback for the completed task: + Task: {task.userInput} + Generated Documents: {len(task.documentsOutput)} files - {prompt} - - Please provide: - 1. Summary of completed actions - 2. Generated document descriptions - 3. Next steps or completion status - - Format your response as JSON: - {{ - "summary": "string", - "documents": ["string"], - "nextSteps": ["string"] - }} + Provide a concise summary of what was accomplished. """ ) - # Parse and validate response - try: - result = json.loads(response) - return result.get("summary", "Task completed successfully") - except json.JSONDecodeError: - return response.strip() - + return feedback + except Exception as e: logger.error(f"Error processing task results: {str(e)}") return f"Error processing results: {str(e)}" - async def _create_output_documents(self, task: Any) -> List[Dict[str, Any]]: + async def _createOutputDocuments(self, task: AgentTask) -> List[ChatDocument]: """Create output documents from task results""" try: - documents = [] + fileIds = [] - # Process each document - for doc in task.documentsOutput: - processed = self.service.document_processor.process_with_context( - doc, - { - "id": doc.get("id", ""), - "extractionHistory": doc.get("extractionHistory", []), - "relevantSections": doc.get("relevantSections", []), - "processingStatus": doc.get("processingStatus", {}) - } + # Process each TaskDocument from AI output + for taskDoc in task.documentsOutput: + # Store file in database + fileItem = self.service.functions.createFile( + name=taskDoc.filename, + mimeType=taskDoc.mimeType ) - if processed: - documents.append(processed) + # Store file content + if taskDoc.base64Encoded: + # Decode base64 content + content = base64.b64decode(taskDoc.data) + else: + # Use text content directly + content = taskDoc.data.encode('utf-8') + + # Store file data + self.service.functions.createFileData(fileItem.id, content) + fileIds.append(fileItem.id) - return documents + # Convert all files to ChatDocuments in one call + if fileIds: + return await self.service.chat.processFileIds(fileIds) + return [] except Exception as e: logger.error(f"Error creating output documents: {str(e)}") return [] - async def _call_ai_basic(self, prompt: str) -> str: - """Call basic AI model""" - # TODO: Implement actual AI call - return "AI response placeholder" + async def _callAiBasic(self, prompt: str) -> str: + """Call basic AI service""" + try: + if not self.service or not self.service.base: + raise ValueError("Service or base interface not initialized") + return await self.service.base.callAi([ + {"role": "system", "content": "You are an AI assistant that helps process user requests."}, + {"role": "user", "content": prompt} + ]) + except Exception as e: + logger.error(f"Error calling AI service: {str(e)}") + raise - async def _call_ai_advanced(self, prompt: str, context: Dict[str, Any]) -> str: + async def _callAiAdvanced(self, prompt: str, context: Dict[str, Any]) -> str: """Call advanced AI model with context""" # TODO: Implement actual AI call - return "AI response placeholder" \ No newline at end of file + return "AI response placeholder" + + async def generateWorkflowFeedback(self, workflow: ChatWorkflow) -> str: + """ + Generates a final feedback message for the workflow in the user's language. + + Args: + workflow: The completed workflow to generate feedback for + + Returns: + str: The generated feedback message + """ + try: + # Get workflow summary + workflowSummary = { + "status": workflow.status, + "totalMessages": len(workflow.messages), + "totalDocuments": sum(len(msg.documents) for msg in workflow.messages), + "duration": (datetime.now(UTC) - datetime.fromisoformat(workflow.startedAt)).total_seconds() + } + + # Get user language from workflow mandate + userLanguage = workflow.mandateId.split('_')[0] if workflow.mandateId else 'en' + + # Prepare messages for AI context + messages = [ + { + "role": "system", + "content": f"You are an AI assistant providing a summary of a completed workflow. " + f"Please respond in '{userLanguage}' language. " + f"Summarize the workflow's activities, outcomes, and any important points. " + f"Be concise but informative. Use a professional but friendly tone." + }, + { + "role": "user", + "content": f"Please provide a summary of this workflow:\n" + f"Status: {workflowSummary['status']}\n" + f"Total Messages: {workflowSummary['totalMessages']}\n" + f"Total Documents: {workflowSummary['totalDocuments']}\n" + f"Duration: {workflowSummary['duration']:.1f} seconds" + } + ] + + # Add relevant workflow messages for context + for msg in workflow.messages: + if msg.role == "user" or msg.status in ["first", "last"]: + messages.append({ + "role": msg.role, + "content": msg.message + }) + + # Generate feedback using AI + feedback = await self.service.aiService.callApi(messages, temperature=0.7) + + return feedback + + except Exception as e: + logger.error(f"Error generating workflow feedback: {str(e)}") + return "Workflow completed successfully." \ No newline at end of file diff --git a/modules/workflow/managerDocument.py b/modules/workflow/managerDocument.py index 72c13af3..79e215c9 100644 --- a/modules/workflow/managerDocument.py +++ b/modules/workflow/managerDocument.py @@ -1,478 +1,106 @@ -from typing import Dict, Any, Optional, List -import logging -import json -import os -from datetime import datetime, UTC -from pathlib import Path -import mimetypes -import hashlib -import shutil -import uuid -import base64 +""" +Document Manager Module for handling document operations and content extraction. +""" +import base64 +import logging +from typing import List, Optional, Dict, Any, Union +from pathlib import Path +import uuid + +from modules.interfaces.serviceChatModel import ( + ChatDocument, + TaskDocument, + ExtractedContent, + ContentItem, + ContentMetadata +) +from modules.workflow.serviceContainer import ServiceContainer from modules.workflow.processorDocument import DocumentProcessor -from modules.shared.configuration import APP_CONFIG -from modules.interfaces.serviceChatModel import ChatDocument, ChatContent logger = logging.getLogger(__name__) class DocumentManager: - """Document manager with enhanced operations and file handling""" + """Manager for document operations and content extraction""" - _instance = None + def __init__(self, serviceContainer: ServiceContainer): + self.service = serviceContainer + self._processor = DocumentProcessor() - @classmethod - def getInstance(cls): - """Return a singleton instance of the document manager.""" - if cls._instance is None: - cls._instance = cls() - return cls._instance - - def __init__(self): - """Initialize document manager""" - if DocumentManager._instance is not None: - raise RuntimeError("Singleton instance already exists - use getInstance()") + async def extractFromChatDocument(self, prompt: str, document: ChatDocument) -> ExtractedContent: + """ + Extract content from a ChatDocument with AI processing. + + Args: + prompt: Prompt for AI content extraction + document: The ChatDocument to process - self.processor = DocumentProcessor() - self.document_cache = {} - self.temp_dir = Path(APP_CONFIG.get('temp_dir', 'temp')) - self.output_dir = Path(APP_CONFIG.get('output_dir', 'output')) - self.service = None + Returns: + ExtractedContent containing the processed content + """ + # Convert ChatDocument to TaskDocument + taskDoc = await self._convertToTaskDocument(document) - async def initialize(self, context: Dict[str, Any], service=None) -> None: - """Initialize document manager with context and service""" - # Initialize processor - self.processor.initialize(context) + # Process document using processor + extractedContent = await self._processor.processDocument(taskDoc, prompt) - # Initialize service container - if service: - # Validate required interfaces - required_interfaces = ['base', 'msft', 'google'] - missing_interfaces = [] - for interface in required_interfaces: - if not hasattr(service, interface): - missing_interfaces.append(interface) + # Update the objectId and objectType to reference the original ChatDocument + extractedContent.objectId = document.id + extractedContent.objectType = "ChatDocument" + + return extractedContent + + async def extractFromTaskDocument(self, prompt: str, document: TaskDocument) -> ExtractedContent: + """ + Extract content directly from a task document. + + Args: + prompt: The prompt to use for content extraction + document: The task document to extract content from - if missing_interfaces: - logger.warning(f"Service container missing required interfaces: {', '.join(missing_interfaces)}") - return False - - self.service = service - - # Create directories if they don't exist - self.temp_dir.mkdir(parents=True, exist_ok=True) - self.output_dir.mkdir(parents=True, exist_ok=True) - - # Clear temporary directory - self._clear_temp_directory() - - def _clear_temp_directory(self) -> None: - """Clear temporary directory""" + Returns: + ExtractedContent containing the processed content + + Raises: + ValueError: If document is invalid + IOError: If file cannot be read + """ try: - if self.temp_dir.exists(): - shutil.rmtree(self.temp_dir) - self.temp_dir.mkdir(parents=True) + return await self._processor.processDocument(document, prompt) except Exception as e: - logger.error(f"Error clearing temp directory: {str(e)}") + logger.error(f"Error extracting from task document: {str(e)}") + raise - async def process_document(self, document: Dict[str, Any], context: Dict[str, Any]) -> Dict[str, Any]: - """Process a document with context""" - try: - # Generate document ID if not present - if 'id' not in document: - document['id'] = self._generate_document_id(document) + async def _convertToTaskDocument(self, chatDoc: ChatDocument) -> TaskDocument: + """ + Convert a ChatDocument to a TaskDocument. + + Args: + chatDoc: The chat document to convert - # Process document content - processed = await self.processor.process_with_context(document, context) + Returns: + TaskDocument containing the converted data - # Add metadata - processed['metadata'] = { - 'processedAt': datetime.now(UTC).isoformat(), - 'processor': 'DocumentManager', - 'version': '1.0' - } - - # Cache document - self.document_cache[document['id']] = processed - - return processed - - except Exception as e: - logger.error(f"Error processing document: {str(e)}") - return { - 'id': document.get('id', ''), - 'error': str(e), - 'status': 'error' - } - - async def extract_content(self, file_id: str) -> Optional[ChatDocument]: - """Extract content from a file""" + Raises: + ValueError: If document is invalid + IOError: If file cannot be read + """ try: # Get file content - file_content = await self.get_file_content(file_id) - if not file_content: - return None - - # Get file metadata - file_metadata = await self.get_file_metadata(file_id) - if not file_metadata: - return None + fileContent = await self.service.functions.getFileData(chatDoc.fileId) + if not fileContent: + raise ValueError(f"Could not get content for file {chatDoc.fileId}") - # Create ChatDocument - return ChatDocument( + # Convert to base64 + base64Data = base64.b64encode(fileContent).decode('utf-8') + + return TaskDocument( id=str(uuid.uuid4()), - fileId=file_id, - filename=file_metadata.get("name", "Unknown"), - fileSize=file_metadata.get("size", 0), - content=file_content.decode('utf-8', errors='ignore'), - mimeType=file_metadata.get("mimeType", "text/plain") + filename=chatDoc.filename, + fileSize=chatDoc.fileSize, + mimeType=chatDoc.mimeType, + data=base64Data ) except Exception as e: - logger.error(f"Error extracting content from file {file_id}: {str(e)}") - return None - - async def get_file_content(self, file_id: str) -> Optional[bytes]: - """Get file content""" - try: - if not self.service or not self.service.functions: - logger.error("Service or functions not initialized") - return None - return self.service.functions.getFileData(file_id) - except Exception as e: - logger.error(f"Error getting file content for {file_id}: {str(e)}") - return None - - async def get_file_metadata(self, file_id: str) -> Optional[Dict[str, Any]]: - """Get file metadata""" - try: - if not self.service or not self.service.functions: - logger.error("Service or functions not initialized") - return None - return self.service.functions.getFile(file_id) - except Exception as e: - logger.error(f"Error getting file metadata for {file_id}: {str(e)}") - return None - - async def save_file(self, filename: str, content: bytes, mime_type: str) -> Optional[int]: - """Save a new file""" - try: - if not self.service or not self.service.base: - logger.error("Service or base interface not initialized") - return None - return await self.service.base.saveFile(filename, content, mime_type) - except Exception as e: - logger.error(f"Error saving file {filename}: {str(e)}") - return None - - async def delete_file(self, file_id: str) -> bool: - """Delete a file""" - try: - if not self.service or not self.service.functions: - logger.error("Service or functions not initialized") - return False - return self.service.functions.deleteFile(file_id) - except Exception as e: - logger.error(f"Error deleting file {file_id}: {str(e)}") - return False - - def convert_file_ref_to_id(self, ref: str) -> Optional[int]: - """Convert file reference to ID""" - try: - if isinstance(ref, str) and ';' in ref: - return int(ref.split(';')[1]) - return int(ref) - except Exception as e: - logger.error(f"Error converting file reference to ID: {str(e)}") - return None - - def convert_file_id_to_ref(self, file_id: str) -> Optional[str]: - """Convert file ID to reference""" - try: - if not self.service or not self.service.functions: - logger.error("Service or functions not initialized") - return None - - file = self.service.functions.getFile(file_id) - if not file: - return None - return f"{file.filename};{file_id}" - except Exception as e: - logger.error(f"Error converting file ID to reference: {str(e)}") - return None - - async def convert_data_format(self, data: Any, format: str) -> Any: - """Convert data between formats""" - try: - if format == 'json': - if isinstance(data, str): - return json.loads(data) - return json.dumps(data) - elif format == 'base64': - if isinstance(data, str): - return base64.b64encode(data.encode('utf-8')).decode('utf-8') - return base64.b64encode(data).decode('utf-8') - return data - except Exception as e: - logger.error(f"Error converting data format: {str(e)}") - return data - - async def create_agent_input_file_list(self, files: List[str]) -> List[Dict[str, Any]]: - """Create list of input files for agent processing""" - try: - input_files = [] - for file in files: - file_id = await self.convert_file_ref_to_id(file) - if file_id: - file_data = await self.get_file_metadata(file_id) - if file_data: - content = await self.get_file_content(file_id) - input_files.append({ - 'id': file_id, - 'name': file_data['name'], - 'mimeType': file_data['mimeType'], - 'content': content - }) - return input_files - except Exception as e: - logger.error(f"Error creating agent input file list: {str(e)}") - return [] - - async def save_agent_output_files(self, files: List[Dict[str, Any]]) -> List[Dict[str, Any]]: - """Save output files from agent processing""" - try: - saved_files = [] - for file in files: - file_meta = await self.save_file( - filename=file['name'], - content=file['content'], - mimeType=file.get('mimeType', 'application/octet-stream') - ) - - if file_meta: - saved_files.append({ - 'id': file_meta, - 'name': file['name'], - 'mimeType': file.get('mimeType', 'application/octet-stream') - }) - return saved_files - except Exception as e: - logger.error(f"Error saving agent output files: {str(e)}") - return [] - - async def content_with_prompt(self, document: Dict[str, Any], prompt: str) -> Optional[Dict[str, Any]]: - """Extract content using AI with specific prompt""" - try: - # Get document content - chat_doc = await self.extract_content(document.get('id')) - if not chat_doc: - return None - - # Prepare content - content = chat_doc.content - mime_type = chat_doc.mimeType - - # Process large files in chunks - if len(content) > 100000: - chunks = self._split_content_into_chunks(content, mime_type) - extracted_chunks = [] - - for chunk in chunks: - chunk_result = await self._process_content_chunk(chunk, prompt) - if chunk_result: - extracted_chunks.append(chunk_result) - - return { - "content": self._merge_chunk_results(extracted_chunks), - "metadata": { - "original_size": len(content), - "chunks_processed": len(chunks), - "mime_type": mime_type - } - } - else: - result = await self._process_content_chunk(content, prompt) - return { - "content": result, - "metadata": { - "original_size": len(content), - "chunks_processed": 1, - "mime_type": mime_type - } - } - - except Exception as e: - logger.error(f"Error in content_with_prompt: {str(e)}") - return None - - def _split_content_into_chunks(self, content: str, mime_type: str) -> List[str]: - """Split content into manageable chunks""" - try: - if mime_type.startswith('text/'): - return [chunk.strip() for chunk in content.split('\n\n') if chunk.strip()] - elif mime_type == 'application/json': - data = json.loads(content) - if isinstance(data, list): - return [json.dumps(item) for item in data] - return [content] - else: - return [content[i:i+10000] for i in range(0, len(content), 10000)] - except Exception as e: - logger.error(f"Error splitting content: {str(e)}") - return [content] - - async def _process_content_chunk(self, chunk: str, prompt: str) -> Optional[str]: - """Process content chunk with AI""" - try: - if not self.service or not self.service.base: - logger.error("Service or base interface not initialized") - return None - - ai_prompt = f""" - Extract relevant information from this content based on the following prompt: - - PROMPT: {prompt} - - CONTENT: - {chunk} - - Return ONLY the extracted information in a clear, concise format. - """ - - response = await self.service.base.callAi([ - {"role": "system", "content": "You are an expert at extracting relevant information from documents."}, - {"role": "user", "content": ai_prompt} - ]) - - return response.strip() - - except Exception as e: - logger.error(f"Error processing content chunk: {str(e)}") - return None - - def _merge_chunk_results(self, chunks: List[str]) -> str: - """Merge processed content chunks""" - try: - chunks = [chunk for chunk in chunks if chunk and chunk.strip()] - return "\n\n".join(chunks) - except Exception as e: - logger.error(f"Error merging chunk results: {str(e)}") - return "" - - async def save_document(self, document: Dict[str, Any], format: str = 'json') -> str: - """Save document to output directory""" - try: - filename = f"{document['id']}.{format}" - filepath = self.output_dir / filename - - if format == 'json': - with open(filepath, 'w', encoding='utf-8') as f: - json.dump(document, f, indent=2) - else: - content = document.get('content', '') - if isinstance(content, str): - with open(filepath, 'w', encoding='utf-8') as f: - f.write(content) - else: - with open(filepath, 'wb') as f: - f.write(content) - - return str(filepath) - - except Exception as e: - logger.error(f"Error saving document: {str(e)}") + logger.error(f"Error converting chat document to task document: {str(e)}") raise - - async def load_document(self, filepath: str) -> Dict[str, Any]: - """Load document from file""" - try: - path = Path(filepath) - if not path.exists(): - raise FileNotFoundError(f"Document not found: {filepath}") - - format = path.suffix[1:].lower() - - if format == 'json': - with open(path, 'r', encoding='utf-8') as f: - document = json.load(f) - else: - mime_type = mimetypes.guess_type(filepath)[0] - if mime_type and mime_type.startswith('text/'): - with open(path, 'r', encoding='utf-8') as f: - content = f.read() - else: - with open(path, 'rb') as f: - content = f.read() - - document = { - 'id': path.stem, - 'content': content, - 'format': format, - 'mime_type': mime_type - } - - document['metadata'] = { - 'loadedAt': datetime.now(UTC).isoformat(), - 'filepath': str(path), - 'size': path.stat().st_size - } - - return document - - except Exception as e: - logger.error(f"Error loading document: {str(e)}") - raise - - async def convert_document(self, document: Dict[str, Any], target_format: str) -> Dict[str, Any]: - """Convert document to target format""" - try: - current_format = document.get('format', 'json') - - if current_format == 'json' and target_format == 'text': - content = json.dumps(document, indent=2) - return { - 'id': document['id'], - 'content': content, - 'format': 'text', - 'mime_type': 'text/plain' - } - elif current_format == 'text' and target_format == 'json': - try: - content = json.loads(document['content']) - return { - 'id': document['id'], - 'content': content, - 'format': 'json', - 'mime_type': 'application/json' - } - except json.JSONDecodeError: - return { - 'id': document['id'], - 'content': document['content'], - 'format': 'json', - 'mime_type': 'application/json' - } - else: - raise ValueError(f"Unsupported conversion: {current_format} to {target_format}") - - except Exception as e: - logger.error(f"Error converting document: {str(e)}") - raise - - def _generate_document_id(self, document: Dict[str, Any]) -> str: - """Generate unique document ID""" - if 'content' in document: - content = str(document['content']) - return hashlib.md5(content.encode()).hexdigest() - return f"doc_{int(datetime.now(UTC).timestamp())}" - - async def cleanup(self) -> None: - """Clean up temporary files and cache""" - try: - self._clear_temp_directory() - self.document_cache.clear() - except Exception as e: - logger.error(f"Error during cleanup: {str(e)}") - -# Singleton factory for the document manager -def getDocumentManager(): - return DocumentManager.getInstance() \ No newline at end of file diff --git a/modules/workflow/managerPrompt.py b/modules/workflow/managerPrompt.py deleted file mode 100644 index 82e441cd..00000000 --- a/modules/workflow/managerPrompt.py +++ /dev/null @@ -1,182 +0,0 @@ -from typing import Dict, Any, List, Optional -import logging -import json -from datetime import datetime, UTC - -logger = logging.getLogger(__name__) - -class AIPromptManager: - """Manages AI prompts and response validation""" - - def __init__(self): - self.prompt_templates = {} - self.response_schemas = {} - self._load_templates() - - def _load_templates(self) -> None: - """Load prompt templates and schemas""" - # Basic templates - self.prompt_templates = { - "task_analysis": { - "template": """Analyze the following task and determine required actions: - Task: {task} - Context: {context} - Available Methods: {methods} - - Please provide: - 1. Main objective - 2. Required actions - 3. Required data sources - 4. Document processing requirements - 5. Expected output format - - Format your response as JSON: - {{ - "objective": "string", - "actions": [ - {{ - "method": "string", - "action": "string", - "parameters": {{ - "param1": "value1" - }} - }} - ], - "dataSources": ["string"], - "documentRequirements": ["string"], - "outputFormat": "string" - }} - """, - "schema": { - "type": "object", - "required": ["objective", "actions"], - "properties": { - "objective": {"type": "string"}, - "actions": { - "type": "array", - "items": { - "type": "object", - "required": ["method", "action"], - "properties": { - "method": {"type": "string"}, - "action": {"type": "string"}, - "parameters": {"type": "object"} - } - } - }, - "dataSources": { - "type": "array", - "items": {"type": "string"} - }, - "documentRequirements": { - "type": "array", - "items": {"type": "string"} - }, - "outputFormat": {"type": "string"} - } - } - }, - "result_analysis": { - "template": """Analyze the following task results and determine next steps: - Task Results: {results} - Workflow History: {history} - - Please provide: - 1. Task completion status - 2. Next required actions - 3. Required documents - 4. Method recommendations - - Format your response as JSON: - {{ - "isComplete": boolean, - "nextActions": ["string"], - "requiredDocuments": ["string"], - "recommendedMethods": ["string"] - }} - """, - "schema": { - "type": "object", - "required": ["isComplete"], - "properties": { - "isComplete": {"type": "boolean"}, - "nextActions": { - "type": "array", - "items": {"type": "string"} - }, - "requiredDocuments": { - "type": "array", - "items": {"type": "string"} - }, - "recommendedMethods": { - "type": "array", - "items": {"type": "string"} - } - } - } - } - } - - def generate_prompt(self, context: Dict[str, Any], examples: List[Dict]) -> str: - """Generate a context-aware prompt with few-shot examples""" - try: - # Get template - template = self.prompt_templates.get(context.get("type", "task_analysis")) - if not template: - raise ValueError(f"Unknown prompt type: {context.get('type')}") - - # Format prompt - prompt = template["template"].format( - task=context.get("task", ""), - context=json.dumps(context.get("context", {}), indent=2), - methods=json.dumps(context.get("methods", {}), indent=2), - results=json.dumps(context.get("results", {}), indent=2), - history=json.dumps(context.get("history", []), indent=2) - ) - - # Add examples if provided - if examples: - prompt += "\nExamples:\n" - for ex in examples: - prompt += f"- {ex['input']} => {ex['output']}\n" - - return prompt - - except Exception as e: - logger.error(f"Error generating prompt: {str(e)}") - raise - - def validate_response(self, response: str, schema: Dict) -> bool: - """Validate AI response against a schema""" - try: - # Parse response - if isinstance(response, str): - try: - response = json.loads(response) - except json.JSONDecodeError: - return False - - # Validate against schema - import jsonschema - jsonschema.validate(instance=response, schema=schema) - return True - - except Exception as e: - logger.error(f"Error validating response: {str(e)}") - return False - - def get_schema(self, prompt_type: str) -> Optional[Dict]: - """Get schema for prompt type""" - template = self.prompt_templates.get(prompt_type) - return template.get("schema") if template else None - - def add_template(self, name: str, template: str, schema: Dict) -> None: - """Add new prompt template""" - self.prompt_templates[name] = { - "template": template, - "schema": schema - } - - def remove_template(self, name: str) -> None: - """Remove prompt template""" - self.prompt_templates.pop(name, None) \ No newline at end of file diff --git a/modules/workflow/managerWorkflow.py b/modules/workflow/managerWorkflow.py index 86ecb1ae..f035adef 100644 --- a/modules/workflow/managerWorkflow.py +++ b/modules/workflow/managerWorkflow.py @@ -1,239 +1,147 @@ -from typing import Dict, Any, Optional, List +from typing import Dict, Any import logging -import json -import asyncio from datetime import datetime, UTC import uuid +from modules.interfaces.serviceChatModel import ( + AgentTask, AgentResult, TaskStatus, ChatMessage, + UserInputRequest, ChatWorkflow, ChatDocument +) +from modules.interfaces.serviceChatClass import ChatInterface from modules.workflow.managerChat import ChatManager -from modules.workflow.managerDocument import DocumentManager -from modules.interfaces.serviceChatModel import AgentTask, TaskStatus, ActionStatus -from modules.shared.configuration import APP_CONFIG logger = logging.getLogger(__name__) +class WorkflowStoppedException(Exception): + """Exception raised when workflow is stopped by user""" + pass + class WorkflowManager: - """Workflow manager with improved task management and error recovery""" + """Manages workflow execution lifecycle""" - def __init__(self): - self.chat_manager = ChatManager() - self.document_manager = DocumentManager() + def __init__(self, chatInterface: ChatInterface): self.workflow = None - self.context = {} - self.task_queue = asyncio.Queue() - self.active_tasks = {} - self.task_history = [] - - async def initialize(self, workflow: Any, context: Dict[str, Any]) -> None: - """Initialize workflow manager with workflow and context""" - self.workflow = workflow - self.context = context - - # Initialize managers - await self.chat_manager.initialize(workflow, context) - await self.document_manager.initialize(context) - - # Start task processor - asyncio.create_task(self._process_task_queue()) - - async def process_workflow(self, user_input: Dict[str, Any]) -> Dict[str, Any]: - """Process workflow with user input""" + self.isRunning = False + self.chatInterface = chatInterface + self.chatManager = ChatManager() + + def _checkWorkflowStopped(self, workflow: ChatWorkflow) -> None: + if workflow.status == "stopped": + logger.info(f"Workflow {workflow.id} stopped by user") + raise WorkflowStoppedException("User stopped workflow") + + async def workflowProcess(self, userInput: UserInputRequest, workflow: ChatWorkflow) -> None: + """Main workflow execution process""" try: + self.workflow = workflow + self.isRunning = True + + # Process documents from userInput using ChatInterface's method + documents = [] + if userInput.listFileId: + documents = await self.chatInterface.processFileIds(userInput.listFileId) + + # Create initial ChatMessage from userInput + initialMessage = ChatMessage( + id=str(uuid.uuid4()), + workflowId=workflow.id, + role="user", + message=userInput.prompt, + status="first", # First message in workflow + documents=documents + ) + + # Add message to workflow + await self.chatInterface.createWorkflowMessage(initialMessage.dict()) + # Create initial task - task = await self.chat_manager.create_initial_task(user_input) + task = await self.chatInterface.createInitialTask(workflow, initialMessage) + if not task: + logger.error("Failed to create initial task") + workflow.status = "error" + workflow.error = "Failed to create initial task" + return - # Add to queue - await self.task_queue.put(task) - - # Wait for completion - while not task.is_complete() and not task.has_failed(): - await asyncio.sleep(0.1) - - # Process results - if task.status == TaskStatus.SUCCESS: - return { - "status": "success", - "result": task.result, - "documents": task.documentsOutput - } - else: - return { - "status": "error", - "error": task.error, - "feedback": task.thisTaskFeedback - } - - except Exception as e: - logger.error(f"Error processing workflow: {str(e)}") - return { - "status": "error", - "error": str(e) - } - - async def _process_task_queue(self) -> None: - """Process tasks in queue""" - while True: - try: - # Get task from queue - task = await self.task_queue.get() - - # Process task - result = await self.chat_manager.process_task(task) - - # Update task status - if result["status"] == "success": - task.status = TaskStatus.SUCCESS - task.result = result.get("result") - task.documentsOutput = result.get("documents", []) - else: - task.status = TaskStatus.FAILED - task.error = result.get("error") - - # Add to history - self.task_history.append({ - "id": task.id, - "status": task.status, - "startedAt": task.startedAt, - "finishedAt": datetime.now(UTC).isoformat(), - "error": task.error - }) - - # Check for next task - if not task.is_complete(): - next_task = await self._define_next_task(task) - if next_task: - await self.task_queue.put(next_task) - - # Mark task as done - self.task_queue.task_done() - - except Exception as e: - logger.error(f"Error processing task queue: {str(e)}") - await asyncio.sleep(1) # Prevent tight loop on error - - async def _define_next_task(self, current_task: AgentTask) -> Optional[AgentTask]: - """Define next task based on current task results""" - try: - # Analyze current task - analysis = await self.chat_manager._analyze_task_results(current_task) - - # Check if next task needed - if not analysis.get("isComplete", True): - # Create next task - next_task = await self.chat_manager.create_next_task( - current_task, - analysis.get("nextActions", []), - analysis.get("requiredDocuments", []) + # Main workflow loop + while self.isRunning and workflow.status == "running": + + self._checkWorkflowStopped(workflow) + + # Execute task + result = AgentResult( + id=task.id, + status=TaskStatus.PENDING, + createdAt=datetime.now(UTC), + updatedAt=datetime.now(UTC) ) - # Add dependencies - next_task.dependencies = [current_task.id] - - return next_task - - return None - - except Exception as e: - logger.error(f"Error defining next task: {str(e)}") - return None - - async def handle_error(self, task: AgentTask, error: str) -> None: - """Handle task error with recovery strategies""" - try: - # Log error - logger.error(f"Task {task.id} failed: {error}") - - # Update task status - task.status = TaskStatus.FAILED - task.error = error - - # Check for retryable errors - if self._is_retryable_error(error): - if task.retryCount < task.retryMax: - # Retry task - task.retryCount += 1 - task.status = TaskStatus.RETRY - await self.task_queue.put(task) - return - - # Check for rollback needed - if task.rollback_on_failure: - await self._rollback_task(task) - - # Notify workflow - self.workflow.status = "error" - self.workflow.error = error - - except Exception as e: - logger.error(f"Error handling task error: {str(e)}") - - async def _rollback_task(self, task: AgentTask) -> None: - """Rollback task actions""" - try: - for action in task.actionList: - if action.status == ActionStatus.SUCCESS: - # Get method - method = self.chat_manager.service.methods.get(action.method) - if method: - # Rollback action - await method.rollback( - action.action, - action.parameters, - task.get_auth_data(action.auth_source) - ) - - except Exception as e: - logger.error(f"Error rolling back task: {str(e)}") - - def _is_retryable_error(self, error: str) -> bool: - """Check if error is retryable""" - retryable_errors = [ - "timeout", - "rate limit", - "temporary", - "connection", - "server error" - ] - return any(err in error.lower() for err in retryable_errors) - - async def cleanup(self) -> None: - """Clean up workflow resources""" - try: - # Clean up managers - await self.chat_manager.cleanup() - await self.document_manager.cleanup() - - # Clear task queue - while not self.task_queue.empty(): - self.task_queue.get_nowait() - self.task_queue.task_done() - - # Clear active tasks - self.active_tasks.clear() - - except Exception as e: - logger.error(f"Error during cleanup: {str(e)}") + # Execute each action + for action in task.actionList: - async def get_workflow_status(self, workflow_id: str) -> Dict[str, Any]: - """Get current status of workflow""" - current_task = self.chat_manager.service.tasks.get('current') - previous_task = self.chat_manager.service.tasks.get('previous') - - return { - 'workflowId': workflow_id, - 'currentTask': current_task.dict() if current_task else None, - 'previousTask': previous_task.dict() if previous_task else None, - 'status': self.chat_manager.workflow.status if self.chat_manager.workflow else None - } - - async def stop_workflow(self, workflow_id: str) -> None: - """Stop workflow execution""" - if self.chat_manager.workflow and self.chat_manager.workflow.id == workflow_id: - self.chat_manager.workflow.status = TaskStatus.STOPPED - self.chat_manager.workflow.updatedAt = datetime.now(UTC) + self._checkWorkflowStopped(workflow) + + try: + # Execute action + actionResult = await action.execute() + + # Update action status + action.status = TaskStatus.COMPLETED if actionResult.success else TaskStatus.FAILED + action.result = actionResult + + # Check for failure + if not actionResult.success: + result.status = TaskStatus.FAILED + result.error = actionResult.error + break + + except Exception as e: + logger.error(f"Action error: {str(e)}") + action.status = TaskStatus.FAILED + result.status = TaskStatus.FAILED + result.error = str(e) + break + + # Update result status + if result.status != TaskStatus.FAILED: + result.status = TaskStatus.COMPLETED + + result.updatedAt = datetime.now(UTC) + + self._checkWorkflowStopped(workflow) + + # Update workflow with result + await self.chatInterface.addTaskResult(workflow, result) + + # Get next task + task = await self.chatInterface.getNextTask(workflow) + if not task: + break + + # Check if should continue + if not await self.chatInterface.shouldContinue(workflow): + break - # Stop current task if any - current_task = self.chat_manager.service.tasks.get('current') - if current_task: - current_task.status = TaskStatus.STOPPED - current_task.updatedAt = datetime.now(UTC) \ No newline at end of file + # Generate final feedback message using ChatManager + finalFeedback = await self.chatManager.generateWorkflowFeedback(workflow) + + # Create final message with "last" status + self._checkWorkflowStopped(workflow) + finalMessage = ChatMessage( + id=str(uuid.uuid4()), + workflowId=workflow.id, + role="assistant", + message=finalFeedback, + status="last" # Last message in workflow + ) + await self.chatInterface.createWorkflowMessage(finalMessage.dict()) + + # Complete workflow + if workflow.status != "failed": + workflow.status = "completed" + workflow.lastActivity = datetime.now(UTC).isoformat() + + except Exception as e: + logger.error(f"Workflow error: {str(e)}") + if self.workflow: + self.workflow.status = "error" + self.workflow.error = str(e) \ No newline at end of file diff --git a/modules/workflow/processorDocument.py b/modules/workflow/processorDocument.py index 9e4450a2..4a72f9ea 100644 --- a/modules/workflow/processorDocument.py +++ b/modules/workflow/processorDocument.py @@ -1,4 +1,4 @@ -from typing import Dict, Any, List, Optional, Union, Tuple, TypedDict +from typing import Dict, Any, List, Optional, Union, Tuple, TypedDict, Callable, Awaitable import logging import json import os @@ -6,20 +6,18 @@ import io import base64 from datetime import datetime, UTC from pathlib import Path -import mimetypes -import hashlib -import shutil -import re -import uuid from modules.interfaces.serviceChatModel import ( - DocumentContext, - DocumentExtraction, - DocumentMetadata, - DocumentContent, - ProcessedDocument, - ImageData + ChatDocument, + TaskDocument, + ExtractedContent, + ContentItem, + ContentMetadata ) +from modules.interfaces.serviceManagementClass import ServiceManagement, getInterface +from modules.interfaces.serviceAppModel import User +from modules.neutralizer.neutralizer import DataAnonymizer +from modules.shared.configuration import APP_CONFIG logger = logging.getLogger(__name__) @@ -33,29 +31,32 @@ class FileProcessingError(Exception): pass class DocumentProcessor: - """Processes documents with context awareness""" + """Processor for handling document operations and content extraction.""" - def __init__(self): - self.supported_types = { - "text/plain": self._process_text, - "text/csv": self._process_csv, - "application/json": self._process_json, - "text/html": self._process_html, - "image/svg+xml": self._process_svg, - "application/pdf": self._process_pdf, - "application/vnd.openxmlformats-officedocument.wordprocessingml.document": self._process_docx, - "application/msword": self._process_docx, - "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": self._process_xlsx, - "application/vnd.ms-excel": self._process_xlsx, - "application/vnd.openxmlformats-officedocument.presentationml.presentation": self._process_pptx, - "application/vnd.ms-powerpoint": self._process_pptx + def __init__(self, currentUser: Optional[User] = None): + """Initialize the document processor.""" + self.serviceManagement = getInterface(currentUser) + self._neutralizer = DataAnonymizer() if APP_CONFIG.get("ENABLE_CONTENT_NEUTRALIZATION", False) else None + self.supportedTypes: Dict[str, Callable[[Union[ChatDocument, TaskDocument]], Awaitable[List[ContentItem]]]] = { + 'text/plain': self._processText, + 'text/csv': self._processCsv, + 'application/json': self._processJson, + 'application/xml': self._processXml, + 'text/html': self._processHtml, + 'image/svg+xml': self._processSvg, + 'image/jpeg': self._processImage, + 'image/png': self._processImage, + 'image/gif': self._processImage, + 'application/pdf': self._processPdf, + 'application/vnd.openxmlformats-officedocument.wordprocessingml.document': self._processDocx, + 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet': self._processXlsx } - - # Add image types - for img_type in ["image/jpeg", "image/png", "image/gif", "image/bmp", "image/tiff"]: - self.supported_types[img_type] = self._process_image - - def _load_pdf_extractor(self): + + def initialize(self) -> None: + """Initialize the document processor.""" + pass + + def _loadPdfExtractor(self): """Loads PDF extraction libraries when needed""" global pdfExtractorLoaded if not pdfExtractorLoaded: @@ -67,8 +68,8 @@ class DocumentProcessor: logger.info("PDF extraction libraries successfully loaded") except ImportError as e: logger.warning(f"PDF extraction libraries could not be loaded: {e}") - - def _load_office_extractor(self): + + def _loadOfficeExtractor(self): """Loads Office document extraction libraries when needed""" global officeExtractorLoaded if not officeExtractorLoaded: @@ -80,8 +81,8 @@ class DocumentProcessor: logger.info("Office extraction libraries successfully loaded") except ImportError as e: logger.warning(f"Office extraction libraries could not be loaded: {e}") - - def _load_image_processor(self): + + def _loadImageProcessor(self): """Loads image processing libraries when needed""" global imageProcessorLoaded if not imageProcessorLoaded: @@ -92,50 +93,95 @@ class DocumentProcessor: logger.info("Image processing libraries successfully loaded") except ImportError as e: logger.warning(f"Image processing libraries could not be loaded: {e}") - - def process_with_context(self, doc: Dict[str, Any], context: DocumentContext) -> ProcessedDocument: - """Process document with context""" + + async def processDocument(self, document: TaskDocument, prompt: str) -> ExtractedContent: + """ + Process a document and extract its contents with AI processing. + + Args: + document: The document to process + prompt: Prompt for AI content extraction + + Returns: + ExtractedContent containing the processed content + + Raises: + FileProcessingError: If document processing fails + """ try: # Get content type - content_type = doc.get("contentType", "text/plain") - if content_type == "application/octet-stream": + contentType = document.mimeType + if contentType == "application/octet-stream": # Try to detect actual file type - content_type = self._detect_content_type(doc) - - if content_type not in self.supported_types: + contentType = self._detectContentType(document) + + if contentType not in self.supportedTypes: # Fallback to binary processing - return self._process_binary(doc, context) - - # Process document - processor = self.supported_types[content_type] - extracted = processor(doc, context) + contentItems = await self._processBinary(document) + else: + # Process document based on type + processor = self.supportedTypes[contentType] + contentItems = await processor(document) - # Track extraction - self._track_extraction(doc, extracted, context) + # Process with AI if prompt provided + if prompt and contentItems: + try: + # Process each content item with AI + processedItems = [] + for item in contentItems: + # Neutralize content if neutralizer is enabled + contentToProcess = item.data + if self._neutralizer and contentToProcess: + contentToProcess = self._neutralizer.neutralize(contentToProcess) + + # Create AI prompt for this content + aiPrompt = f""" + Extract relevant information from this content based on the following prompt: + + PROMPT: {prompt} + + CONTENT: + {contentToProcess} + + Return ONLY the extracted information in a clear, concise format. + """ + + # Get AI response + response = await self.serviceManagement.callAi([ + {"role": "system", "content": "You are an expert at extracting relevant information from documents."}, + {"role": "user", "content": aiPrompt} + ]) + + # Update content with AI processed data + processedItems.append(ContentItem( + label=item.label, + data=response.strip(), + metadata=item.metadata + )) + + contentItems = processedItems + + except Exception as e: + logger.error(f"Error processing content with AI: {str(e)}") - # Create ProcessedDocument - return ProcessedDocument( - id=doc.get("id", str(uuid.uuid4())), - name=doc.get("name", "Unknown"), - contentType=content_type, - content=extracted, - context=context + return ExtractedContent( + objectId=document.id, + objectType="TaskDocument", + contents=contentItems ) except Exception as e: logger.error(f"Error processing document: {str(e)}") - # Fallback to binary processing - return self._process_binary(doc, context) - - def _detect_content_type(self, doc: Dict[str, Any]) -> str: + raise FileProcessingError(f"Failed to process document: {str(e)}") + + def _detectContentType(self, document: Union[ChatDocument, TaskDocument]) -> str: """Detect content type from file content""" try: # Check file extension first - file_name = doc.get("name", "") - ext = os.path.splitext(file_name)[1].lower() + ext = os.path.splitext(document.filename)[1].lower() if ext: # Map common extensions to MIME types - ext_to_mime = { + extToMime = { '.txt': 'text/plain', '.md': 'text/markdown', '.csv': 'text/csv', @@ -145,7 +191,6 @@ class DocumentProcessor: '.py': 'application/x-python', '.svg': 'image/svg+xml', '.jpg': 'image/jpeg', - '.jpeg': 'image/jpeg', '.png': 'image/png', '.gif': 'image/gif', '.pdf': 'application/pdf', @@ -156,449 +201,450 @@ class DocumentProcessor: '.pptx': 'application/vnd.openxmlformats-officedocument.presentationml.presentation', '.ppt': 'application/vnd.ms-powerpoint' } - if ext in ext_to_mime: - return ext_to_mime[ext] - + if ext in extToMime: + return extToMime[ext] + # Try to detect if it's text content - content = doc.get("content", "") - if isinstance(content, bytes): + if isinstance(document, TaskDocument): try: + content = base64.b64decode(document.data) content.decode('utf-8') return 'text/plain' except UnicodeDecodeError: pass - + return 'application/octet-stream' except Exception as e: logger.error(f"Error detecting content type: {str(e)}") return 'application/octet-stream' - - def _process_text(self, doc: Dict[str, Any], context: DocumentContext) -> DocumentContent: + + async def _processText(self, document: Union[ChatDocument, TaskDocument]) -> List[ContentItem]: """Process text document""" - content = doc.get("content", "") - if isinstance(content, bytes): - try: + try: + if isinstance(document, TaskDocument): + content = base64.b64decode(document.data).decode('utf-8') + else: + content = self.serviceManagement.getFileData(document.fileId) + if content is None: + raise FileProcessingError(f"Could not get file data for {document.fileId}") content = content.decode('utf-8') - except UnicodeDecodeError: - # Try alternative encodings - for encoding in ['latin-1', 'cp1252', 'iso-8859-1']: - try: - content = content.decode(encoding) - break - except UnicodeDecodeError: - continue - - sections = self._extract_sections(content) - return DocumentContent( - text=content, - metadata=DocumentMetadata( - type="text", - format="text", - size=len(content.encode('utf-8')), - sections=sections - ) - ) - - def _process_csv(self, doc: Dict[str, Any], context: DocumentContext) -> DocumentContent: + + return [ContentItem( + label="main", + data=content, + metadata=ContentMetadata( + size=len(content.encode('utf-8')), + pages=1 + ) + )] + except Exception as e: + logger.error(f"Error processing text document: {str(e)}") + raise FileProcessingError(f"Failed to process text document: {str(e)}") + + async def _processCsv(self, document: Union[ChatDocument, TaskDocument]) -> List[ContentItem]: """Process CSV document""" - content = doc.get("content", "") - if isinstance(content, bytes): - content = content.decode('utf-8') + try: + if isinstance(document, TaskDocument): + content = base64.b64decode(document.data).decode('utf-8') + else: + content = self.serviceManagement.getFileData(document.fileId) + if content is None: + raise FileProcessingError(f"Could not get file data for {document.fileId}") + content = content.decode('utf-8') - return DocumentContent( - text=content, - metadata=DocumentMetadata( - type="csv", - format="csv", - size=len(content.encode('utf-8')), - sections=[f"Row {i+1}" for i in range(len(content.splitlines()))] - ) - ) - - def _process_json(self, doc: Dict[str, Any], context: DocumentContext) -> DocumentContent: + return [ContentItem( + label="main", + data=content, + metadata=ContentMetadata( + size=len(content.encode('utf-8')), + pages=1 + ) + )] + except Exception as e: + logger.error(f"Error processing CSV document: {str(e)}") + raise FileProcessingError(f"Failed to process CSV document: {str(e)}") + + async def _processJson(self, document: Union[ChatDocument, TaskDocument]) -> List[ContentItem]: """Process JSON document""" - content = doc.get("content", {}) - if isinstance(content, str): - content = json.loads(content) - elif isinstance(content, bytes): - content = json.loads(content.decode('utf-8')) + try: + if isinstance(document, TaskDocument): + content = base64.b64decode(document.data).decode('utf-8') + else: + content = self.serviceManagement.getFileData(document.fileId) + if content is None: + raise FileProcessingError(f"Could not get file data for {document.fileId}") + content = content.decode('utf-8') - structure = self._analyze_structure(content) - return DocumentContent( - data=content, - metadata=DocumentMetadata( - type="json", - format="json", - size=len(json.dumps(content).encode('utf-8')), - sections=list(content.keys()) if isinstance(content, dict) else [] - ) - ) - - def _process_html(self, doc: Dict[str, Any], context: DocumentContext) -> DocumentContent: + # Parse JSON to validate + jsonData = json.loads(content) + + return [ContentItem( + label="main", + data=content, + metadata=ContentMetadata( + size=len(content.encode('utf-8')), + pages=1 + ) + )] + except Exception as e: + logger.error(f"Error processing JSON document: {str(e)}") + raise FileProcessingError(f"Failed to process JSON document: {str(e)}") + + async def _processXml(self, document: Union[ChatDocument, TaskDocument]) -> List[ContentItem]: + """Process XML document""" + try: + if isinstance(document, TaskDocument): + content = base64.b64decode(document.data).decode('utf-8') + else: + content = self.serviceManagement.getFileData(document.fileId) + if content is None: + raise FileProcessingError(f"Could not get file data for {document.fileId}") + content = content.decode('utf-8') + + return [ContentItem( + label="main", + data=content, + metadata=ContentMetadata( + size=len(content.encode('utf-8')), + pages=1 + ) + )] + except Exception as e: + logger.error(f"Error processing XML document: {str(e)}") + raise FileProcessingError(f"Failed to process XML document: {str(e)}") + + async def _processHtml(self, document: Union[ChatDocument, TaskDocument]) -> List[ContentItem]: """Process HTML document""" - content = doc.get("content", "") - if isinstance(content, bytes): - content = content.decode('utf-8') + try: + if isinstance(document, TaskDocument): + content = base64.b64decode(document.data).decode('utf-8') + else: + content = self.serviceManagement.getFileData(document.fileId) + if content is None: + raise FileProcessingError(f"Could not get file data for {document.fileId}") + content = content.decode('utf-8') - return DocumentContent( - text=content, - metadata=DocumentMetadata( - type="html", - format="html", - size=len(content.encode('utf-8')), - sections=[ - self._extract_title(content) or "Untitled", - *self._extract_links(content), - *self._extract_images(content) - ] - ) - ) - - def _process_svg(self, doc: Dict[str, Any], context: DocumentContext) -> DocumentContent: + return [ContentItem( + label="main", + data=content, + metadata=ContentMetadata( + size=len(content.encode('utf-8')), + pages=1 + ) + )] + except Exception as e: + logger.error(f"Error processing HTML document: {str(e)}") + raise FileProcessingError(f"Failed to process HTML document: {str(e)}") + + async def _processSvg(self, document: Union[ChatDocument, TaskDocument]) -> List[ContentItem]: """Process SVG document""" - content = doc.get("content", "") - if isinstance(content, bytes): - content = content.decode('utf-8') + try: + if isinstance(document, TaskDocument): + content = base64.b64decode(document.data).decode('utf-8') + else: + content = self.serviceManagement.getFileData(document.fileId) + if content is None: + raise FileProcessingError(f"Could not get file data for {document.fileId}") + content = content.decode('utf-8') - # Check if it's actually SVG - is_svg = "