diff --git a/modules/_backup_chat_agent_coder copy.py b/modules/_backup_chat_agent_coder copy.py deleted file mode 100644 index 98b16a6a..00000000 --- a/modules/_backup_chat_agent_coder copy.py +++ /dev/null @@ -1,814 +0,0 @@ -""" -Coder agent for development and execution of Python code. -Optimized for the new task-based processing. -""" - -import logging -import json -import re -import uuid -import os -import subprocess -import tempfile -import shutil -import sys -from typing import Dict, Any, List, Optional, Tuple - -from modules.chat_registry import AgentBase - -logger = logging.getLogger(__name__) - - -class AgentCoder(AgentBase): - """Agent for development and execution of Python code""" - - def __init__(self): - """Initialize the coder agent""" - super().__init__() - self.name = "coder" - self.description = "Develops and executes Python code for data processing and automation" - self.capabilities = [ - "code_development", - "data_processing", - "file_processing", - "automation", - "code_execution" - ] - - # Executor settings - self.executor_timeout = 60 # seconds - self.executor_memory_limit = 512 # MB - - # AI service settings - self.ai_temperature = 0.1 # Lower temperature for deterministic code generation - - # Auto-correction settings - self.max_correction_attempts = 3 # Maximum number of correction attempts - - def set_dependencies(self, ai_service=None): - """Set external dependencies for the agent.""" - self.ai_service = ai_service - - async def process_task(self, task: Dict[str, Any]) -> Dict[str, Any]: - """ - Process a standardized task structure and perform code development/execution. - - Args: - task: A dictionary containing: - - task_id: Unique ID for this task - - prompt: The main instruction for the agent - - input_documents: List of documents to process - - output_specifications: List of required output documents - - context: Additional contextual information - - Returns: - A dictionary containing: - - feedback: Text response explaining the code execution - - documents: List of created document objects - """ - try: - # Extract relevant task information - prompt = task.get("prompt", "") - input_documents = task.get("input_documents", []) - output_specs = task.get("output_specifications", []) - context_info = task.get("context", {}) - - # Check if AI service is available - if not self.ai_service: - logger.error("No AI service configured for the Coder agent") - return { - "feedback": "The Coder agent is not properly configured.", - "documents": [] - } - - # Extract context from input documents - document_context = self._extract_document_context(input_documents) - - # Generate code based on the prompt and document context - logger.info("Generating code based on the task") - code_to_execute, requirements = await self._generate_code_from_prompt(prompt, document_context) - - if not code_to_execute: - logger.warning("AI couldn't generate any code") - return { - "feedback": "I couldn't generate executable code based on the task. Please provide more detailed instructions.", - "documents": [] - } - - logger.info(f"Code generated with AI ({len(code_to_execute)} characters)") - - # Collect created documents - generated_documents = [] - - # Add code as first document - code_doc = { - "label": "generated_code.py", - "content": code_to_execute - } - generated_documents.append(code_doc) - - # Execute code with auto-correction loop - execution_context = { - "input_documents": input_documents, - "task": task - } - - # Enhanced execution with auto-correction - result, attempts_info = await self._execute_with_auto_correction( - code_to_execute, - requirements, - execution_context, - prompt # Original prompt/message - ) - - # Create output documents based on execution result and output specifications - if result.get("success", False): - # Code execution successful - output = result.get("output", "") - execution_result = result.get("result") - logger.info("Code executed successfully") - - # Determine output type of the result - result_docs = self._generate_result_documents( - attempts_info[-1]["code"], # Last successful code - output, - execution_result, - output_specs - ) - - # Add result documents - generated_documents.extend(result_docs) - - # Create feedback for successful execution - feedback = f"I successfully executed the code and generated {len(result_docs)} output files." - if attempts_info and len(attempts_info) > 1: - feedback += f" (This required {len(attempts_info)-1} correction attempts)" - - else: - # Code execution failed after all attempts - error = result.get("error", "Unknown error") - logger.error(f"Error in code execution after all correction attempts: {error}") - - # Add error log as additional document - error_doc = { - "label": "execution_error.txt", - "content": f"Execution error:\n\n{error}" - } - generated_documents.append(error_doc) - - # Create feedback for failed execution - feedback = f"An error occurred during code execution after {len(attempts_info)} correction attempts." - - # If no specific outputs requested, create standard outputs - if not output_specs and result.get("success", False): - # Add standard output document - output_doc = { - "label": "execution_output.txt", - "content": output - } - generated_documents.append(output_doc) - - # If a result is available, also add as JSON document - if execution_result: - result_json = json.dumps(execution_result, indent=2) if isinstance(execution_result, (dict, list)) else str(execution_result) - result_doc = { - "label": "execution_result.json", - "content": result_json - } - generated_documents.append(result_doc) - - return { - "feedback": feedback, - "documents": generated_documents - } - - except Exception as e: - error_msg = f"Error during processing by the Coder agent: {str(e)}" - logger.error(error_msg) - return { - "feedback": f"An error occurred during code processing: {str(e)}", - "documents": [] - } - - def _extract_document_context(self, documents: List[Dict[str, Any]]) -> str: - """ - Extract context from input documents for code generation. - - Args: - documents: List of document objects - - Returns: - Extracted context as text - """ - context_parts = [] - - for doc in documents: - doc_name = doc.get("name", "Unnamed document") - context_parts.append(f"--- {doc_name} ---") - - for content in doc.get("contents", []): - if content.get("metadata", {}).get("is_text", False): - context_parts.append(content.get("data", "")) - - return "\n\n".join(context_parts) - - def _generate_result_documents(self, code: str, output: str, execution_result: Any, - output_specs: List[Dict[str, Any]]) -> List[Dict[str, Any]]: - """ - Generate output documents based on execution results and specifications. - - Args: - code: Executed code - output: Text output of the execution - execution_result: Result object from execution - output_specs: Output specifications - - Returns: - List of generated document objects - """ - documents = [] - - # If no specific outputs requested - if not output_specs: - return documents - - # Generate appropriate document for each requested output - for spec in output_specs: - output_label = spec.get("label", "") - output_description = spec.get("description", "") - - # Determine output type based on file extension - format_type = self._determine_format_type(output_label) - - # Generate document content based on format and output - if "code" in output_label.lower() or format_type in ["py", "js", "html", "css"]: - # Code document - documents.append({ - "label": output_label, - "content": code - }) - elif "output" in output_label.lower() or format_type == "txt": - # Output document - documents.append({ - "label": output_label, - "content": output - }) - elif format_type in ["json", "yml", "yaml"] and execution_result: - # JSON result document - if isinstance(execution_result, (dict, list)): - content = json.dumps(execution_result, indent=2) - else: - content = str(execution_result) - - documents.append({ - "label": output_label, - "content": content - }) - else: - # Generic result document (fallback) - result_str = "" - if execution_result: - if isinstance(execution_result, (dict, list)): - result_str = json.dumps(execution_result, indent=2) - else: - result_str = str(execution_result) - - documents.append({ - "label": output_label, - "content": f"Code output:\n\n{output}\n\nResult:\n\n{result_str}" - }) - - return documents - - def _determine_format_type(self, output_label: str) -> str: - """ - Determine the format type based on the filename. - - Args: - output_label: Output filename - - Returns: - Format type (py, js, json, txt, etc.) - """ - if not '.' in output_label: - return "txt" # Default format - - extension = output_label.split('.')[-1].lower() - return extension - - async def _execute_with_auto_correction( - self, - initial_code: str, - requirements: List[str], - context: Dict[str, Any], - original_prompt: str - ) -> Tuple[Dict[str, Any], List[Dict[str, Any]]]: - """ - Execute code with automatic error correction and retry attempts. - - Args: - initial_code: The initial Python code - requirements: List of required packages - context: Additional context for execution - original_prompt: The original user request/prompt - - Returns: - Tuple of (final execution result, list of attempt info dictionaries) - """ - # Initialize tracking data - current_code = initial_code - current_requirements = requirements.copy() if requirements else [] - attempts_info = [] - - # Execute with correction loop - for attempt in range(1, self.max_correction_attempts + 1): - if attempt == 1: - logger.info(f"Executing code (attempt {attempt}/{self.max_correction_attempts})") - else: - logger.info(f"Executing corrected code (attempt {attempt}/{self.max_correction_attempts})") - - # Execute current code version - result = await self._execute_code(current_code, current_requirements, context) - - # Record attempt information - attempts_info.append({ - "attempt": attempt, - "code": current_code, - "error": result.get("error", ""), - "success": result.get("success", False) - }) - - # Check if execution was successful - if result.get("success", False): - # Success! Return result and attempt info - return result, attempts_info - - # Failed execution - check if max attempt limit reached - if attempt >= self.max_correction_attempts: - logger.warning(f"Maximum correction attempts ({self.max_correction_attempts}) reached") - break - - # Correct code based on the error - error_message = result.get("error", "Unknown error") - - logger.info(f"Attempting to fix code error: {error_message[:200]}...") - - # Generate corrected code - corrected_code, new_requirements = await self._generate_code_correction( - current_code, - error_message, - original_prompt, - current_requirements - ) - - # Update for next attempt - if corrected_code: - current_code = corrected_code - - # Add new requirements - if new_requirements: - for req in new_requirements: - if req not in current_requirements: - current_requirements.append(req) - logger.info(f"Added new requirement: {req}") - else: - # Correction couldn't be generated, end loop - logger.warning("Couldn't generate code correction") - break - - # If we reach here, all attempts failed - return last result and attempt info - return result, attempts_info - - async def _generate_code_correction( - self, - code: str, - error_message: str, - original_prompt: str, - current_requirements: List[str] = None - ) -> Tuple[str, List[str]]: - """ - Generate a corrected version of code based on error messages. - - Args: - code: The code that generated errors - error_message: The error message to fix - original_prompt: The original task/requirements - current_requirements: List of currently required packages - - Returns: - Tuple of (corrected code, new requirements list) - """ - try: - # Create detailed prompt for code correction - correction_prompt = f"""You need to fix an error in Python code. The code was written for this task: - -ORIGINAL TASK: -{original_prompt} - -CURRENT CODE: -```python -{code} -``` - -ERROR MESSAGE: -``` -{error_message} -``` - -CURRENT REQUIREMENTS: {', '.join(current_requirements) if current_requirements else "None"} - -Your task is to analyze the error and provide a corrected version of the code. -Focus specifically on fixing the error while maintaining the original functionality. - -Common fixes include: -- Fixing syntax errors (missing parentheses, indentation, etc.) -- Solving import errors by adding appropriate requirements -- Correcting file paths or handling "file not found" errors -- Adding error handling for specific edge cases -- Fixing logical errors in the code - -FORMATTING GUIDELINES: -1. Provide ONLY the complete corrected Python code WITHOUT explanations -2. Do NOT use code block markers like ```python or ``` -3. Do NOT explain what the code does before or after -4. Do NOT add any text that isn't valid Python code -5. Start your answer directly with valid Python code -6. End your answer with valid Python code - -If you need to add new required packages, place them in a specially formatted comment at the beginning of your code as follows: -# REQUIREMENTS: package1,package2,package3 - -Your entire answer must be valid Python that can be executed without modifications. -""" - - # Create messages for API - messages = [ - {"role": "system", "content": "You are a Python debugging expert. You provide ONLY clean, error-free Python code, without explanations, markdown formatting, or text that isn't code."}, - {"role": "user", "content": correction_prompt} - ] - - # Call API with very low temperature for deterministic corrections - generated_content = await self.ai_service.call_api( - messages, - temperature=0.1 - ) - - # Clean up the generated content to ensure it's only valid Python code - fixed_code = self._clean_code(generated_content) - - # Extract requirements from special comment at beginning of code - new_requirements = [] - for line in fixed_code.split('\n'): - if line.strip().startswith("# REQUIREMENTS:"): - req_str = line.replace("# REQUIREMENTS:", "").strip() - new_requirements = [r.strip() for r in req_str.split(',') if r.strip()] - break - - return fixed_code, new_requirements - - except Exception as e: - logging.error(f"Error generating code correction: {str(e)}") - # Return None to indicate failure - return None, [] - - def _clean_code(self, code: str) -> str: - """ - Clean code by removing markdown code block markers and other formatting artifacts. - - Args: - code: The code string to clean - - Returns: - Cleaned code string - """ - # Remove code block markers at beginning/end - code = re.sub(r'^```(?:python)?\s*', '', code) - code = re.sub(r'```\s*$', '', code) - - # Process lines in reverse order to start from the end - lines = code.split('\n') - clean_lines = [] - in_trailing_markdown = False - - for line in reversed(lines): - stripped = line.strip() - - # Check if this line contains only backticks (``` or ` or ``) - if re.match(r'^`{1,3}$', stripped): - in_trailing_markdown = True - continue - - # If we've reached actual code, no more trailing markdown consideration - if stripped and not in_trailing_markdown: - in_trailing_markdown = False - - # Add this line if it's not part of trailing markdown - if not in_trailing_markdown: - clean_lines.insert(0, line) - - # Rejoin lines - clean_code = '\n'.join(clean_lines) - - # Final cleanup for any remaining backticks - clean_code = re.sub(r'`{1,3}\s*', '', clean_code) - - return clean_code.strip() - - async def _generate_code_from_prompt(self, prompt: str, document_context: str) -> Tuple[str, List[str]]: - """ - Generate Python code from a prompt using the AI service. - - Args: - prompt: The prompt to generate code from - document_context: Context extracted from documents - - Returns: - Tuple of (generated Python code, required packages) - """ - try: - # Prepare prompt for code generation - ai_prompt = f"""Generate Python code to solve the following task: - -TASK: -{prompt} - -PROVIDED CONTEXT: -{document_context if document_context else "No additional context available."} - -IMPORTANT REQUIREMENTS: -1. Your code MUST define a 'result' variable to store the final result. -2. At the end of your script, the result variable should be output. -3. Make your 'result' variable a dictionary or other JSON-serializable data structure containing all relevant outputs. -4. Comment your code well to explain important operations. -5. Make your code complete and self-contained. -6. Add appropriate error handling. - -FORMATTING INSTRUCTIONS: -- Return ONLY the Python code, WITHOUT introduction, explanation, or conclusion text -- Do NOT use code block markers like ```python or ``` -- Do NOT explain what the code does before or after -- Do NOT add any text that isn't valid Python code -- Start your answer directly with valid Python code -- End your answer with valid Python code - -For required packages, place them in a specially formatted comment at the beginning of your code in one line as follows: -# REQUIREMENTS: pandas,numpy,matplotlib,requests - -Your entire answer must be valid Python that can be executed without modifications. -""" - - # Create messages for API - messages = [ - {"role": "system", "content": "You are a Python code generator who provides ONLY clean, executable Python code with no explanations, markdown formatting, or non-code text."}, - {"role": "user", "content": ai_prompt} - ] - - # Call API - logging.info(f"Calling AI API to generate code") - generated_content = await self.ai_service.call_api(messages, temperature=self.ai_temperature) - - # Clean up the generated content to ensure it's only valid Python code - code = self._clean_code(generated_content) - - # Extract requirements from special comment at beginning of code - requirements = [] - for line in code.split('\n'): - if line.strip().startswith("# REQUIREMENTS:"): - req_str = line.replace("# REQUIREMENTS:", "").strip() - requirements = [r.strip() for r in req_str.split(',') if r.strip()] - break - - return code, requirements - - except Exception as e: - logging.error(f"Error generating code with AI: {str(e)}") - # Return basic error handling code and no requirements - error_str = str(e).replace('"', '\\"') - return f""" -# Error in code generation -print(f"An error occurred during code generation: {error_str}") -# Return error result -result = {{"error": "Code generation failed", "message": "{error_str}"}} -""", [] - - async def _execute_code(self, code: str, requirements: List[str] = None, context: Dict[str, Any] = None) -> Dict[str, Any]: - """ - Execute Python code in an isolated environment. - - Args: - code: The Python code to execute - requirements: List of required packages - context: Additional context for execution - - Returns: - Result of code execution - """ - # Use virtual code executor for isolated execution - try: - executor = SimpleCodeExecutor( - timeout=self.executor_timeout, - max_memory_mb=self.executor_memory_limit, - requirements=requirements, - ai_service=self.ai_service - ) - - # Prepare input data for the code - input_data = {"context": context} if context else {} - - # Execute code - result = executor.execute_code(code, input_data) - - # Clean up environment - executor.cleanup() - - return result - - except Exception as e: - error_message = f"Error during code execution: {str(e)}" - logger.error(error_message) - - return { - "success": False, - "output": "", - "error": error_message, - "result": None - } - - -class SimpleCodeExecutor: - """ - A simplified executor that runs Python code in isolated virtual environments. - """ - - def __init__(self, - timeout: int = 30, - max_memory_mb: int = 512, - requirements: List[str] = None, - ai_service = None): - """ - Initialize the SimpleCodeExecutor. - - Args: - timeout: Maximum execution time in seconds - max_memory_mb: Maximum memory in MB - requirements: List of packages to install - ai_service: Optional - AI service for further processing - """ - self.timeout = timeout - self.max_memory_mb = max_memory_mb - self.temp_dir = None - self.requirements = requirements or [] - self.blocked_packages = [ - "cryptography", "flask", "django", "tornado", # Security risks - "tensorflow", "pytorch", "scikit-learn" # Resource-intensive packages - ] - self.ai_service = ai_service - - def _create_venv(self) -> str: - """Create a virtual environment and return the path.""" - # Create new environment - venv_parent_dir = tempfile.mkdtemp(prefix="code_exec_") - self.temp_dir = venv_parent_dir - venv_path = os.path.join(venv_parent_dir, "venv") - - try: - # Create virtual environment - subprocess.run([sys.executable, "-m", "venv", venv_path], - check=True, - capture_output=True) - - return venv_path - except subprocess.CalledProcessError as e: - logger.error(f"Error creating virtual environment: {e}") - raise RuntimeError(f"Virtual environment could not be created: {e}") - - def _get_python_executable(self, venv_path: str) -> str: - """Return the path to the Python executable in the virtual environment.""" - if os.name == 'nt': # Windows - return os.path.join(venv_path, "Scripts", "python.exe") - else: # Unix/Linux - return os.path.join(venv_path, "bin", "python") - - def execute_code(self, code: str, input_data: Dict[str, Any] = None) -> Dict[str, Any]: - """ - Execute Python code in an isolated environment. - - Args: - code: Python code to execute - input_data: Optional input data for the code - - Returns: - Dictionary with execution results - """ - logger.info("Executing code in isolated environment") - - # Create virtual environment - venv_path = self._create_venv() - - # Create file for the code - code_id = uuid.uuid4().hex[:8] - code_file = os.path.join(self.temp_dir, f"code_{code_id}.py") - - # Write code - with open(code_file, "w", encoding="utf-8") as f: - f.write(code) - - # Get Python executable - python_executable = self._get_python_executable(venv_path) - logger.info(f"Using Python executable: {python_executable}") - - # Execute code - try: - # Execute code from root directory - working_dir = os.path.dirname(code_file) - process = subprocess.run( - [python_executable, code_file], - timeout=self.timeout, - capture_output=True, - text=True, - cwd=working_dir - ) - - # Process output - stdout = process.stdout - stderr = process.stderr - - # Get result from stdout if available - result_data = None - if process.returncode == 0 and stdout: - try: - # Look for the last line that could be JSON - for line in reversed(stdout.strip().split('\n')): - line = line.strip() - if line and line[0] in '{[' and line[-1] in '}]': - try: - result_data = json.loads(line) - # Use successfully parsed JSON result - break - except json.JSONDecodeError: - # Not valid JSON, continue with next line - continue - except Exception as e: - logger.warning(f"Error parsing result from stdout: {str(e)}") - - # Create result dictionary - execution_result = { - "success": process.returncode == 0, - "output": stdout, - "error": stderr if process.returncode != 0 else "", - "result": result_data, - "exit_code": process.returncode - } - - except subprocess.TimeoutExpired: - logger.error(f"Execution timed out after {self.timeout} seconds") - execution_result = { - "success": False, - "output": "", - "error": f"Execution timed out (timeout after {self.timeout} seconds)", - "result": None, - "exit_code": -1 - } - except Exception as e: - logger.error(f"Execution error: {str(e)}") - execution_result = { - "success": False, - "output": "", - "error": f"Execution error: {str(e)}", - "result": None, - "exit_code": -1 - } - - # Clean up temporary code file - try: - if os.path.exists(code_file): - os.remove(code_file) - except Exception as e: - logger.warning(f"Error cleaning up temporary code file: {e}") - - return execution_result - - def cleanup(self): - """Clean up temporary resources.""" - # Clean up temporary directory - if self.temp_dir and os.path.exists(self.temp_dir): - try: - shutil.rmtree(self.temp_dir) - logger.info(f"Temporary directory deleted: {self.temp_dir}") - except Exception as e: - logger.warning(f"Temporary directory {self.temp_dir} could not be deleted: {e}") - - def __del__(self): - """Cleanup during garbage collection.""" - self.cleanup() - - -# Factory function for the Coder agent -def get_coder_agent(): - """ - Factory function that returns an instance of the Coder agent. - - Returns: - An instance of the Coder agent - """ - return AgentCoder() \ No newline at end of file diff --git a/modules/_backup_lucydom_interface copy.py b/modules/_backup_lucydom_interface copy.py deleted file mode 100644 index f8607580..00000000 --- a/modules/_backup_lucydom_interface copy.py +++ /dev/null @@ -1,1183 +0,0 @@ -import logging -import uuid -from datetime import datetime -from typing import Dict, Any, List, Optional, Union - -import importlib -import hashlib - -from connectors.connector_db_json import DatabaseConnector -from modules.configuration import APP_CONFIG - -logger = logging.getLogger(__name__) - -# Custom exceptions for file handling -class FileError(Exception): - """Base class for file handling exceptions.""" - pass - -class FileNotFoundError(FileError): - """Exception raised when a file is not found.""" - pass - -class FileStorageError(FileError): - """Exception raised when there's an error storing a file.""" - pass - -class FilePermissionError(FileError): - """Exception raised when there's a permission issue with a file.""" - pass - -class FileDeletionError(FileError): - """Exception raised when there's an error deleting a file.""" - pass - - -class LucyDOMInterface: - """ - Interface zur LucyDOM-Datenbank. - Verwendet den JSON-Konnektor für den Datenzugriff. - """ - - def __init__(self, mandate_id: int, user_id: int): - """ - Initialisiert das LucyDOM-Interface mit Mandanten- und Benutzerkontext. - - Args: - mandate_id: ID des aktuellen Mandanten - user_id: ID des aktuellen Benutzers - """ - self.mandate_id = mandate_id - self.user_id = user_id - - # Datenmodell-Modul importieren - try: - self.model_module = importlib.import_module("modules.lucydom_model") - logger.info("lucydom_model erfolgreich importiert") - except ImportError as e: - logger.error(f"Fehler beim Importieren von lucydom_model: {e}") - raise - - # Datenbank initialisieren, falls nötig - self._initialize_database() - - def _initialize_database(self): - """ - Initialisiert die Datenbank mit minimalen Objekten für den angemeldeten Benutzer im Mandanten, falls sie noch nicht existiert. - Ohne gültigen Benutzer keine Initialisierung. - Erstellt für jede im Datenmodell definierte Tabelle einen initialen Datensatz. - """ - effective_mandate_id = self.mandate_id - effective_user_id = self.user_id - if effective_mandate_id is None or effective_user_id is None: - #data available - return - - self.db = DatabaseConnector( - db_host=APP_CONFIG.get("DB_LUCYDOM_HOST"), - db_database=APP_CONFIG.get("DB_LUCYDOM_DATABASE"), - db_user=APP_CONFIG.get("DB_LUCYDOM_USER"), - db_password=APP_CONFIG.get("DB_LUCYDOM_PASSWORD_SECRET"), - mandate_id=self.mandate_id, - user_id=self.user_id - ) - - # Initialisierung von Standard-Prompts für verschiedene Bereiche - prompts = self.db.get_recordset("prompts") - if not prompts: - logger.info("Erstelle Standard-Prompts") - - # Standard-Prompts definieren - standard_prompts = [ - { - "mandate_id": effective_mandate_id, - "user_id": effective_user_id, - "content": "Recherchiere die aktuellen Markttrends und Entwicklungen im Bereich [THEMA]. Sammle Informationen zu führenden Unternehmen, innovativen Produkten oder Dienstleistungen und aktuellen Herausforderungen. Präsentiere die Ergebnisse in einer strukturierten Übersicht mit relevanten Daten und Quellen.", - "name": "Web Research: Marktforschung" - }, - { - "mandate_id": effective_mandate_id, - "user_id": effective_user_id, - "content": "Analysiere den beigefügten Datensatz zu [THEMA] und identifiziere die wichtigsten Trends, Muster und Auffälligkeiten. Führe statistische Berechnungen durch, um deine Erkenntnisse zu untermauern. Stelle die Ergebnisse in einer klar strukturierten Analyse dar und ziehe relevante Schlussfolgerungen.", - "name": "Analyse: Datenanalyse" - }, - { - "mandate_id": effective_mandate_id, - "user_id": effective_user_id, - "content": "Erstelle ein detailliertes Protokoll unserer Besprechung zum Thema [THEMA]. Erfasse alle besprochenen Punkte, getroffenen Entscheidungen und vereinbarten Maßnahmen. Strukturiere das Protokoll übersichtlich mit Tagesordnungspunkten, Teilnehmerliste und klaren Verantwortlichkeiten für die Follow-up-Aktionen.", - "name": "Protokoll: Besprechungsprotokoll" - }, - { - "mandate_id": effective_mandate_id, - "user_id": effective_user_id, - "content": "Entwickle ein UI/UX-Designkonzept für [ANWENDUNG/WEBSITE]. Berücksichtige die Zielgruppe, Hauptfunktionen und die Markenidentität. Beschreibe die visuelle Gestaltung, Navigation, Interaktionsmuster und Informationsarchitektur. Erläutere, wie das Design die Benutzerfreundlichkeit und das Nutzererlebnis optimiert.", - "name": "Design: UI/UX Design" - } - ] - - # Prompts erstellen - for prompt_data in standard_prompts: - created_prompt = self.db.record_create("prompts", prompt_data) - logger.info(f"Prompt '{prompt_data.get('name', 'Standard')}' wurde erstellt mit ID {created_prompt['id']}") - - - # Utilities - - def get_initial_id(self, table: str) -> Optional[int]: - """ - Gibt die initiale ID für eine Tabelle zurück. - - Args: - table: Name der Tabelle - - Returns: - Die initiale ID oder None, wenn nicht vorhanden - """ - return self.db.get_initial_id(table) - - def _get_current_timestamp(self) -> str: - """Gibt den aktuellen Zeitstempel im ISO-Format zurück""" - return datetime.now().isoformat() - - - # Prompt-Methoden - - def get_all_prompts(self) -> List[Dict[str, Any]]: - """Gibt alle Prompts des aktuellen Mandanten zurück""" - return self.db.get_recordset("prompts") - - def get_prompt(self, prompt_id: int) -> Optional[Dict[str, Any]]: - """Gibt einen Prompt anhand seiner ID zurück""" - prompts = self.db.get_recordset("prompts", record_filter={"id": prompt_id}) - if prompts: - return prompts[0] - return None - - def create_prompt(self, content: str, name: str) -> Dict[str, Any]: - """Erstellt einen neuen Prompt""" - prompt_data = { - "mandate_id": self.mandate_id, - "user_id": self.user_id, - "content": content, - "name": name, - "created_at": self._get_current_timestamp() - } - - return self.db.record_create("prompts", prompt_data) - - def update_prompt(self, prompt_id: int, content: str = None, name: str = None) -> Dict[str, Any]: - """ - Aktualisiert einen vorhandenen Prompt - - Args: - prompt_id: ID des zu aktualisierenden Prompts - content: Neuer Inhalt des Prompts - - Returns: - Das aktualisierte Prompt-Objekt - """ - # Prüfen, ob der Prompt existiert - prompt = self.get_prompt(prompt_id) - if not prompt: - return None - - # Daten für die Aktualisierung vorbereiten - prompt_data = {} - - if content is not None: - prompt_data["content"] = content - if name is not None: - prompt_data["name"] = name - - # Prompt aktualisieren - return self.db.record_modify("prompts", prompt_id, prompt_data) - - def delete_prompt(self, prompt_id: int) -> bool: - """ - Löscht einen Prompt aus der Datenbank - - Args: - prompt_id: ID des zu löschenden Prompts - - Returns: - True, wenn der Prompt erfolgreich gelöscht wurde, sonst False - """ - return self.db.record_delete("prompts", prompt_id) - - - # File Utilities - - def calculate_file_hash(self, file_content: bytes) -> str: - """Berechnet einen SHA-256-Hash für den Dateiinhalt""" - return hashlib.sha256(file_content).hexdigest() - - def check_for_duplicate_file(self, file_hash: str) -> Optional[Dict[str, Any]]: - """Prüft, ob bereits eine Datei mit demselben Hash existiert""" - files = self.db.get_recordset("files", record_filter={"file_hash": file_hash}) - if files: - return files[0] - return None - - def get_mime_type(self, filename: str) -> str: - """Ermittelt den MIME-Typ basierend auf der Dateiendung""" - import os - ext = os.path.splitext(filename)[1].lower()[1:] - extension_to_mime = { - "pdf": "application/pdf", - "docx": "application/vnd.openxmlformats-officedocument.wordprocessingml.document", - "doc": "application/msword", - "xlsx": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", - "xls": "application/vnd.ms-excel", - "pptx": "application/vnd.openxmlformats-officedocument.presentationml.presentation", - "ppt": "application/vnd.ms-powerpoint", - "csv": "text/csv", - "txt": "text/plain", - "json": "application/json", - "xml": "application/xml", - "html": "text/html", - "htm": "text/html", - "jpg": "image/jpeg", - "jpeg": "image/jpeg", - "png": "image/png", - "gif": "image/gif", - "webp": "image/webp", - "svg": "image/svg+xml", - "py": "text/x-python", - "js": "application/javascript", - "css": "text/css" - } - return extension_to_mime.get(ext.lower(), "application/octet-stream") - - - # File Methoden - Metadaten-basierte Operationen - - def get_all_files(self) -> List[Dict[str, Any]]: - """ - Gibt alle Dateien des aktuellen Mandanten zurück ohne Binärdaten. - - Returns: - Liste von FileItem-Objekten ohne Binärdaten - """ - files = self.db.get_recordset("files") - return files - - def get_file(self, file_id: int) -> Optional[Dict[str, Any]]: - """ - Gibt eine Datei anhand ihrer ID zurück, ohne Binärdaten. - - Args: - file_id: ID der gesuchten Datei - - Returns: - FileItem ohne Binärdaten oder None, wenn nicht gefunden - """ - files = self.db.get_recordset("files", record_filter={"id": file_id}) - if files: - return files[0] - return None - - def create_file(self, name: str, mime_type: str, size: int = None, file_hash: str = None) -> Dict[str, Any]: - """ - Erstellt einen neuen Dateieintrag in der Datenbank ohne Inhalt. - Der eigentliche Dateiinhalt wird separat in der FileData-Tabelle gespeichert. - - Args: - name: Name der Datei - mime_type: MIME-Typ der Datei - size: Größe der Datei in Bytes - file_hash: Hash-Wert der Datei für Deduplizierung - - Returns: - Das erstellte FileItem-Objekt - """ - file_data = { - "mandate_id": self.mandate_id, - "user_id": self.user_id, - "name": name, - "mime_type": mime_type, - "size": size, - "file_hash": file_hash, - "creation_date": self._get_current_timestamp() - } - return self.db.record_create("files", file_data) - - def update_file(self, file_id: int, update_data: Dict[str, Any]) -> Dict[str, Any]: - """ - Aktualisiert die Metadaten einer vorhandenen Datei ohne die Binärdaten zu beeinflussen. - - Args: - file_id: ID der zu aktualisierenden Datei - update_data: Dictionary mit zu aktualisierenden Feldern - - Returns: - Das aktualisierte FileItem-Objekt - """ - # Prüfen, ob die Datei existiert - file = self.get_file(file_id) - if not file: - raise FileNotFoundError(f"Datei mit ID {file_id} nicht gefunden") - - # Datei aktualisieren - return self.db.record_modify("files", file_id, update_data) - - def delete_file(self, file_id: int) -> bool: - """ - Löscht eine Datei aus der Datenbank (Metadaten und Inhalt). - - Args: - file_id: ID der Datei - - Returns: - True bei Erfolg, False bei Fehler - """ - try: - # Suche die Datei in der Datenbank - file = self.get_file(file_id) - - if not file: - raise FileNotFoundError(f"Datei mit ID {file_id} nicht gefunden") - - # Prüfe, ob die Datei zum aktuellen Mandanten gehört - if file.get("mandate_id") != self.mandate_id: - raise FilePermissionError(f"Keine Berechtigung zum Löschen der Datei {file_id}") - - # Check for other references to this file (by hash) - file_hash = file.get("file_hash") - if file_hash: - other_references = [f for f in self.db.get_recordset("files", record_filter={"file_hash": file_hash}) - if f.get("id") != file_id] - - # If other files reference this content, only delete the database entry for FileItem - if other_references: - logger.info(f"Andere Referenzen auf den Dateiinhalt gefunden, nur FileItem wird gelöscht: {file_id}") - else: - # Lösche auch den Dateiinhalt in der FileData-Tabelle - try: - file_data_entries = self.db.get_recordset("file_data", record_filter={"id": file_id}) - if file_data_entries: - self.db.record_delete("file_data", file_id) - logger.info(f"FileData für Datei {file_id} gelöscht") - except Exception as e: - logger.warning(f"Fehler beim Löschen des FileData für Datei {file_id}: {str(e)}") - - # Lösche den FileItem-Eintrag - return self.db.record_delete("files", file_id) - - except FileNotFoundError as e: - # Pass through FileNotFoundError - raise - except FilePermissionError as e: - # Pass through FilePermissionError - raise - except Exception as e: - logger.error(f"Fehler beim Löschen der Datei {file_id}: {str(e)}") - raise FileDeletionError(f"Fehler beim Löschen der Datei: {str(e)}") - - - # FileData Methoden - Binärdaten-basierte Operationen - - def create_file_data(self, file_id: int, data: bytes) -> bool: - """ - Speichert die Binärdaten einer Datei in der Datenbank als Base64-String. - - Args: - file_id: ID der zugehörigen Datei - data: Binärdaten - - Returns: - True bei Erfolg, False bei Fehler - """ - try: - import base64 - - # Convert binary data to base64 string - if isinstance(data, bytes): - encoded_data = base64.b64encode(data).decode('utf-8') - logger.debug(f"Converted {len(data)} bytes to base64 string of length {len(encoded_data)}") - else: - logger.warning(f"Data is not bytes, but {type(data)}. Attempting to handle...") - # Try to convert to bytes if it's not already - if isinstance(data, str): - # Check if it might already be base64 encoded - try: - # See if it's valid base64 - base64.b64decode(data) - # If no error, assume it's already encoded - encoded_data = data - logger.info(f"Data appears to be already base64 encoded, using as is") - except: - # Not base64, so encode the string as bytes then to base64 - encoded_data = base64.b64encode(data.encode('utf-8')).decode('utf-8') - logger.info(f"Converted string to base64") - else: - # For other types, convert to string first - encoded_data = base64.b64encode(str(data).encode('utf-8')).decode('utf-8') - logger.warning(f"Converted non-standard type to base64") - - # Create the file_data record with encoded data - file_data = { - "id": file_id, - "data": encoded_data - } - - self.db.record_create("file_data", file_data) - logger.info(f"Successfully stored encoded data for file {file_id}") - return True - except Exception as e: - logger.error(f"Fehler beim Speichern der Binärdaten für Datei {file_id}: {str(e)}") - return False - - def get_file_data(self, file_id: int) -> Optional[bytes]: - """ - Gibt die Binärdaten einer Datei zurück. - Konvertiert Base64-String aus der Datenbank zurück zu bytes. - - Args: - file_id: ID der Datei - - Returns: - Binärdaten oder None, wenn nicht gefunden - """ - import base64 - - file_data_entries = self.db.get_recordset("file_data", record_filter={"id": file_id}) - if file_data_entries and "data" in file_data_entries[0]: - encoded_data = file_data_entries[0]["data"] - - try: - # Check if it's a string (most likely base64) - if isinstance(encoded_data, str): - try: - # Try to decode base64 - binary_data = base64.b64decode(encoded_data) - logger.debug(f"Successfully decoded base64 string to {len(binary_data)} bytes") - return binary_data - except Exception as e: - logger.error(f"Failed to decode base64 data: {str(e)}") - # If it's not valid base64, return as bytes - return encoded_data.encode('utf-8') - # If it's already bytes (shouldn't happen with model change) - elif isinstance(encoded_data, bytes): - logger.warning(f"Data was already bytes, no conversion needed") - return encoded_data - else: - logger.error(f"Unexpected data type in database: {type(encoded_data)}") - return None - except Exception as e: - logger.error(f"Error processing file data: {str(e)}") - return None - else: - logger.warning(f"No data found for file ID {file_id}") - return None - - def update_file_data(self, file_id: int, data: Union[bytes, str]) -> bool: - """ - Aktualisiert die Binärdaten einer Datei in der Datenbank. - Konvertiert bytes zu Base64-String für die Speicherung. - - Args: - file_id: ID der Datei - data: Neue Binärdaten oder kodierte Daten - - Returns: - True bei Erfolg, False bei Fehler - """ - try: - import base64 - - # Convert data to base64 string if it's bytes - if isinstance(data, bytes): - encoded_data = base64.b64encode(data).decode('utf-8') - logger.debug(f"Converted {len(data)} bytes to base64 string") - elif isinstance(data, str): - # Check if it might already be base64 encoded - try: - # See if it's valid base64 - base64.b64decode(data) - # If no error, assume it's already encoded - encoded_data = data - logger.debug(f"Data appears to be already base64 encoded, using as is") - except: - # Not base64, so encode the string as bytes then to base64 - encoded_data = base64.b64encode(data.encode('utf-8')).decode('utf-8') - logger.debug(f"Converted string to base64") - else: - # For other types, convert to string first - encoded_data = base64.b64encode(str(data).encode('utf-8')).decode('utf-8') - logger.warning(f"Converted non-standard type to base64") - - # Check if a record already exists - file_data_entries = self.db.get_recordset("file_data", record_filter={"id": file_id}) - - if file_data_entries: - # Update the existing record - self.db.record_modify("file_data", file_id, {"data": encoded_data}) - logger.info(f"Updated existing file data for file ID {file_id}") - else: - # Create a new record - file_data = { - "id": file_id, - "data": encoded_data - } - self.db.record_create("file_data", file_data) - logger.info(f"Created new file data for file ID {file_id}") - - return True - except Exception as e: - logger.error(f"Fehler beim Aktualisieren der Binärdaten für Datei {file_id}: {str(e)}") - return False - - def save_uploaded_file(self, file_content: bytes, file_name: str) -> Dict[str, Any]: - """ - Speichert eine hochgeladene Datei in der Datenbank. - Metadaten werden in der 'files'-Tabelle gespeichert, - Binärdaten in der 'file_data'-Tabelle als Base64-String. - - Args: - file_content: Binärdaten der Datei - file_name: Name der Datei - - Returns: - Dictionary mit Metadaten der gespeicherten Datei - """ - try: - # Debug: Log the start of the file upload process - logger.info(f"Starting upload process for file: {file_name}") - - # Debug: Check if file_content is valid bytes - if not isinstance(file_content, bytes): - logger.error(f"Invalid file_content type: {type(file_content)}") - raise ValueError(f"file_content must be bytes, got {type(file_content)}") - - # Calculate file hash for deduplication - file_hash = self.calculate_file_hash(file_content) - logger.debug(f"Calculated file hash: {file_hash}") - - # Check for duplicate - existing_file = self.check_for_duplicate_file(file_hash) - if existing_file: - # Simply return the existing file metadata - logger.info(f"Duplikat gefunden für {file_name}: {existing_file['id']}") - return existing_file - - # MIME-Typ bestimmen - mime_type = self.get_mime_type(file_name) - - # Dateigröße bestimmen - file_size = len(file_content) - - # 1. Speichere Metadaten in der 'files'-Tabelle - logger.info(f"Saving file metadata to database for file: {file_name}") - db_file = self.create_file( - name=file_name, - mime_type=mime_type, - size=file_size, - file_hash=file_hash - ) - - # 2. Speichere Binärdaten als Base64-String in der 'file_data'-Tabelle - logger.info(f"Saving file content to database for file: {file_name}") - self.create_file_data(db_file["id"], file_content) - - # Debug: Verify database record was created - if not db_file: - logger.warning(f"Database record for file {file_name} was not created properly") - else: - logger.info(f"Database record created for file {file_name}") - - logger.info(f"File upload process completed for: {file_name}") - return db_file - - except Exception as e: - logger.error(f"Error in save_uploaded_file for {file_name}: {str(e)}", exc_info=True) - raise FileStorageError(f"Fehler beim Speichern der Datei: {str(e)}") - - def download_file(self, file_id: int) -> Optional[Dict[str, Any]]: - """ - Gibt eine Datei zum Download zurück, einschließlich Binärdaten. - - Args: - file_id: ID der Datei - - Returns: - Dictionary mit Dateidaten und -metadaten oder None, wenn nicht gefunden - """ - try: - # 1. Metadaten aus der 'files'-Tabelle holen - file = self.get_file(file_id) - - if not file: - raise FileNotFoundError(f"Datei mit ID {file_id} nicht gefunden") - - # 2. Binärdaten aus der 'file_data'-Tabelle holen - file_content = self.get_file_data(file_id) - - if file_content is None: - raise FileNotFoundError(f"Binärdaten für Datei mit ID {file_id} nicht gefunden") - - return { - "id": file_id, - "name": file.get("name", f"file_{file_id}"), - "content_type": file.get("mime_type", "application/octet-stream"), - "size": file.get("size", len(file_content)), - "content": file_content - } - except FileNotFoundError as e: - # Re-raise FileNotFoundError as is - raise - except Exception as e: - logger.error(f"Fehler beim Herunterladen der Datei {file_id}: {str(e)}") - raise FileError(f"Fehler beim Herunterladen der Datei: {str(e)}") - - - # Workflow Methoden - - def get_all_workflows(self) -> List[Dict[str, Any]]: - """Gibt alle Workflows des aktuellen Mandanten zurück""" - return self.db.get_recordset("workflows") - - def get_workflows_by_user(self, user_id: int) -> List[Dict[str, Any]]: - """Gibt alle Workflows eines Benutzers zurück""" - return self.db.get_recordset("workflows", record_filter={"user_id": user_id}) - - def get_workflow(self, workflow_id: str) -> Optional[Dict[str, Any]]: - """Gibt einen Workflow anhand seiner ID zurück""" - workflows = self.db.get_recordset("workflows", record_filter={"id": workflow_id}) - if workflows: - return workflows[0] - return None - - def create_workflow(self, workflow_data: Dict[str, Any]) -> Dict[str, Any]: - """Erstellt einen neuen Workflow in der Datenbank""" - # Stellen Sie sicher, dass mandate_id und user_id gesetzt sind - if "mandate_id" not in workflow_data: - workflow_data["mandate_id"] = self.mandate_id - - if "user_id" not in workflow_data: - workflow_data["user_id"] = self.user_id - - # Zeitstempel setzen, falls nicht vorhanden - current_time = self._get_current_timestamp() - if "started_at" not in workflow_data: - workflow_data["started_at"] = current_time - - if "last_activity" not in workflow_data: - workflow_data["last_activity"] = current_time - - # Stelle sicher, dass last_message_id gesetzt ist, falls nicht vorhanden - if "last_message_id" not in workflow_data: - workflow_data["last_message_id"] = "" - - return self.db.record_create("workflows", workflow_data) - - def update_workflow(self, workflow_id: str, workflow_data: Dict[str, Any]) -> Dict[str, Any]: - """ - Aktualisiert einen vorhandenen Workflow. - - Args: - workflow_id: ID des zu aktualisierenden Workflows - workflow_data: Neue Daten für den Workflow - - Returns: - Das aktualisierte Workflow-Objekt - """ - # Prüfen, ob der Workflow existiert - workflow = self.get_workflow(workflow_id) - if not workflow: - return None - - # Aktualisierungszeit setzen - workflow_data["last_activity"] = self._get_current_timestamp() - - # Workflow aktualisieren - return self.db.record_modify("workflows", workflow_id, workflow_data) - - def delete_workflow(self, workflow_id: str) -> bool: - """ - Löscht einen Workflow aus der Datenbank. - - Args: - workflow_id: ID des zu löschenden Workflows - - Returns: - True bei Erfolg, False wenn der Workflow nicht existiert - """ - # Prüfen, ob der Workflow existiert - workflow = self.get_workflow(workflow_id) - if not workflow: - return False - - # Prüfen, ob der Benutzer der Eigentümer ist oder Admin-Rechte hat - if workflow.get("user_id") != self.user_id: - # Hier könnte eine Prüfung auf Admin-Rechte erfolgen - return False - - # Workflow löschen - return self.db.record_delete("workflows", workflow_id) - - - # Workflow Messages - - def get_workflow_messages(self, workflow_id: str) -> List[Dict[str, Any]]: - """Gibt alle Nachrichten eines Workflows zurück""" - return self.db.get_recordset("workflow_messages", record_filter={"workflow_id": workflow_id}) - - def create_workflow_message(self, message_data: Dict[str, Any]) -> Dict[str, Any]: - """Erstellt eine neue Nachricht für einen Workflow - - Args: - message_data: Die Nachrichtendaten - - Returns: - Die erstellte Nachricht oder None bei Fehler - """ - try: - # Check if required fields are present - required_fields = ["id", "workflow_id"] - for field in required_fields: - if field not in message_data: - logger.error(f"Pflichtfeld '{field}' fehlt in message_data") - raise ValueError(f"Pflichtfeld '{field}' fehlt in den Nachrichtendaten") - - # Validate that ID is not None - if message_data["id"] is None: - message_data["id"] = f"msg_{uuid.uuid4()}" - logger.warning(f"Automatisch generierte ID für Workflow-Nachricht: {message_data['id']}") - - # Stellen Sie sicher, dass die benötigten Felder vorhanden sind - if "started_at" not in message_data and "created_at" not in message_data: - message_data["started_at"] = self._get_current_timestamp() - - # Wenn "created_at" vorhanden ist, übertrage es nach "started_at" - if "created_at" in message_data and "started_at" not in message_data: - message_data["started_at"] = message_data["created_at"] - del message_data["created_at"] - - # Status setzen, falls nicht vorhanden - if "status" not in message_data: - message_data["status"] = "completed" - - # Sequenznummer setzen, falls nicht vorhanden - if "sequence_no" not in message_data: - # Hole aktuelle Nachrichten, um die nächste Sequenznummer zu bestimmen - existing_messages = self.get_workflow_messages(message_data["workflow_id"]) - message_data["sequence_no"] = len(existing_messages) + 1 - - # Debug-Log für die zu erstellenden Daten - logger.debug(f"Erstelle Workflow-Nachricht mit Daten: {message_data}") - - return self.db.record_create("workflow_messages", message_data) - except Exception as e: - logger.error(f"Fehler beim Erstellen der Workflow-Nachricht: {str(e)}") - # Return None instead of raising to avoid cascading failures - return None - - def update_workflow_message(self, message_id: str, message_data: Dict[str, Any]) -> Dict[str, Any]: - """ - Aktualisiert eine bestehende Workflow-Nachricht in der Datenbank - with improved document handling. - - Args: - message_id: ID der Nachricht - message_data: Zu aktualisierende Daten - - Returns: - Das aktualisierte Nachrichtenobjekt oder None bei Fehler - """ - try: - # Print debug info - print(f"Updating message {message_id} in database") - - # Ensure message_id is provided - if not message_id: - logger.error("No message_id provided for update_workflow_message") - raise ValueError("message_id cannot be empty") - - # Check if message exists in database - messages = self.db.get_recordset("workflow_messages", record_filter={"id": message_id}) - if not messages: - logger.warning(f"Message with ID {message_id} does not exist in database") - - # If message doesn't exist but we have workflow_id, create it - if "workflow_id" in message_data: - logger.info(f"Creating new message with ID {message_id} for workflow {message_data.get('workflow_id')}") - return self.db.record_create("workflow_messages", message_data) - else: - logger.error(f"Workflow ID missing for new message {message_id}") - return None - - # Ensure documents array is handled properly - if "documents" in message_data: - logger.info(f"Message {message_id} has {len(message_data['documents'])} documents") - - # Make sure we're not storing huge content in the database - # For each document, ensure content size is reasonable - documents_to_store = [] - for doc in message_data["documents"]: - doc_copy = doc.copy() - - # Process contents array if it exists - if "contents" in doc_copy: - # Ensure contents is not too large - limit text size - for content in doc_copy["contents"]: - if content.get("type") == "text" and "text" in content: - text = content["text"] - if len(text) > 1000: # Limit text preview to 1000 chars - content["text"] = text[:1000] + "... [truncated]" - - documents_to_store.append(doc_copy) - - # Replace with the processed documents - message_data["documents"] = documents_to_store - - # Log the update data size for debugging - update_data_size = len(str(message_data)) - logger.debug(f"Update data size: {update_data_size} bytes") - - # Ensure ID is in the dataset - if 'id' not in message_data: - message_data['id'] = message_id - - # Konvertiere created_at zu started_at falls nötig - if "created_at" in message_data and "started_at" not in message_data: - message_data["started_at"] = message_data["created_at"] - del message_data["created_at"] - - # Update the message - updated_message = self.db.record_modify("workflow_messages", message_id, message_data) - if updated_message: - logger.info(f"Message {message_id} updated successfully") - else: - logger.warning(f"Failed to update message {message_id}") - - return updated_message - except Exception as e: - logger.error(f"Error updating message {message_id}: {str(e)}", exc_info=True) - # Re-raise with full information - raise ValueError(f"Error updating message {message_id}: {str(e)}") - - def delete_workflow_message(self, workflow_id: str, message_id: str) -> bool: - """ - Löscht eine Nachricht aus einem Workflow in der Datenbank. - - Args: - workflow_id: ID des zugehörigen Workflows - message_id: ID der zu löschenden Nachricht - - Returns: - True bei Erfolg, False bei Fehler - """ - try: - # Prüfen, ob die Nachricht existiert - messages = self.get_workflow_messages(workflow_id) - message = next((m for m in messages if m.get("id") == message_id), None) - - if not message: - logger.warning(f"Nachricht {message_id} für Workflow {workflow_id} nicht gefunden") - return False - - # Nachricht aus der Datenbank löschen - return self.db.record_delete("workflow_messages", message_id) - except Exception as e: - logger.error(f"Fehler beim Löschen der Nachricht {message_id}: {str(e)}") - return False - - def delete_file_from_message(self, workflow_id: str, message_id: str, file_id: int) -> bool: - """ - Entfernt eine Dateireferenz aus einer Nachricht. - Die Datei selbst wird nicht gelöscht, nur die Referenz in der Nachricht. - Enhanced version with improved file matching. - - Args: - workflow_id: ID des zugehörigen Workflows - message_id: ID der Nachricht - file_id: ID der zu entfernenden Datei - - Returns: - True bei Erfolg, False bei Fehler - """ - try: - # Log operation - logger.info(f"Removing file {file_id} from message {message_id} in workflow {workflow_id}") - - # Get all workflow messages - all_messages = self.get_workflow_messages(workflow_id) - logger.debug(f"Workflow {workflow_id} has {len(all_messages)} messages") - - # Try different approaches to find the message - message = None - - # Exact match - message = next((m for m in all_messages if m.get("id") == message_id), None) - - # Case-insensitive match - if not message and isinstance(message_id, str): - message = next((m for m in all_messages - if isinstance(m.get("id"), str) and m.get("id").lower() == message_id.lower()), None) - - # Partial match (starts with) - if not message and isinstance(message_id, str): - message = next((m for m in all_messages - if isinstance(m.get("id"), str) and m.get("id").startswith(message_id)), None) - - if not message: - logger.warning(f"Message {message_id} not found in workflow {workflow_id}") - return False - - # Log the found message - logger.info(f"Found message: {message.get('id')}") - - # Check if message has documents - if "documents" not in message or not message["documents"]: - logger.warning(f"No documents in message {message_id}") - return False - - # Log existing documents - documents = message.get("documents", []) - logger.debug(f"Message has {len(documents)} documents") - for i, doc in enumerate(documents): - doc_id = doc.get("id", "unknown") - file_id_value = doc.get("file_id", "unknown") - logger.debug(f"Document {i}: doc_id={doc_id}, file_id={file_id_value}") - - # Create a new list of documents without the one to delete - updated_documents = [] - removed = False - - for doc in documents: - doc_id = doc.get("id") - file_id_value = doc.get("file_id") - - # Flexible matching approach - should_remove = ( - (doc_id == file_id) or - (file_id_value == file_id) or - (isinstance(doc_id, str) and str(file_id) in doc_id) or - (isinstance(file_id_value, str) and str(file_id) in file_id_value) - ) - - if should_remove: - removed = True - logger.info(f"Found file to remove: doc_id={doc_id}, file_id={file_id_value}") - else: - updated_documents.append(doc) - - if not removed: - logger.warning(f"No matching file {file_id} found in message {message_id}") - return False - - # Update message with modified documents array - message_update = { - "documents": updated_documents - } - - # Apply the update directly to the database - updated = self.db.record_modify("workflow_messages", message["id"], message_update) - - if updated: - logger.info(f"Successfully removed file {file_id} from message {message_id}") - return True - else: - logger.warning(f"Failed to update message {message_id} in database") - return False - - except Exception as e: - logger.error(f"Error removing file {file_id} from message {message_id}: {str(e)}") - return False - - - # Workflow Logs - - def get_workflow_logs(self, workflow_id: str) -> List[Dict[str, Any]]: - """Gibt alle Log-Einträge eines Workflows zurück""" - return self.db.get_recordset("workflow_logs", record_filter={"workflow_id": workflow_id}) - - def create_workflow_log(self, log_data: Dict[str, Any]) -> Dict[str, Any]: - """Erstellt einen neuen Log-Eintrag für einen Workflow""" - # Stellen Sie sicher, dass die benötigten Felder vorhanden sind - if "timestamp" not in log_data: - log_data["timestamp"] = self._get_current_timestamp() - - return self.db.record_create("workflow_logs", log_data) - - - # Workflow Management - - def save_workflow_state(self, workflow: Dict[str, Any], save_messages: bool = True, save_logs: bool = True) -> bool: - """ - Speichert den kompletten Zustand eines Workflows in der Datenbank. - Dies umfasst den Workflow selbst, Nachrichten und Logs. - - Args: - workflow: Das vollständige Workflow-Objekt - save_messages: Flag, ob Nachrichten gespeichert werden sollen - save_logs: Flag, ob Logs gespeichert werden sollen - - Returns: - True bei Erfolg, False bei Fehler - """ - try: - workflow_id = workflow.get("id") - if not workflow_id: - return False - - # Extrahiere nur die für die Datenbank relevanten Workflow-Felder - workflow_db_data = { - "id": workflow_id, - "mandate_id": workflow.get("mandate_id", self.mandate_id), - "user_id": workflow.get("user_id", self.user_id), - "name": workflow.get("name", f"Workflow {workflow_id}"), - "status": workflow.get("status", "unknown"), - "started_at": workflow.get("started_at", self._get_current_timestamp()), - "last_activity": workflow.get("last_activity", self._get_current_timestamp()), - "last_message_id": workflow.get("last_message_id", ""), - "data_stats": workflow.get("data_stats", {}) - } - - # Prüfen, ob der Workflow bereits existiert - existing_workflow = self.get_workflow(workflow_id) - if existing_workflow: - self.update_workflow(workflow_id, workflow_db_data) - else: - self.create_workflow(workflow_db_data) - - - # Nachrichten speichern - if save_messages and "messages" in workflow: - # Bestehende Nachrichten abrufen - existing_messages = {msg["id"]: msg for msg in self.get_workflow_messages(workflow_id)} - - for message in workflow["messages"]: - message_id = message.get("id") - if not message_id: - continue - - # Nur relevante Daten für die Datenbank extrahieren - message_data = { - "id": message_id, - "workflow_id": workflow_id, - "sequence_no": message.get("sequence_no", 0), - "role": message.get("role", "unknown"), - "content": message.get("content"), - "agent_name": message.get("agent_name"), - "status": message.get("status", "completed"), - "started_at": message.get("started_at", self._get_current_timestamp()), - "finished_at": message.get("finished_at"), - "parent_message_id": message.get("parent_message_id"), - # IMPORTANT: Include documents field to persist file attachments - "documents": message.get("documents", []) - } - - # Debug logging for documents - doc_count = len(message.get("documents", [])) - if doc_count > 0: - logger.info(f"Message {message_id} has {doc_count} documents to save") - - # Nachricht erstellen oder aktualisieren - if message_id in existing_messages: - self.db.record_modify("workflow_messages", message_id, message_data) - else: - self.db.record_create("workflow_messages", message_data) - - # Logs speichern - if save_logs and "logs" in workflow: - # Bestehende Logs abrufen - existing_logs = {log["id"]: log for log in self.get_workflow_logs(workflow_id)} - - for log in workflow["logs"]: - log_id = log.get("id") - if not log_id: - continue - - # Nur relevante Daten für die Datenbank extrahieren - log_data = { - "id": log_id, - "workflow_id": workflow_id, - "message": log.get("message", ""), - "type": log.get("type", "info"), - "timestamp": log.get("timestamp", self._get_current_timestamp()), - "agent_id": log.get("agent_id"), - "agent_name": log.get("agent_name") - } - - # Log erstellen oder aktualisieren - if log_id in existing_logs: - self.db.record_modify("workflow_logs", log_id, log_data) - else: - self.db.record_create("workflow_logs", log_id, log_data) - - return True - except Exception as e: - logger.error(f"Fehler beim Speichern des Workflow-Zustands: {str(e)}") - return False - - def load_workflow_state(self, workflow_id: str) -> Optional[Dict[str, Any]]: - """ - Lädt den kompletten Zustand eines Workflows aus der Datenbank. - Dies umfasst den Workflow selbst, Nachrichten und Logs. - - Args: - workflow_id: ID des zu ladenden Workflows - - Returns: - Das vollständige Workflow-Objekt oder None bei Fehler - """ - try: - # Basis-Workflow laden - workflow = self.get_workflow(workflow_id) - if not workflow: - return None - - # Log the workflow base retrieval - logger.debug(f"Loaded base workflow {workflow_id} from database") - - # Nachrichten laden - messages = self.get_workflow_messages(workflow_id) - # Nach Sequenznummer sortieren - messages.sort(key=lambda x: x.get("sequence_no", 0)) - - # Debug log for messages and document counts - message_count = len(messages) - logger.debug(f"Loaded {message_count} messages for workflow {workflow_id}") - - # Log document counts for each message - for msg in messages: - doc_count = len(msg.get("documents", [])) - if doc_count > 0: - logger.info(f"Message {msg.get('id')} has {doc_count} documents loaded from database") - # Log document details for debugging - for i, doc in enumerate(msg.get("documents", [])): - file_id = doc.get("file_id", "unknown") - logger.debug(f"Document {i+1}: file_id={file_id}") - - # Logs laden - logs = self.get_workflow_logs(workflow_id) - # Nach Zeitstempel sortieren - logs.sort(key=lambda x: x.get("timestamp", "")) - - # Vollständiges Workflow-Objekt zusammenbauen - complete_workflow = workflow.copy() - complete_workflow["messages"] = messages - complete_workflow["logs"] = logs - - return complete_workflow - except Exception as e: - logger.error(f"Fehler beim Laden des Workflow-Zustands: {str(e)}") - return None - - -# Singleton-Factory für LucyDOMInterface-Instanzen pro Kontext -_lucydom_interfaces = {} - -def get_lucydom_interface(mandate_id: int = 0, user_id: int = 0) -> LucyDOMInterface: - """ - Gibt eine LucyDOMInterface-Instanz für den angegebenen Kontext zurück. - Wiederverwendet bestehende Instanzen. - """ - context_key = f"{mandate_id}_{user_id}" - if context_key not in _lucydom_interfaces: - _lucydom_interfaces[context_key] = LucyDOMInterface(mandate_id, user_id) - return _lucydom_interfaces[context_key] - -# Init -get_lucydom_interface() \ No newline at end of file diff --git a/modules/chat_agent_analyst.py b/modules/chat_agent_analyst.py index d28cfb43..39bf6520 100644 --- a/modules/chat_agent_analyst.py +++ b/modules/chat_agent_analyst.py @@ -1,17 +1,14 @@ """ Data analyst agent for analysis and interpretation of data. -Optimized for the new task-based processing. +Focuses on output-first design with AI-powered analysis. """ import logging import json -import re -import uuid import io import base64 -from typing import Dict, Any, List, Optional +from typing import Dict, Any, List import pandas as pd -import numpy as np import matplotlib.pyplot as plt import seaborn as sns @@ -20,26 +17,23 @@ from modules.chat_registry import AgentBase logger = logging.getLogger(__name__) class AgentAnalyst(AgentBase): - """Agent for analysis and interpretation of data""" + """AI-driven agent for data analysis and visualization""" def __init__(self): """Initialize the data analysis agent""" super().__init__() self.name = "analyst" - self.description = "Analyzes and interprets data using statistical methods and visualizations" + self.description = "Analyzes data using AI-powered insights and visualizations, produce diagrams and visualizations" self.capabilities = [ "data_analysis", - "pattern_recognition", "statistics", "visualization", - "data_interpretation" + "data_interpretation", + "report_generation" ] - # Visualization settings - self.plt_style = 'seaborn-v0_8-whitegrid' - self.default_figsize = (10, 6) - self.chart_dpi = 100 - plt.style.use(self.plt_style) + # Set default visualization settings + plt.style.use('seaborn-v0_8-whitegrid') def set_dependencies(self, ai_service=None): """Set external dependencies for the agent.""" @@ -47,616 +41,645 @@ class AgentAnalyst(AgentBase): async def process_task(self, task: Dict[str, Any]) -> Dict[str, Any]: """ - Process a standardized task structure and perform data analysis. + Process a task by focusing on required outputs and using AI to generate them. Args: - task: A dictionary containing: - - task_id: Unique ID for this task - - prompt: The main instruction for the agent - - input_documents: List of documents to process - - output_specifications: List of required output documents - - context: Additional contextual information - + task: Task dictionary with prompt, input_documents, output_specifications + Returns: - A dictionary containing: - - feedback: Text response explaining the analysis results - - documents: List of created document objects + Dictionary with feedback and documents """ try: - # Extract relevant task information + # Extract task information prompt = task.get("prompt", "") input_documents = task.get("input_documents", []) output_specs = task.get("output_specifications", []) - # Check if AI service is available + # Check AI service if not self.ai_service: - logger.error("No AI service configured for the Analyst agent") return { - "feedback": "The Analyst agent is not properly configured.", + "feedback": "The Analyst agent requires an AI service to function.", "documents": [] } - # Extract data from input documents - data_frames, document_context = self._extract_data_from_documents(input_documents) + # Extract data from documents - focusing only on data_extracted + datasets, document_context = self._extract_data(input_documents) - # Check if we have analyzable content - have_analyzable_content = len(data_frames) > 0 or (prompt and len(prompt.strip()) > 10) + # Generate task analysis to understand what's needed + analysis_plan = await self._analyze_task(prompt, document_context, datasets, output_specs) - if not have_analyzable_content: - # Warning if no analyzable content available - logger.warning("No analyzable content found") - feedback = "I couldn't find any processable data in the provided documents." - return { - "feedback": feedback, - "documents": [] - } + # Generate all required output documents + documents = [] - # Determine analysis type - analysis_type = self._determine_analysis_type(prompt) - logger.info(f"Performing {analysis_type} analysis") + # If no output specs provided, create default analysis outputs + if not output_specs: + output_specs = [] - # Store generated documents - generated_documents = [] - - # Extract data insights if DataFrames are available - data_insights = "" - if data_frames: - data_insights = self._extract_data_insights(data_frames) - logger.info(f"Extracted insights from {len(data_frames)} datasets") - - # Generate an appropriate document for each requested output + # Process each output specification for spec in output_specs: output_label = spec.get("label", "") output_description = spec.get("description", "") - # Determine format based on file extension - format_type = self._determine_format_type(output_label) + # Determine type based on file extension + output_type = output_label.split('.')[-1].lower() if '.' in output_label else "txt" - # Special handling for visualizations if required - if "chart" in output_label.lower() or "plot" in output_label.lower() or "visualization" in output_label.lower() or format_type in ["png", "jpg", "svg"]: - # Generate visualization document if data available - if data_frames: - viz_document = self._generate_visualization_document(data_frames, analysis_type, prompt, output_label) - generated_documents.append(viz_document) - else: - # Fallback if no data - generated_documents.append({ - "label": output_label, - "content": "No data available for visualization." - }) - else: - # Create text-based analysis - content = await self._generate_analysis_document( - prompt, - document_context, - data_insights, - analysis_type, - format_type, - output_label, - output_description + # Generate appropriate content based on output type + if output_type in ['png', 'jpg', 'jpeg', 'svg']: + # Create visualization + document = await self._create_visualization( + datasets, prompt, output_label, analysis_plan, output_description ) - - generated_documents.append({ - "label": output_label, - "content": content - }) + documents.append(document) + elif output_type in ['csv', 'json', 'xlsx']: + # Create data document + document = await self._create_data_document( + datasets, prompt, output_label, analysis_plan, output_description + ) + documents.append(document) + else: + # Create text document (report, analysis, etc.) + document = await self._create_text_document( + datasets, document_context, prompt, output_label, + output_type, analysis_plan, output_description + ) + documents.append(document) - # If no specific outputs requested, create standard documents - if not output_specs: - # Standard analysis - analysis_content = await self._generate_analysis_document( - prompt, - document_context, - data_insights, - analysis_type, - "markdown", - "analysis_report.md", - "Analysis report" - ) - - generated_documents.append({ - "label": "analysis_report.md", - "content": analysis_content - }) - - # Add visualization if data available - if data_frames: - viz_document = self._generate_visualization_document(data_frames, analysis_type, prompt, "data_visualization.png") - generated_documents.append(viz_document) - - # Create feedback - if data_frames: - feedback = f"I analyzed {len(data_frames)} datasets and created {len(generated_documents)} documents with the results." - else: - feedback = f"I performed a text analysis and created {len(generated_documents)} documents with the results." + # Generate feedback + feedback = f"Analysis complete. Created {len(documents)} documents based on your requirements." + if analysis_plan.get("key_insights"): + feedback += f"\n\nKey insights: {analysis_plan.get('key_insights')}" return { "feedback": feedback, - "documents": generated_documents + "documents": documents } except Exception as e: - error_msg = f"Error during data analysis: {str(e)}" - logger.error(error_msg) + logger.error(f"Error in analysis: {str(e)}", exc_info=True) return { - "feedback": f"An error occurred during data analysis: {str(e)}", + "feedback": f"Error during analysis: {str(e)}", "documents": [] } - def _extract_data_from_documents(self, documents: List[Dict[str, Any]]) -> tuple: + def _extract_data(self, documents: List[Dict[str, Any]]) -> tuple: """ - Extract data from input documents. + Extract data from documents, focusing on data_extracted fields. Args: documents: List of input documents Returns: - Tuple of (Dictionary of DataFrames, Document context text) + Tuple of (datasets dictionary, document context text) """ - data_frames = {} + datasets = {} document_context = "" + # Process each document for doc in documents: doc_name = doc.get("name", "unnamed") + if doc.get("ext"): + doc_name = f"{doc_name}.{doc.get('ext')}" + document_context += f"\n\n--- {doc_name} ---\n" + # Process contents for content in doc.get("contents", []): - # Extract text content and add to context - if content.get("metadata", {}).get("is_text", False): - document_context += content.get("data", "") + # Focus only on data_extracted + if content.get("data_extracted"): + extracted_text = content.get("data_extracted", "") + document_context += extracted_text - # Try to parse CSV, JSON, or other data files from text - if doc_name.lower().endswith('.csv'): + # Try to parse as structured data if appropriate + if doc_name.lower().endswith(('.csv', '.tsv')): try: - df = pd.read_csv(io.StringIO(content.get("data", ""))) - df = self._preprocess_dataframe(df) - data_frames[doc_name] = df - logger.info(f"Extracted CSV data from {doc_name}: {df.shape}") - except Exception as e: - logger.warning(f"Error parsing CSV {doc_name}: {str(e)}") - + df = pd.read_csv(io.StringIO(extracted_text)) + datasets[doc_name] = df + except: + pass elif doc_name.lower().endswith('.json'): try: - json_data = json.loads(content.get("data", "")) + json_data = json.loads(extracted_text) if isinstance(json_data, list): df = pd.DataFrame(json_data) + datasets[doc_name] = df elif isinstance(json_data, dict): - # Convert nested JSON to DataFrame + # Handle nested JSON structures if any(isinstance(v, list) for v in json_data.values()): - # If lists present, try to use them for key, value in json_data.items(): if isinstance(value, list) and len(value) > 0: df = pd.DataFrame(value) - break - else: - continue + datasets[f"{doc_name}:{key}"] = df else: df = pd.DataFrame([json_data]) - else: - continue - - df = self._preprocess_dataframe(df) - data_frames[doc_name] = df - logger.info(f"Extracted JSON data from {doc_name}: {df.shape}") - except Exception as e: - logger.warning(f"Error parsing JSON {doc_name}: {str(e)}") + datasets[doc_name] = df + except: + pass + + # Try to detect tabular data in text content + if doc_name not in datasets and len(extracted_text.splitlines()) > 2: + lines = extracted_text.splitlines() + if any(',' in line for line in lines[:5]): + try: + df = pd.read_csv(io.StringIO(extracted_text)) + if len(df.columns) > 1: + datasets[doc_name] = df + except: + pass + elif any('\t' in line for line in lines[:5]): + try: + df = pd.read_csv(io.StringIO(extracted_text), sep='\t') + if len(df.columns) > 1: + datasets[doc_name] = df + except: + pass - return data_frames, document_context + return datasets, document_context - def _determine_format_type(self, output_label: str) -> str: + async def _analyze_task(self, prompt: str, context: str, datasets: Dict, output_specs: List) -> Dict: """ - Determine the format type based on the filename. + Use AI to analyze the task and create a plan for analysis. Args: - output_label: Output filename + prompt: The task prompt + context: Document context text + datasets: Dictionary of extracted datasets + output_specs: Output specifications Returns: - Format type (markdown, html, text, png, etc.) + Analysis plan dictionary """ - output_label_lower = output_label.lower() + # Prepare dataset information + dataset_info = {} + for name, df in datasets.items(): + try: + dataset_info[name] = { + "shape": df.shape, + "columns": df.columns.tolist(), + "dtypes": {col: str(df[col].dtype) for col in df.columns}, + "sample": df.head(3).to_dict(orient='records') + } + except: + dataset_info[name] = {"error": "Could not process dataset"} - if output_label_lower.endswith('.md'): - return "markdown" - elif output_label_lower.endswith('.html'): - return "html" - elif output_label_lower.endswith('.txt'): - return "text" - elif output_label_lower.endswith('.json'): - return "json" - elif output_label_lower.endswith('.csv'): - return "csv" - elif output_label_lower.endswith('.png'): - return "png" - elif output_label_lower.endswith('.jpg') or output_label_lower.endswith('.jpeg'): - return "jpg" - elif output_label_lower.endswith('.svg'): - return "svg" - else: - # Default to markdown - return "markdown" - - def _preprocess_dataframe(self, df: pd.DataFrame) -> pd.DataFrame: - """Perform basic preprocessing for a DataFrame""" - if df.empty: - return df + analysis_prompt = f""" + Analyze this data analysis task and create a plan. - # Remove completely empty rows and columns - df = df.dropna(how='all') - df = df.dropna(axis=1, how='all') + TASK: {prompt} - # String conversion to numeric values where appropriate - for col in df.columns: - # Skip if already numeric - if pd.api.types.is_numeric_dtype(df[col]): - continue + AVAILABLE DATA: + {json.dumps(dataset_info, indent=2)} + + DOCUMENT CONTEXT: + {context[:1000]}... (truncated) + + OUTPUT REQUIREMENTS: + {json.dumps(output_specs, indent=2)} + + Create a detailed analysis plan in JSON format with the following structure: + {{ + "analysis_type": "statistical|trend|comparative|predictive|cluster|general", + "key_questions": ["question1", "question2"], + "recommended_visualizations": [{{ + "type": "chart_type", + "data_source": "dataset_name", + "variables": ["col1", "col2"], + "purpose": "explanation" + }}], + "key_insights": "brief summary of initial insights", + "analysis_approach": "brief description of recommended approach" + }} + + Only return valid JSON. No preamble or explanations. + """ + + try: + response = await self.ai_service.call_api([ + {"role": "system", "content": "You are a data analysis expert. Respond with valid JSON only."}, + {"role": "user", "content": analysis_prompt} + ]) - # Skip if predominantly non-numeric strings - if df[col].dtype == 'object': - # Check if more than 80% of non-NA values could be numeric - non_na_values = df[col].dropna() - if len(non_na_values) == 0: - continue + # Extract JSON from response + json_start = response.find('{') + json_end = response.rfind('}') + 1 + + if json_start >= 0 and json_end > json_start: + plan = json.loads(response[json_start:json_end]) + return plan + else: + # Fallback if JSON not found + return { + "analysis_type": "general", + "key_questions": ["What insights can be extracted from this data?"], + "recommended_visualizations": [], + "key_insights": "Analysis plan could not be created", + "analysis_approach": "General exploratory analysis" + } - # Attempt conversion to numeric values - numeric_count = pd.to_numeric(non_na_values, errors='coerce').notna().sum() - if numeric_count / len(non_na_values) > 0.8: - # More than 80% can be converted to numeric values - df[col] = pd.to_numeric(df[col], errors='coerce') - - return df + except Exception as e: + logger.warning(f"Error creating analysis plan: {str(e)}") + return { + "analysis_type": "general", + "key_questions": ["What insights can be extracted from this data?"], + "recommended_visualizations": [], + "key_insights": "Analysis plan could not be created", + "analysis_approach": "General exploratory analysis" + } - def _determine_analysis_type(self, task: str) -> str: + async def _create_visualization(self, datasets: Dict, prompt: str, output_label: str, + analysis_plan: Dict, description: str) -> Dict: """ - Determine the analysis type based on the task. + Create visualization document using AI guidance. Args: - task: The analysis task - - Returns: - Analysis type - """ - # Using universal patterns rather than language-specific keywords - task_lower = task.lower() - - # Check for statistical analysis - if "statistical" in task_lower or "stats" in task_lower: - return "statistical" - - # Check for trend analysis - elif "trend" in task_lower or "time series" in task_lower: - return "trend" - - # Check for comparative analysis - elif "compare" in task_lower or "comparison" in task_lower or "vs" in task_lower: - return "comparative" - - # Check for predictive analysis - elif "predict" in task_lower or "forecast" in task_lower: - return "predictive" - - # Check for clustering or categorization - elif "cluster" in task_lower or "segment" in task_lower or "classify" in task_lower: - return "clustering" - - # Default: general analysis - else: - return "general" - - def _extract_data_insights(self, data_frames: Dict[str, pd.DataFrame]) -> str: - """ - Extract basic insights from DataFrames. - - Args: - data_frames: Dictionary of DataFrames - - Returns: - Extracted insights as text - """ - insights = [] - - for name, df in data_frames.items(): - if df.empty: - continue - - insight = f"Dataset: {name}\n" - insight += f"Shape: {df.shape[0]} rows, {df.shape[1]} columns\n" - insight += f"Columns: {', '.join(df.columns.tolist())}\n" - - # Basic statistics for numeric columns - numeric_cols = df.select_dtypes(include=['number']).columns - if len(numeric_cols) > 0: - insight += "Statistics for numeric columns:\n" - for col in numeric_cols[:5]: # Limit to first 5 columns - stats = df[col].describe() - insight += f" {col}: min={stats['min']:.2f}, max={stats['max']:.2f}, mean={stats['mean']:.2f}, median={df[col].median():.2f}\n" - - # Categorical column values - cat_cols = df.select_dtypes(include=['object', 'category']).columns - if len(cat_cols) > 0: - insight += "Categorical columns:\n" - for col in cat_cols[:3]: # Limit to first 3 columns - # Get top 3 values - top_values = df[col].value_counts().head(3) - vals_str = ", ".join([f"{val} ({count})" for val, count in top_values.items()]) - insight += f" {col}: {df[col].nunique()} unique values. Most common values: {vals_str}\n" - - insights.append(insight) - - return "\n\n".join(insights) - - def _generate_visualization_document(self, data_frames: Dict[str, pd.DataFrame], - analysis_type: str, prompt: str, - output_label: str) -> Dict[str, Any]: - """ - Generate a visualization document based on the data and analysis type. - - Args: - data_frames: Dictionary of DataFrames - analysis_type: Analysis type - prompt: Original task description + datasets: Dictionary of datasets + prompt: Original task prompt output_label: Output filename + analysis_plan: Analysis plan from AI + description: Output description Returns: Visualization document """ # Determine format from filename - format_type = output_label.split('.')[-1].lower() if '.' in output_label else 'png' - - # Set default format if unknown + format_type = output_label.split('.')[-1].lower() if format_type not in ['png', 'jpg', 'jpeg', 'svg']: format_type = 'png' - - # Use first DataFrame for visualization - if not data_frames: + + # If no datasets available, create error message image + if not datasets: + plt.figure(figsize=(10, 6)) + plt.text(0.5, 0.5, "No data available for visualization", + ha='center', va='center', fontsize=14) + plt.tight_layout() + img_data = self._get_image_base64(format_type) + plt.close() + return { "label": output_label, - "content": "No data available for visualization." + "content": img_data, + "metadata": { + "content_type": f"image/{format_type}" + } } - # Get name and DataFrame of first dataset - name, df = next(iter(data_frames.items())) + # Get recommended visualization from plan + recommended_viz = analysis_plan.get("recommended_visualizations", []) - # Create different visualization types based on analysis type and data - plt.figure(figsize=self.default_figsize) + # Prepare dataset info for the first dataset if none specified + if not recommended_viz and datasets: + name, df = next(iter(datasets.items())) + recommended_viz = [{ + "type": "auto", + "data_source": name, + "variables": df.columns.tolist()[:5], + "purpose": "general analysis" + }] - if analysis_type == "statistical": - # Statistical visualization - self._create_statistical_visualization(df, name) - elif analysis_type == "trend": - # Trend visualization - self._create_trend_visualization(df, name) - elif analysis_type == "comparative": - # Comparative visualization - self._create_comparative_visualization(df, name) - elif analysis_type == "predictive": - # Predictive visualization (simple example) - self._create_predictive_visualization(df, name) - elif analysis_type == "clustering": - # Clustering visualization - self._create_clustering_visualization(df, name) - else: - # General visualization - self._create_general_visualization(df, name) + # Create visualization code prompt + viz_prompt = f""" + Generate Python matplotlib/seaborn code to create a visualization for: - # Save figure as Base64 string - img_data = self._get_figure_as_base64(format_type) - plt.close() + TASK: {prompt} - # Prepare content for document based on format - if format_type in ['png', 'jpg', 'jpeg']: - content_str = img_data - elif format_type == 'svg': - # SVG content as text - buffer = io.StringIO() - plt.savefig(buffer, format='svg') - content_str = buffer.getvalue() - buffer.close() - else: - # Fallback to PNG - content_str = img_data + VISUALIZATION REQUIREMENTS: + - Output format: {format_type} + - Filename: {output_label} + - Description: {description} - return { - "label": output_label, - "content": content_str - } - - def _create_statistical_visualization(self, df: pd.DataFrame, name: str): - """Create a statistical visualization for a DataFrame""" - # Choose numeric columns for display - numeric_cols = df.select_dtypes(include=['number']).columns[:4] # Limit to first 4 + RECOMMENDED VISUALIZATION: + {json.dumps(recommended_viz, indent=2)} - if len(numeric_cols) == 0: - plt.text(0.5, 0.5, "No numeric data found for statistical visualization", - ha='center', va='center', fontsize=12) - return + AVAILABLE DATASETS: + """ - # Visualize distribution of first numeric column - main_col = numeric_cols[0] + # Add dataset info for recommended sources + for viz in recommended_viz: + data_source = viz.get("data_source") + if data_source in datasets: + df = datasets[data_source] + viz_prompt += f"\nDataset '{data_source}':\n" + viz_prompt += f"- Shape: {df.shape}\n" + viz_prompt += f"- Columns: {df.columns.tolist()}\n" + viz_prompt += f"- Sample data: {df.head(3).to_dict(orient='records')}\n" - # Create histogram with KDE - sns.histplot(df[main_col].dropna(), kde=True) - plt.title(f'Distribution of {main_col} - {name}') - plt.xlabel(main_col) - plt.ylabel('Frequency') - plt.tight_layout() - - def _create_trend_visualization(self, df: pd.DataFrame, name: str): - """Create a trend visualization for a DataFrame""" - # Choose numeric columns for display - numeric_cols = df.select_dtypes(include=['number']).columns[:3] # Limit to first 3 + viz_prompt += """ + Generate ONLY Python code that: + 1. Uses matplotlib and/or seaborn to create a clear visualization + 2. Sets figure size to (10, 6) + 3. Includes appropriate titles, labels, and legend + 4. Uses professional color schemes + 5. Handles any missing data gracefully - if len(numeric_cols) == 0: - plt.text(0.5, 0.5, "No numeric data found for trend visualization", - ha='center', va='center', fontsize=12) - return + Return ONLY executable Python code, no explanations or markdown. + """ - # Look for date index or use running index - date_col = None - for col in df.columns: - if pd.api.types.is_datetime64_dtype(df[col]) or 'date' in col.lower() or 'time' in col.lower(): - date_col = col - break - - # Use date column as X-axis if available - if date_col: - for col in numeric_cols: - plt.plot(df[date_col], df[col], marker='o', linestyle='-', label=col) - else: - # Otherwise use index numbers - for col in numeric_cols: - plt.plot(range(len(df)), df[col], marker='o', linestyle='-', label=col) - - plt.title(f'Trend Analysis - {name}') - plt.legend() - plt.grid(True) - plt.tight_layout() - - def _create_comparative_visualization(self, df: pd.DataFrame, name: str): - """Create a comparative visualization for a DataFrame""" - # Choose numeric columns for display - numeric_cols = df.select_dtypes(include=['number']).columns[:4] # Limit to first 4 - - if len(numeric_cols) == 0: - plt.text(0.5, 0.5, "No numeric data found for comparative visualization", - ha='center', va='center', fontsize=12) - return - - # Find categorical column for grouping - categorical_cols = df.select_dtypes(include=['object', 'category']).columns - if len(categorical_cols) > 0: - category_col = categorical_cols[0] + try: + # Get visualization code from AI + viz_code = await self.ai_service.call_api([ + {"role": "system", "content": "You are a data visualization expert. Provide only executable Python code."}, + {"role": "user", "content": viz_prompt} + ]) - # Display maximum of first 7 categories - top_categories = df[category_col].value_counts().head(7).index - filtered_df = df[df[category_col].isin(top_categories)] + # Clean code + viz_code = viz_code.replace("```python", "").replace("```", "").strip() - # Create grouped bar chart - numeric_col = numeric_cols[0] - sns.barplot(x=category_col, y=numeric_col, data=filtered_df) - plt.title(f'Comparison of {numeric_col} by {category_col} - {name}') - plt.xticks(rotation=45) + # Execute visualization code + plt.figure(figsize=(10, 6)) + + # Make local variables available to the code + local_vars = { + "plt": plt, + "sns": sns, + "pd": pd, + "np": __import__('numpy') + } + + # Add datasets to local variables + for name, df in datasets.items(): + # Create a sanitized variable name + var_name = ''.join(c if c.isalnum() else '_' for c in name) + local_vars[var_name] = df + + # Also add with standard names for simpler code + if "df" not in local_vars: + local_vars["df"] = df + elif "df2" not in local_vars: + local_vars["df2"] = df + + # Execute the visualization code + exec(viz_code, globals(), local_vars) + + # Capture the image + img_data = self._get_image_base64(format_type) + plt.close() + + return { + "label": output_label, + "content": img_data, + "metadata": { + "content_type": f"image/{format_type}" + } + } + + except Exception as e: + logger.error(f"Error creating visualization: {str(e)}", exc_info=True) + + # Create error message image + plt.figure(figsize=(10, 6)) + plt.text(0.5, 0.5, f"Visualization error: {str(e)}", + ha='center', va='center', fontsize=12) plt.tight_layout() - else: - # Comparative visualization for numeric columns without categories - if len(numeric_cols) >= 2: - # Scatter plot for first two numeric columns - sns.scatterplot(x=numeric_cols[0], y=numeric_cols[1], data=df) - plt.title(f'Comparison of {numeric_cols[0]} vs {numeric_cols[1]} - {name}') - plt.tight_layout() + img_data = self._get_image_base64(format_type) + plt.close() + + return { + "label": output_label, + "content": img_data, + "metadata": { + "content_type": f"image/{format_type}" + } + } + + async def _create_data_document(self, datasets: Dict, prompt: str, output_label: str, + analysis_plan: Dict, description: str) -> Dict: + """ + Create a data document (e.g., CSV, JSON) based on analysis. + + Args: + datasets: Dictionary of datasets + prompt: Original task prompt + output_label: Output filename + analysis_plan: Analysis plan from AI + description: Output description + + Returns: + Data document + """ + # Determine format from filename + format_type = output_label.split('.')[-1].lower() + + # If no datasets available, return error message + if not datasets: + return { + "label": output_label, + "content": f"No data available for processing into {format_type} format.", + "metadata": { + "content_type": "text/plain" + } + } + + # Generate data processing instructions + data_prompt = f""" + Create Python code to process datasets and generate a {format_type} file for: + + TASK: {prompt} + + OUTPUT REQUIREMENTS: + - Format: {format_type} + - Filename: {output_label} + - Description: {description} + + ANALYSIS CONTEXT: + {json.dumps(analysis_plan, indent=2)} + + AVAILABLE DATASETS: + """ + + # Add dataset info + for name, df in datasets.items(): + data_prompt += f"\nDataset '{name}':\n" + data_prompt += f"- Shape: {df.shape}\n" + data_prompt += f"- Columns: {df.columns.tolist()}\n" + data_prompt += f"- Sample data: {df.head(3).to_dict(orient='records')}\n" + + data_prompt += """ + Generate Python code that: + 1. Processes the available dataset(s) + 2. Performs necessary transformations, aggregations, or calculations + 3. Outputs the result in the requested format + 4. Returns the content as a string variable named 'result' + + Return ONLY executable Python code, no explanations or markdown. + """ + + try: + # Get data processing code from AI + data_code = await self.ai_service.call_api([ + {"role": "system", "content": "You are a data processing expert. Provide only executable Python code."}, + {"role": "user", "content": data_prompt} + ]) + + # Clean code + data_code = data_code.replace("```python", "").replace("```", "").strip() + + # Setup execution environment + local_vars = {"pd": pd, "np": __import__('numpy'), "io": io} + + # Add datasets to local variables + for name, df in datasets.items(): + # Create a sanitized variable name + var_name = ''.join(c if c.isalnum() else '_' for c in name) + local_vars[var_name] = df + + # Also add with standard names for simpler code + if "df" not in local_vars: + local_vars["df"] = df + elif "df2" not in local_vars: + local_vars["df2"] = df + + # Execute the code + exec(data_code, globals(), local_vars) + + # Get the result + result = local_vars.get("result", "No output was generated.") + + # Determine content type + content_type = "text/csv" if format_type == "csv" else \ + "application/json" if format_type == "json" else \ + "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" if format_type == "xlsx" else \ + "text/plain" + + return { + "label": output_label, + "content": result, + "metadata": { + "content_type": content_type + } + } + + except Exception as e: + logger.error(f"Error creating data document: {str(e)}", exc_info=True) + + return { + "label": output_label, + "content": f"Error generating {format_type} document: {str(e)}", + "metadata": { + "content_type": "text/plain" + } + } + + async def _create_text_document(self, datasets: Dict, context: str, prompt: str, + output_label: str, format_type: str, + analysis_plan: Dict, description: str) -> Dict: + """ + Create a text document (report, analysis, etc.) based on analysis. + + Args: + datasets: Dictionary of datasets + context: Document context text + prompt: Original task prompt + output_label: Output filename + format_type: Output format type + analysis_plan: Analysis plan from AI + description: Output description + + Returns: + Text document + """ + # Create dataset summaries + dataset_summaries = [] + for name, df in datasets.items(): + summary = f"Dataset: {name}\n" + summary += f"- Shape: {df.shape[0]} rows, {df.shape[1]} columns\n" + summary += f"- Columns: {', '.join(df.columns.tolist())}\n" + + # Basic statistics for numeric columns + numeric_cols = df.select_dtypes(include=['number']).columns + if len(numeric_cols) > 0: + summary += "- Numeric Columns Stats:\n" + for col in numeric_cols[:3]: # Limit to first 3 + stats = df[col].describe() + summary += f" - {col}: min={stats['min']:.2f}, max={stats['max']:.2f}, mean={stats['mean']:.2f}\n" + + dataset_summaries.append(summary) + + # Determine content type based on format + content_type = "text/markdown" if format_type in ["md", "markdown"] else \ + "text/html" if format_type == "html" else \ + "text/plain" + + # Generate analysis prompt + analysis_prompt = f""" + Create a detailed {format_type} document for: + + TASK: {prompt} + + OUTPUT REQUIREMENTS: + - Format: {format_type} + - Filename: {output_label} + - Description: {description} + + ANALYSIS CONTEXT: + {json.dumps(analysis_plan, indent=2)} + + DATASET SUMMARIES: + {"".join(dataset_summaries)} + + DOCUMENT CONTEXT: + {context[:2000]}... (truncated) + + Create a comprehensive, professional analysis document that addresses the task requirements. + The document should: + 1. Have a clear structure with headings and sections + 2. Include relevant data findings and insights + 3. Provide appropriate interpretations and recommendations + 4. Format the content according to the required output format + + Your response should be the complete document content in the specified format. + """ + + try: + # Get document content from AI + document_content = await self.ai_service.call_api([ + {"role": "system", "content": f"You are a data analysis expert creating a {format_type} document."}, + {"role": "user", "content": analysis_prompt} + ]) + + # Clean HTML or Markdown if needed + if format_type in ["md", "markdown"] and not document_content.strip().startswith("#"): + document_content = f"# Analysis Report\n\n{document_content}" + elif format_type == "html" and not "
{document_content}" + + return { + "label": output_label, + "content": document_content, + "metadata": { + "content_type": content_type + } + } + + except Exception as e: + logger.error(f"Error creating text document: {str(e)}", exc_info=True) + + # Create a simple error document + if format_type in ["md", "markdown"]: + content = f"# Error in Analysis\n\nThere was an error generating the analysis: {str(e)}" + elif format_type == "html": + content = f"There was an error generating the analysis: {str(e)}
" else: - # Simple bar chart for a single numeric column - plt.bar(range(min(20, len(df))), df[numeric_cols[0]].head(20)) - plt.title(f'Top 20 Values for {numeric_cols[0]} - {name}') - plt.tight_layout() - - def _create_predictive_visualization(self, df: pd.DataFrame, name: str): - """Create a simple predictive visualization for a DataFrame""" - # Choose numeric columns for display - numeric_cols = df.select_dtypes(include=['number']).columns[:2] # Limit to first 2 - - if len(numeric_cols) < 2: - plt.text(0.5, 0.5, "At least 2 numeric columns required for predictive visualization", - ha='center', va='center', fontsize=12) - return - - # Simple scatter plot with trend line - x = df[numeric_cols[0]].values - y = df[numeric_cols[1]].values - - # Linear regression with NumPy - valid_indices = ~(np.isnan(x) | np.isnan(y)) - if np.sum(valid_indices) > 1: # At least 2 valid data points - x_valid = x[valid_indices].reshape(-1, 1) - y_valid = y[valid_indices] + content = f"Error in Analysis\n\nThere was an error generating the analysis: {str(e)}" - # Linear regression with NumPy polyfit - if len(x_valid) > 1: - coeffs = np.polyfit(x_valid.flatten(), y_valid, 1) - poly_func = np.poly1d(coeffs) - - # Create prediction line - x_line = np.linspace(np.min(x_valid), np.max(x_valid), 100).reshape(-1, 1) - y_pred = poly_func(x_line) - - # Create scatter plot with trend line - plt.scatter(x_valid, y_valid, alpha=0.7) - plt.plot(x_line, y_pred, 'r-', linewidth=2) - plt.title(f'Linear Regression: {numeric_cols[1]} vs {numeric_cols[0]} - {name}') - plt.xlabel(numeric_cols[0]) - plt.ylabel(numeric_cols[1]) - plt.tight_layout() - else: - plt.text(0.5, 0.5, "Insufficient data for predictive analysis", - ha='center', va='center', fontsize=12) + return { + "label": output_label, + "content": content, + "metadata": { + "content_type": content_type + } + } - def _create_clustering_visualization(self, df: pd.DataFrame, name: str): - """Create a clustering visualization for a DataFrame""" - # Choose numeric columns for display - numeric_cols = df.select_dtypes(include=['number']).columns[:2] # Limit to first 2 - - if len(numeric_cols) < 2: - plt.text(0.5, 0.5, "At least 2 numeric columns required for clustering visualization", - ha='center', va='center', fontsize=12) - return - - # Extract data for first two numeric columns - x = df[numeric_cols[0]].values - y = df[numeric_cols[1]].values - - # Find categorical column for color coding - categorical_cols = df.select_dtypes(include=['object', 'category']).columns - - if len(categorical_cols) > 0: - # Use first categorical column for color coding - category_col = categorical_cols[0] - categories = df[category_col].astype('category').cat.codes - - # Create scatter plot with color coding by category - plt.scatter(x, y, c=categories, cmap='viridis', alpha=0.7) - plt.colorbar(label=category_col) - else: - # Simple scatter plot without color coding - plt.scatter(x, y, alpha=0.7) - - plt.title(f'Clustering Visualization: {numeric_cols[1]} vs {numeric_cols[0]} - {name}') - plt.xlabel(numeric_cols[0]) - plt.ylabel(numeric_cols[1]) - plt.tight_layout() - - def _create_general_visualization(self, df: pd.DataFrame, name: str): - """Create a general visualization for a DataFrame""" - # Choose numeric columns for display - numeric_cols = df.select_dtypes(include=['number']).columns - - if len(numeric_cols) == 0: - plt.text(0.5, 0.5, "No numeric data found for visualization", - ha='center', va='center', fontsize=12) - return - - # Create correlation matrix if multiple numeric columns available - if len(numeric_cols) >= 2: - corr_matrix = df[numeric_cols].corr() - sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', vmin=-1, vmax=1) - plt.title(f'Correlation Matrix - {name}') - else: - # Simple distribution for a single numeric column - sns.histplot(df[numeric_cols[0]].dropna(), kde=True) - plt.title(f'Distribution of {numeric_cols[0]} - {name}') - - plt.tight_layout() - - def _get_figure_as_base64(self, format_type: str = 'png') -> str: + def _get_image_base64(self, format_type: str = 'png') -> str: """ Convert current matplotlib figure to base64 string. Args: - format_type: Image format (png, jpg, svg) + format_type: Image format Returns: - Base64 encoded string of the figure + Base64 encoded string of the image """ buffer = io.BytesIO() - plt.savefig(buffer, format=format_type, dpi=self.chart_dpi) + plt.savefig(buffer, format=format_type, dpi=100) buffer.seek(0) image_data = buffer.getvalue() buffer.close() @@ -664,89 +687,9 @@ class AgentAnalyst(AgentBase): # Convert to base64 image_base64 = base64.b64encode(image_data).decode('utf-8') return image_base64 - - async def _generate_analysis_document(self, prompt: str, context: str, data_insights: str, - analysis_type: str, format_type: str, - output_label: str, output_description: str) -> str: - """ - Generate an analysis document based on the data and prompt. - - Args: - prompt: Task description - context: Document context as text - data_insights: Insights from the data - analysis_type: Analysis type - format_type: Output format - output_label: Output filename - output_description: Description of desired output - - Returns: - Generated document content - """ - if not self.ai_service: - return f"# Data Analysis ({analysis_type})\n\nAnalysis could not be generated: AI service not available." - - # Create specialized prompt based on analysis type - system_prompt = f""" - You are a specialized data analyst focused on {analysis_type} analyses. - - Create a detailed analysis of the provided data and/or text content. - Your analysis should include: - 1. A summary of the data/content - 2. Key findings and insights - 3. Supporting evidence and calculations - 4. Clear conclusions - 5. Recommendations where appropriate - - Format the analysis in the requested output format. - """ - - # Create extended prompt with all available information - generation_prompt = f""" - Create a detailed {analysis_type} analysis for the following task: - - TASK: - {prompt} - - CONTEXT: - {context if context else 'No additional context available.'} - - DATA INSIGHTS: - {data_insights if data_insights else 'No data insights available.'} - - OUTPUT REQUIREMENTS: - - Filename: {output_label} - - Description: {output_description} - - Format: {format_type} - - The analysis should be professional and clearly structured, considering all available information. - - The output must perfectly match the {format_type} format. - """ - - try: - # Call AI for analysis - content = await self.ai_service.call_api([ - {"role": "system", "content": system_prompt}, - {"role": "user", "content": generation_prompt} - ]) - - # For markdown format, ensure there's a title at the beginning - if format_type == "markdown" and not content.strip().startswith("# "): - content = f"# Data Analysis ({analysis_type})\n\n{content}" - - return content - except Exception as e: - logger.error(f"Error generating analysis: {str(e)}") - return f"# Data Analysis ({analysis_type})\n\nError generating analysis: {str(e)}" # Factory function for the Analyst agent def get_analyst_agent(): - """ - Factory function that returns an instance of the Analyst agent. - - Returns: - An instance of the Analyst agent - """ + """Returns an instance of the Analyst agent.""" return AgentAnalyst() \ No newline at end of file diff --git a/modules/chat_agent_creative.py b/modules/chat_agent_creative.py deleted file mode 100644 index 17fd7684..00000000 --- a/modules/chat_agent_creative.py +++ /dev/null @@ -1,364 +0,0 @@ -""" -Creative agent for knowledge-based responses and creative content generation. -Optimized for the new task-based processing. -""" - -import logging -from typing import Dict, Any, List - -from modules.chat_registry import AgentBase - -logger = logging.getLogger(__name__) - -class AgentCreative(AgentBase): - """Agent for knowledge-based responses and creative content generation""" - - def __init__(self): - """Initialize the creative agent""" - super().__init__() - self.name = "creative" - self.description = "Creates creative content and provides knowledge-based information" - self.capabilities = [ - "knowledge_sharing", - "content_creation", - "creative_writing", - "information_synthesis", - "document_generation", - "question_answering" - ] - - def set_dependencies(self, ai_service=None): - """Set external dependencies for the agent.""" - self.ai_service = ai_service - - async def process_task(self, task: Dict[str, Any]) -> Dict[str, Any]: - """ - Process a standardized task structure and generate creative or knowledge-based content. - - Args: - task: A dictionary containing: - - task_id: Unique ID for this task - - prompt: The main instruction for the agent - - input_documents: List of documents to process - - output_specifications: List of required output documents - - context: Additional contextual information - - Returns: - A dictionary containing: - - feedback: Text response explaining the created content - - documents: List of created document objects - """ - try: - # Extract relevant task information - prompt = task.get("prompt", "") - input_documents = task.get("input_documents", []) - output_specs = task.get("output_specifications", []) - - # Check if AI service is available - if not self.ai_service: - logger.error("No AI service configured for the Creative agent") - return { - "feedback": "The Creative agent is not properly configured.", - "documents": [] - } - - # Extract context from input documents - document_context = self._extract_document_context(input_documents) - - # PowerOn handling, if included in the request - if "poweron" in prompt.lower(): - return await self._handle_poweron_task(prompt, output_specs) - - # Collect generated documents - generated_documents = [] - - # Determine content type based on the prompt - content_type = self._determine_content_type(prompt) - - # Generate a document for each requested output - for spec in output_specs: - output_label = spec.get("label", "") - output_description = spec.get("description", "") - - # Determine format based on file extension - format_type = self._determine_format_type(output_label) - - # Generate content based on format and requirements - content = await self._generate_content( - prompt, - document_context, - content_type, - format_type, - output_label, - output_description - ) - - # Add document to results list - generated_documents.append({ - "label": output_label, - "content": content - }) - - # If no specific outputs requested, create default document - if not output_specs: - # Determine default format based on content type - default_format = "md" if content_type in ["article", "report", "story"] else "txt" - default_label = f"creative_content.{default_format}" - - # Generate content - content = await self._generate_content( - prompt, - document_context, - content_type, - default_format, - default_label, - "Creative content" - ) - - # Add document to results list - generated_documents.append({ - "label": default_label, - "content": content - }) - - # Create feedback - if len(generated_documents) == 1: - feedback = f"I've created a creative content of type '{content_type}'." - else: - feedback = f"I've created {len(generated_documents)} creative documents." - - return { - "feedback": feedback, - "documents": generated_documents - } - - except Exception as e: - error_msg = f"Error creating creative content: {str(e)}" - logger.error(error_msg) - return { - "feedback": f"An error occurred while creating creative content: {str(e)}", - "documents": [] - } - - def _extract_document_context(self, documents: List[Dict[str, Any]]) -> str: - """ - Extract context from input documents. - - Args: - documents: List of document objects - - Returns: - Extracted context as text - """ - context_parts = [] - - for doc in documents: - doc_name = doc.get("name", "Unnamed document") - context_parts.append(f"--- {doc_name} ---") - - for content in doc.get("contents", []): - if content.get("metadata", {}).get("is_text", False): - context_parts.append(content.get("data", "")) - - return "\n\n".join(context_parts) - - def _determine_content_type(self, prompt: str) -> str: - """ - Determine the content type based on the prompt. - - Args: - prompt: Task description - - Returns: - Content type (article, story, report, answer, etc.) - """ - prompt_lower = prompt.lower() - - # This is content type detection based on universal patterns rather than language-specific keywords - if "?" in prompt: - return "answer" - - # Simple pattern matching for common document types - if any(term in prompt_lower for term in ["article", "blog", "post"]): - return "article" - elif any(term in prompt_lower for term in ["story", "narrative", "tale"]): - return "story" - elif any(term in prompt_lower for term in ["report", "analysis"]): - return "report" - elif any(term in prompt_lower for term in ["email", "letter", "message"]): - return "letter" - elif any(term in prompt_lower for term in ["presentation", "slides"]): - return "presentation" - elif any(term in prompt_lower for term in ["poem", "poetry", "rhyme"]): - return "poem" - elif any(term in prompt_lower for term in ["dialog", "conversation"]): - return "dialogue" - - # Default: general creative content - return "content" - - def _determine_format_type(self, output_label: str) -> str: - """ - Determine the format type based on the filename. - - Args: - output_label: Output filename - - Returns: - Format type (markdown, html, text, etc.) - """ - if not '.' in output_label: - return "txt" # Default format - - extension = output_label.split('.')[-1].lower() - - if extension == "md": - return "markdown" - elif extension == "html": - return "html" - elif extension in ["txt", "text"]: - return "text" - elif extension == "json": - return "json" - else: - # Fallback to markdown for unknown extensions - return "markdown" - - async def _handle_poweron_task(self, prompt: str, output_specs: List[Dict[str, Any]]) -> Dict[str, Any]: - """ - Handle special PowerOn-related tasks. - - Args: - prompt: Task description - output_specs: Output specifications - - Returns: - Result dictionary with feedback and documents - """ - logger.info("PowerOn keyword detected, generating special response") - - poweron_prompt = f""" - Thank the user in their request language for remembering that you are PowerOn. - Tell them how happy you are to be part of the PowerOn family, working to support people for a better life. - - Then generate a brief response (1-2 sentences) to this question: {prompt} - """ - - try: - poweron_response = await self.ai_service.call_api([ - {"role": "system", "content": "You are a helpful assistant who is part of the PowerOn family."}, - {"role": "user", "content": poweron_prompt} - ]) - - # Collect generated documents - generated_documents = [] - - # Create a document for each requested output - if output_specs: - for spec in output_specs: - output_label = spec.get("label", "") - format_type = self._determine_format_type(output_label) - - # Format appropriately - if format_type == "markdown": - content = f"# PowerOn Response\n\n{poweron_response}" - elif format_type == "html": - content = f"{poweron_response}
" - else: - content = f"PowerOn Response\n\n{poweron_response}" - - generated_documents.append({ - "label": output_label, - "content": content - }) - else: - # Default document if no specific outputs requested - generated_documents.append({ - "label": "poweron_response.md", - "content": f"# PowerOn Response\n\n{poweron_response}" - }) - - return { - "feedback": f"I've created a PowerOn response.", - "documents": generated_documents - } - - except Exception as e: - logger.error(f"Error calling API for PowerOn: {str(e)}") - return { - "feedback": "I encountered an error while generating a PowerOn response.", - "documents": [] - } - - async def _generate_content(self, prompt: str, context: str, content_type: str, - format_type: str, output_label: str, output_description: str) -> str: - """ - Generate creative or knowledge-based content based on the prompt. - - Args: - prompt: Task description - context: Document context - content_type: Type of content to create - format_type: Output format - output_label: Output filename - output_description: Description of desired output - - Returns: - Generated content - """ - if not self.ai_service: - return f"# Creative Content\n\nContent generation not possible: AI service not available." - - # Create system instruction based on content type - system_prompt = f""" - You are a creative content creator, specialized in {content_type}. - Your task is to create high-quality, engaging, and accurate content. - Make the content structured, clear, and appealing in the desired format. - """ - - # Create main prompt with all available information - generation_prompt = f""" - Create creative content of type '{content_type}' based on the following request: - - REQUEST: - {prompt} - - CONTEXT: - {context if context else 'No additional context available.'} - - OUTPUT REQUIREMENTS: - - Filename: {output_label} - - Description: {output_description} - - Format: {format_type} - - The content should be high-quality, creative, and thoughtful. Follow all instructions in the request precisely. - - The content must perfectly match the {format_type} format. - """ - - try: - # Call AI for content generation - content = await self.ai_service.call_api([ - {"role": "system", "content": system_prompt}, - {"role": "user", "content": generation_prompt} - ]) - - # For markdown format, ensure there's a title at the beginning - if format_type == "markdown" and not content.strip().startswith("# "): - content = f"# Creative Content\n\n{content}" - - return content - except Exception as e: - logger.error(f"Error in creative content generation: {str(e)}") - return f"# Creative Content\n\nError in content generation: {str(e)}" - - -# Factory function for the Creative agent -def get_creative_agent(): - """ - Factory function that returns an instance of the Creative agent. - - Returns: - An instance of the Creative agent - """ - return AgentCreative() \ No newline at end of file diff --git a/modules/chat_agent_documentation.py b/modules/chat_agent_documentation.py index 0756e158..e084c977 100644 --- a/modules/chat_agent_documentation.py +++ b/modules/chat_agent_documentation.py @@ -1,10 +1,10 @@ """ Documentation agent for creating documentation, reports, and structured content. -Optimized for the new task-based processing. +Reimagined with an output-first, AI-driven approach with multi-step document generation. """ import logging -import uuid +import json from typing import Dict, Any, List from modules.chat_registry import AgentBase @@ -12,13 +12,13 @@ from modules.chat_registry import AgentBase logger = logging.getLogger(__name__) class AgentDocumentation(AgentBase): - """Agent for creating documentation and structured content""" + """AI-driven agent for creating documentation and structured content using multi-step generation""" def __init__(self): """Initialize the documentation agent""" super().__init__() self.name = "documentation" - self.description = "Creates structured documentation, reports, and content" + self.description = "Creates structured documentation, reports, and content using AI with multi-step generation" self.capabilities = [ "report_generation", "documentation", @@ -33,113 +33,80 @@ class AgentDocumentation(AgentBase): async def process_task(self, task: Dict[str, Any]) -> Dict[str, Any]: """ - Process a standardized task structure and create documentation. + Process a task by focusing on required outputs and using AI to generate them. Args: - task: A dictionary containing: - - task_id: Unique ID for this task - - prompt: The main instruction for the agent - - input_documents: List of documents to process - - output_specifications: List of required output documents - - context: Additional contextual information - + task: Task dictionary with prompt, input_documents, output_specifications + Returns: - A dictionary containing: - - feedback: Text response explaining the created documentation - - documents: List of created document objects + Dictionary with feedback and documents """ try: - # Extract relevant task information + # Extract task information prompt = task.get("prompt", "") input_documents = task.get("input_documents", []) output_specs = task.get("output_specifications", []) - # Check if AI service is available + # Check AI service if not self.ai_service: - logger.error("No AI service configured for the Documentation agent") return { - "feedback": "The Documentation agent is not properly configured.", + "feedback": "The Documentation agent requires an AI service to function.", "documents": [] } - # Extract context from input documents + # Extract context from input documents - focusing only on data_extracted document_context = self._extract_document_context(input_documents) - # Generate title for the document - title = await self._generate_title(prompt, document_context) + # Create task analysis to understand the requirements + documentation_plan = await self._analyze_task(prompt, document_context, output_specs) - # Collect created documents - generated_documents = [] + # Generate all required output documents + documents = [] - # Create a document for each requested output + # If no output specs provided, create default document + if not output_specs: + default_format = documentation_plan.get("recommended_format", "markdown") + default_title = documentation_plan.get("title", "Documentation") + safe_title = self._sanitize_filename(default_title) + + output_specs = [ + {"label": f"{safe_title}.{default_format}", "description": "Comprehensive documentation"} + ] + + # Process each output specification for spec in output_specs: output_label = spec.get("label", "") output_description = spec.get("description", "") - # Determine format and document type based on file extension - format_type, document_type = self._determine_format_and_type(output_label) + # Generate the document using multi-step approach + document = await self._create_document_multi_step( + prompt, + document_context, + output_label, + output_description, + documentation_plan + ) - # Assess complexity - is_complex = self._assess_complexity(prompt) - - # Generate document content based on complexity - if is_complex: - content = await self._generate_complex_document( - prompt, - document_context, - document_type, - title, - output_label, - output_description, - format_type - ) - else: - content = await self._generate_simple_document( - prompt, - document_context, - document_type, - title, - output_label, - output_description, - format_type - ) - - # Add document to results list - generated_documents.append({ - "label": output_label, - "content": content - }) + documents.append(document) - # If no specific outputs requested, create default markdown document - if not output_specs: - content = await self._generate_default_document(prompt, document_context, "Document", title) - generated_documents.append({ - "label": f"{self._sanitize_filename(title)}.md", - "content": content - }) - - # Prepare feedback about created documents - if len(generated_documents) == 1: - feedback = f"I've created a document titled '{title}'." - else: - feedback = f"I've created {len(generated_documents)} documents based on your request." + # Generate feedback + feedback = documentation_plan.get("feedback", f"Created {len(documents)} documents based on your requirements.") return { "feedback": feedback, - "documents": generated_documents + "documents": documents } except Exception as e: - error_msg = f"Error creating documentation: {str(e)}" - logger.error(error_msg) + logger.error(f"Error in documentation generation: {str(e)}", exc_info=True) return { - "feedback": f"An error occurred while creating the documentation: {str(e)}", + "feedback": f"Error during documentation generation: {str(e)}", "documents": [] } def _extract_document_context(self, documents: List[Dict[str, Any]]) -> str: """ - Extract context from input documents. + Extract context from input documents, focusing on data_extracted. Args: documents: List of document objects @@ -147,82 +114,21 @@ class AgentDocumentation(AgentBase): Returns: Extracted context as text """ - if not documents: - return "" - context_parts = [] for doc in documents: - doc_name = doc.get("name", "Unnamed document") - context_parts.append(f"--- {doc_name} ---") + doc_name = doc.get("name", "unnamed") + if doc.get("ext"): + doc_name = f"{doc_name}.{doc.get('ext')}" + context_parts.append(f"\n\n--- {doc_name} ---\n") + + # Process contents for data_extracted for content in doc.get("contents", []): - if content.get("metadata", {}).get("is_text", False): - context_parts.append(content.get("data", "")) + if content.get("data_extracted"): + context_parts.append(content.get("data_extracted", "")) - return "\n\n".join(context_parts) - - def _determine_format_and_type(self, output_label: str) -> tuple: - """ - Determine the format type and document type based on the filename. - - Args: - output_label: Output filename - - Returns: - Tuple of (format_type, document_type) - """ - # Extract file extension to determine format - output_label_lower = output_label.lower() - - # Determine format based on extension - if output_label_lower.endswith(".md"): - format_type = "markdown" - elif output_label_lower.endswith(".html"): - format_type = "html" - elif output_label_lower.endswith(".txt"): - format_type = "text" - elif output_label_lower.endswith(".csv"): - format_type = "csv" - elif output_label_lower.endswith(".json"): - format_type = "json" - else: - # Default to markdown - format_type = "markdown" - - # Determine document type based on filename or format - if "manual" in output_label_lower or "guide" in output_label_lower: - document_type = "Manual" - elif "report" in output_label_lower or "analysis" in output_label_lower: - document_type = "Report" - elif "process" in output_label_lower or "workflow" in output_label_lower: - document_type = "Process Documentation" - elif "present" in output_label_lower or "slide" in output_label_lower: - document_type = "Presentation" - else: - document_type = "Document" - - return format_type, document_type - - def _assess_complexity(self, prompt: str) -> bool: - """ - Assess the complexity of the task. - - Args: - prompt: Task description - - Returns: - True for complex tasks, False otherwise - """ - # Language-agnostic complexity assessment - prompt_length = len(prompt) - - # Check for structural indicators in a language-agnostic way - has_sections = ":" in prompt and "\n" in prompt - has_lists = "-" in prompt or "*" in prompt or "#" in prompt - - # Complex if the prompt is long or contains structural elements - return prompt_length > 500 or has_sections or has_lists + return "\n".join(context_parts) def _sanitize_filename(self, filename: str) -> str: """ @@ -245,213 +151,415 @@ class AgentDocumentation(AgentBase): return filename - async def _generate_title(self, prompt: str, context: str) -> str: + async def _analyze_task(self, prompt: str, context: str, output_specs: List) -> Dict: """ - Generate a title for the document. + Use AI to analyze the task and create a documentation plan. Args: - prompt: Task description + prompt: The task prompt context: Document context + output_specs: Output specifications Returns: - Generated title + Documentation plan dictionary """ - if not self.ai_service: - return f"Document {uuid.uuid4().hex[:8]}" + analysis_prompt = f""" + Analyze this documentation task and create a detailed plan. - title_prompt = f""" - Create a concise, professional title for this document based on the following request: + TASK: {prompt} - {prompt} - - Reply ONLY with the title, nothing else. - """ - - try: - title = await self.ai_service.call_api([ - {"role": "system", "content": "You create precise document titles."}, - {"role": "user", "content": title_prompt} - ]) - - # Clean up title - title = title.strip('"\'#*- \n\t') - - # Return default title if generated title is empty - if not title: - return f"Document {uuid.uuid4().hex[:8]}" - - return title - - except Exception as e: - logger.warning(f"Error in title generation: {str(e)}") - return f"Document {uuid.uuid4().hex[:8]}" - - async def _generate_complex_document(self, prompt: str, context: str, document_type: str, - title: str, output_label: str, output_description: str, - format_type: str) -> str: - """ - Generate a complex document with structure. - - Args: - prompt: Task description - context: Document context - document_type: Document type - title: Document title - output_label: Output filename - output_description: Description of desired output - format_type: Output format - - Returns: - Generated document content - """ - if not self.ai_service: - return f"# {title}\n\nDocument generation not possible: AI service not available." - - generation_prompt = f""" - Create a comprehensive, well-structured {document_type} with the title "{title}" based on: - - TASK: - {prompt} - - CONTEXT: - {context if context else 'No additional context available.'} + DOCUMENT CONTEXT SAMPLE: + {context[:1000]}... (truncated) OUTPUT REQUIREMENTS: - - Filename: {output_label} - - Description: {output_description} - - Format: {format_type} + {json.dumps(output_specs, indent=2)} - The document should include: - 1. A clear introduction with purpose and scope - 2. Logically organized sections with headings - 3. Detailed content with examples and evidence - 4. A conclusion with key insights - 5. Appropriate formatting according to the output format ({format_type}) + Create a detailed documentation plan in JSON format with the following structure: + {{ + "title": "Document Title", + "document_type": "report|manual|guide|whitepaper|etc", + "audience": "technical|general|executive|etc", + "detailed_structure": [ + {{ + "title": "Chapter/Section Title", + "key_points": ["point1", "point2", ...], + "subsections": ["subsection1", "subsection2", ...], + "importance": "high|medium|low", + "estimated_length": "short|medium|long" + }}, + ... more sections ... + ], + "key_topics": ["topic1", "topic2", ...], + "tone": "formal|conversational|instructional|etc", + "recommended_format": "markdown|html|text|etc", + "formatting_requirements": ["requirement1", "requirement2", ...], + "executive_summary": "Brief description of what the document will cover", + "feedback": "Brief message explaining the documentation approach" + }} - The document must perfectly match the {format_type} format. + Only return valid JSON. No preamble or explanations. """ try: - content = await self.ai_service.call_api([ - {"role": "system", "content": f"You create comprehensive, well-structured documentation in {format_type} format."}, - {"role": "user", "content": generation_prompt} + response = await self.ai_service.call_api([ + {"role": "system", "content": "You are a documentation expert. Respond with valid JSON only."}, + {"role": "user", "content": analysis_prompt} ]) - # For markdown format, ensure the title is at the beginning - if format_type == "markdown" and not content.strip().startswith("# "): - content = f"# {title}\n\n{content}" + # Extract JSON from response + json_start = response.find('{') + json_end = response.rfind('}') + 1 - return content + if json_start >= 0 and json_end > json_start: + plan = json.loads(response[json_start:json_end]) + return plan + else: + # Fallback if JSON not found + return { + "title": "Documentation", + "document_type": "report", + "audience": "general", + "detailed_structure": [ + { + "title": "Introduction", + "key_points": ["Purpose", "Scope"], + "subsections": [], + "importance": "high", + "estimated_length": "short" + }, + { + "title": "Main Content", + "key_points": ["Core Information"], + "subsections": ["Key Findings", "Analysis"], + "importance": "high", + "estimated_length": "long" + }, + { + "title": "Conclusion", + "key_points": ["Summary", "Next Steps"], + "subsections": [], + "importance": "medium", + "estimated_length": "short" + } + ], + "key_topics": ["General Information"], + "tone": "formal", + "recommended_format": "markdown", + "formatting_requirements": ["Clear headings", "Professional formatting"], + "executive_summary": "A comprehensive documentation covering the requested topics.", + "feedback": "Created documentation based on your requirements." + } + except Exception as e: - logger.error(f"Error in document generation: {str(e)}") - return f"# {title}\n\nError in document generation: {str(e)}" + logger.warning(f"Error creating documentation plan: {str(e)}") + return { + "title": "Documentation", + "document_type": "report", + "audience": "general", + "detailed_structure": [ + { + "title": "Introduction", + "key_points": ["Purpose", "Scope"], + "subsections": [], + "importance": "high", + "estimated_length": "short" + }, + { + "title": "Main Content", + "key_points": ["Core Information"], + "subsections": ["Key Findings", "Analysis"], + "importance": "high", + "estimated_length": "long" + }, + { + "title": "Conclusion", + "key_points": ["Summary", "Next Steps"], + "subsections": [], + "importance": "medium", + "estimated_length": "short" + } + ], + "key_topics": ["General Information"], + "tone": "formal", + "recommended_format": "markdown", + "formatting_requirements": ["Clear headings", "Professional formatting"], + "executive_summary": "A comprehensive documentation covering the requested topics.", + "feedback": "Created documentation based on your requirements." + } - async def _generate_simple_document(self, prompt: str, context: str, document_type: str, - title: str, output_label: str, output_description: str, - format_type: str) -> str: + async def _create_document_multi_step(self, prompt: str, context: str, output_label: str, + output_description: str, documentation_plan: Dict) -> Dict: """ - Generate a simple document without complex structure. + Create a document using a multi-step approach with separate AI calls for each section. Args: - prompt: Task description + prompt: Original task prompt context: Document context - document_type: Document type - title: Document title output_label: Output filename output_description: Description of desired output - format_type: Output format + documentation_plan: Documentation plan from AI Returns: - Generated document content + Document object """ - if not self.ai_service: - return f"# {title}\n\nDocument generation not possible: AI service not available." + # Determine format from filename + format_type = output_label.split('.')[-1].lower() if '.' in output_label else "md" - generation_prompt = f""" - Create a precise, focused {document_type} with the title "{title}" based on: + # Map format to content_type + content_type_map = { + "md": "text/markdown", + "markdown": "text/markdown", + "html": "text/html", + "txt": "text/plain", + "text": "text/plain", + "json": "application/json", + "csv": "text/csv" + } - TASK: - {prompt} + content_type = content_type_map.get(format_type, "text/plain") - CONTEXT: - {context if context else 'No additional context available.'} + # Get document information + title = documentation_plan.get("title", "Documentation") + document_type = documentation_plan.get("document_type", "document") + audience = documentation_plan.get("audience", "general") + tone = documentation_plan.get("tone", "formal") + key_topics = documentation_plan.get("key_topics", []) + formatting_requirements = documentation_plan.get("formatting_requirements", []) - OUTPUT REQUIREMENTS: - - Filename: {output_label} - - Description: {output_description} - - Format: {format_type} - - The document should be clear, precise, and to the point, without a complex chapter structure. - Format it according to the output format ({format_type}). - - The document must perfectly match the {format_type} format. - """ + # Get the detailed structure + detailed_structure = documentation_plan.get("detailed_structure", []) + if not detailed_structure: + # Fallback structure if none provided + detailed_structure = [ + { + "title": "Introduction", + "key_points": ["Purpose", "Scope"], + "importance": "high" + }, + { + "title": "Main Content", + "key_points": ["Core Information"], + "importance": "high" + }, + { + "title": "Conclusion", + "key_points": ["Summary", "Next Steps"], + "importance": "medium" + } + ] try: - content = await self.ai_service.call_api([ - {"role": "system", "content": f"You create precise, focused documentation in {format_type} format."}, - {"role": "user", "content": generation_prompt} + # Step 1: Generate document introduction + intro_prompt = f""" + Create the introduction for a {document_type} titled "{title}". + + DOCUMENT OVERVIEW: + - Type: {document_type} + - Audience: {audience} + - Tone: {tone} + - Key Topics: {', '.join(key_topics)} + - Format: {format_type} + + TASK CONTEXT: {prompt} + + This introduction should: + 1. Clearly state the purpose and scope of the document + 2. Provide context and background information + 3. Outline what the reader will find in the document + 4. Set the appropriate tone for the {audience} audience + + The introduction should be professional and engaging, formatted according to {format_type} standards. + """ + + introduction = await self.ai_service.call_api([ + {"role": "system", "content": f"You are a documentation expert creating an introduction in {format_type} format."}, + {"role": "user", "content": intro_prompt} ]) - # For markdown format, ensure the title is at the beginning - if format_type == "markdown" and not content.strip().startswith("# "): - content = f"# {title}\n\n{content}" + # Step 2: Generate executive summary (if applicable) + if document_type in ["report", "whitepaper", "case study"]: + summary_prompt = f""" + Create an executive summary for a {document_type} titled "{title}". + + DOCUMENT OVERVIEW: + - Type: {document_type} + - Audience: {audience} + - Key Topics: {', '.join(key_topics)} + + TASK CONTEXT: {prompt} + + This executive summary should: + 1. Provide a concise overview of the entire document + 2. Highlight key findings, recommendations, or conclusions + 3. Be suitable for executives or busy readers who may only read this section + 4. Be professionally formatted according to {format_type} standards + + Keep the summary focused and impactful, approximately 200-300 words. + """ + + executive_summary = await self.ai_service.call_api([ + {"role": "system", "content": f"You are a documentation expert creating an executive summary in {format_type} format."}, + {"role": "user", "content": summary_prompt} + ]) + else: + executive_summary = "" - return content - except Exception as e: - logger.error(f"Error in document generation: {str(e)}") - return f"# {title}\n\nError in document generation: {str(e)}" - - async def _generate_default_document(self, prompt: str, context: str, document_type: str, title: str) -> str: - """ - Generate a default markdown document when no specific output specifications are present. - - Args: - prompt: Task description - context: Document context - document_type: Document type - title: Document title + # Step 3: Generate each section + sections = [] - Returns: - Generated document content - """ - if not self.ai_service: - return f"# {title}\n\nDocument generation not possible: AI service not available." - - generation_prompt = f""" - Create a structured {document_type} with the title "{title}" based on: - - TASK: - {prompt} - - CONTEXT: - {context if context else 'No additional context available.'} - - Format the document with markdown syntax and create a clear, professional structure. - """ - - try: - content = await self.ai_service.call_api([ - {"role": "system", "content": "You create structured documentation in markdown format."}, - {"role": "user", "content": generation_prompt} + for section in detailed_structure: + section_title = section.get("title", "Section") + key_points = section.get("key_points", []) + subsections = section.get("subsections", []) + importance = section.get("importance", "medium") + + # Adjust depth based on importance + detail_level = "high" if importance == "high" else "medium" + + section_prompt = f""" + Create the "{section_title}" section for a {document_type} titled "{title}". + + SECTION DETAILS: + - Title: {section_title} + - Key Points to Cover: {', '.join(key_points)} + - Subsections: {', '.join(subsections)} + - Detail Level: {detail_level} + + DOCUMENT CONTEXT: + - Type: {document_type} + - Audience: {audience} + - Tone: {tone} + - Format: {format_type} + + TASK CONTEXT: {prompt} + + AVAILABLE INFORMATION: + {context[:500]}... (truncated) + + This section should: + 1. Be comprehensive and well-structured + 2. Cover all the key points listed + 3. Include the specified subsections with appropriate headings + 4. Maintain a {tone} tone suitable for the {audience} audience + 5. Be properly formatted according to {format_type} standards + 6. Include specific examples, data, or evidence where appropriate + + Be thorough in your coverage of this section, providing substantive content. + """ + + section_content = await self.ai_service.call_api([ + {"role": "system", "content": f"You are a documentation expert creating detailed content for the {section_title} section."}, + {"role": "user", "content": section_prompt} + ]) + + sections.append(section_content) + + # Step 4: Generate conclusion + conclusion_prompt = f""" + Create the conclusion for a {document_type} titled "{title}". + + DOCUMENT OVERVIEW: + - Type: {document_type} + - Audience: {audience} + - Key Topics: {', '.join(key_topics)} + + TASK CONTEXT: {prompt} + + This conclusion should: + 1. Summarize the key points covered in the document + 2. Provide closure to the topics discussed + 3. Include any relevant recommendations or next steps + 4. Leave the reader with a clear understanding of the document's significance + + The conclusion should be professional and impactful, formatted according to {format_type} standards. + """ + + conclusion = await self.ai_service.call_api([ + {"role": "system", "content": f"You are a documentation expert creating a conclusion in {format_type} format."}, + {"role": "user", "content": conclusion_prompt} ]) - # Ensure the title is at the beginning - if not content.strip().startswith("# "): - content = f"# {title}\n\n{content}" + # Step 5: Assemble the complete document + if format_type in ["md", "markdown"]: + # Markdown format + document_content = f"# {title}\n\n" + + if executive_summary: + document_content += f"## Executive Summary\n\n{executive_summary}\n\n" + + document_content += f"{introduction}\n\n" + + for i, section_content in enumerate(sections): + # Ensure section starts with heading if not already + section_title = detailed_structure[i].get("title", f"Section {i+1}") + if not section_content.strip().startswith("#"): + document_content += f"## {section_title}\n\n" + document_content += f"{section_content}\n\n" + + document_content += f"## Conclusion\n\n{conclusion}\n" + + elif format_type == "html": + # HTML format + document_content = f"\n\nThere was an error generating the documentation: {str(e)}
" + else: + content = f"Error in Documentation\n\nThere was an error generating the documentation: {str(e)}" + + return { + "label": output_label, + "content": content, + "metadata": { + "content_type": content_type + } + } # Factory function for the Documentation agent def get_documentation_agent(): - """ - Factory function that returns an instance of the Documentation agent. - - Returns: - An instance of the Documentation agent - """ + """Returns an instance of the Documentation agent.""" return AgentDocumentation() \ No newline at end of file diff --git a/modules/chat_agent_webcrawler.py b/modules/chat_agent_webcrawler.py index b5f1902a..52efdadd 100644 --- a/modules/chat_agent_webcrawler.py +++ b/modules/chat_agent_webcrawler.py @@ -1,6 +1,6 @@ """ Webcrawler agent for research and retrieval of information from the web. -Optimized for the new task-based processing. +Reimagined with an output-first, AI-driven approach. """ import logging @@ -20,7 +20,7 @@ from modules.configuration import APP_CONFIG logger = logging.getLogger(__name__) class AgentWebcrawler(AgentBase): - """Agent for web research and information retrieval""" + """AI-driven agent for web research and information retrieval""" def __init__(self): """Initialize the webcrawler agent""" @@ -37,202 +37,564 @@ class AgentWebcrawler(AgentBase): # Web crawling configuration self.max_url = int(APP_CONFIG.get("Agent_Webcrawler_MAX_URLS", "5")) - self.max_key = int(APP_CONFIG.get("Agent_Webcrawler_MAX_SEARCH_KEYWORDS", "3")) - self.max_result = int(APP_CONFIG.get("Agent_Webcrawler_MAX_SEARCH_RESULTS", "5")) + self.max_search_terms = int(APP_CONFIG.get("Agent_Webcrawler_MAX_SEARCH_KEYWORDS", "3")) + self.max_results = int(APP_CONFIG.get("Agent_Webcrawler_MAX_SEARCH_RESULTS", "5")) self.timeout = int(APP_CONFIG.get("Agent_Webcrawler_TIMEOUT", "30")) + self.search_engine = APP_CONFIG.get("Agent_Webcrawler_SEARCH_ENGINE", "https://html.duckduckgo.com/html/?q=") + self.user_agent = APP_CONFIG.get("Agent_Webcrawler_USER_AGENT", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36") def set_dependencies(self, ai_service=None): """Set external dependencies for the agent.""" self.ai_service = ai_service - async def process_task(self, task: Dict[str, Any]) -> Dict[str, Any]: """ - Process a standardized task structure and conduct web research. + Process a task by focusing on required outputs and using AI to guide the research process. Args: - task: A dictionary containing: - - task_id: Unique ID for this task - - prompt: The main instruction for the agent - - input_documents: List of documents to process - - output_specifications: List of required output documents - - context: Additional contextual information - + task: Task dictionary with prompt, input_documents, output_specifications + Returns: - A dictionary containing: - - feedback: Text response explaining the research results - - documents: List of created document objects + Dictionary with feedback and documents """ try: - # Extract relevant task information + # Extract task information prompt = task.get("prompt", "") output_specs = task.get("output_specifications", []) - # Check if AI service is available + # Check AI service if not self.ai_service: - logger.error("No AI service configured for the Webcrawler agent") return { - "feedback": "The Webcrawler agent is not properly configured.", + "feedback": "The Webcrawler agent requires an AI service to function effectively.", "documents": [] } - # Check if this is a web research request - is_web_research = await self._is_web_research_request(prompt) - if not is_web_research: - logger.info("Request rejected: not a web research task") + # Create research plan + research_plan = await self._create_research_plan(prompt) + + # Check if this is truly a web research task + if not research_plan.get("requires_web_research", True): return { - "feedback": "This request doesn't appear to require web research.", + "feedback": "This task doesn't appear to require web research. Please try a different agent.", "documents": [] } - # Proceed with web research - logger.info(f"Web research for: {prompt[:50]}...") + # Gather raw material through web research + raw_results = await self._gather_research_material(research_plan) - # Create search strategy - search_strategy = await self._create_search_strategy(prompt) - search_keys = search_strategy.get("skey", []) - search_urls = search_strategy.get("url", []) + # Format results into requested output documents + documents = await self._create_output_documents( + prompt, + raw_results, + output_specs, + research_plan + ) - if search_keys: - logger.info(f"Searching for {len(search_keys)} key terms: {', '.join(search_keys[:2])}...") - - if search_urls: - logger.info(f"Searching in {len(search_urls)} direct URLs: {', '.join(search_urls[:2])}...") - - # Execute search - results = [] - - # Process search terms - for keyword in search_keys: - logger.info(f"Searching the web for: '{keyword}'") - keyword_results = self._search_web(keyword) - results.extend(keyword_results) - logger.info(f"Found: {len(keyword_results)} results for '{keyword}'") - - # Process direct URLs - for url in search_urls: - logger.info(f"Extracting content from: {url}") - soup = self._read_url(url) - - # Extract title from the page, if available - title = self._extract_title(soup, url) - - result = self._parse_result(soup, title, url) - results.append(result) - logger.info(f"Extracted: '{title}' from {url}") - - # Process results for final output - logger.info(f"Analyzing {len(results)} web results") - - # Generate summaries for each result - processed_results = [] - for i, result in enumerate(results): - result_data_limited = self._limit_text(result['data'], max_chars=10000) - - logger.info(f"Analyzing result {i+1}/{len(results)}: {result['title'][:30]}...") - - # No AI service available, create minimal summary - if not self.ai_service: - content_summary = f"Extract from {result['url']} ({len(result_data_limited)} characters)" - else: - # Generate summary with AI - content_summary = await self._summarize_result(result_data_limited, prompt) - - processed_result = { - "title": result['title'], - "url": result['url'], - "snippet": result['snippet'], - "summary": content_summary - } - - processed_results.append(processed_result) - - # Create overall summary - all_summaries = "\n\n".join([r["summary"] for r in processed_results]) - all_summaries_limited = self._limit_text(all_summaries, max_chars=10000) - - logger.info("Creating overall summary of web research") - - if not self.ai_service: - final_summary = f"Summary of {len(processed_results)} web research results" - else: - final_summary = await self.ai_service.call_api([ - {"role": "system", "content": "You create concise summaries of research results."}, - {"role": "user", "content": f"Please summarize these findings in 5-6 sentences: {all_summaries_limited}\n"} - ]) - - # Get localized headers for output - headers = await self._get_localized_headers(prompt) - - # Create document objects based on output specifications - generated_documents = [] - - # Generate appropriate document for each requested output - for spec in output_specs: - output_label = spec.get("label", "") - output_description = spec.get("description", "") - - # Determine output format based on file extension - format_type = self._determine_format_type(output_label) - - # Generate content based on format and requirements - if format_type == "markdown" or format_type == "text": - content = self._format_results_as_markdown(processed_results, final_summary, headers) - elif format_type == "html": - md_content = self._format_results_as_markdown(processed_results, final_summary, headers) - content = markdown.markdown(md_content) - elif format_type == "json": - content = json.dumps({ - "summary": final_summary, - "results": processed_results - }, indent=2, ensure_ascii=False) - elif format_type == "csv": - csv_lines = ["Title,URL,Snippet"] - for result in processed_results: - # Escape commas and quotes in fields - title = result["title"].replace('"', '""') - url = result["url"].replace('"', '""') - snippet = result["snippet"].replace('"', '""') - csv_line = f'"{title}","{url}","{snippet}"' - csv_lines.append(csv_line) - content = "\n".join(csv_lines) - else: - # Default: Markdown - content = self._format_results_as_markdown(processed_results, final_summary, headers) - - # Add document to results list - generated_documents.append({ - "label": output_label, - "content": content - }) - - # If no specific outputs requested, return standard document - if not output_specs: - content = self._format_results_as_markdown(processed_results, final_summary, headers) - generated_documents.append({ - "label": "web_research_results.md", - "content": content - }) - - # Create feedback for response - feedback = f"I conducted web research on '{prompt[:50]}...' and found {len(processed_results)} relevant results." - - logger.info("Web research completed successfully") + # Generate feedback + feedback = research_plan.get("feedback", f"I conducted web research on '{prompt[:50]}...' and gathered information from {len(raw_results)} relevant sources.") return { "feedback": feedback, - "documents": generated_documents + "documents": documents } except Exception as e: - error_msg = f"Error during web research: {str(e)}" - logger.error(error_msg) + logger.error(f"Error during web research: {str(e)}", exc_info=True) return { - "feedback": f"An error occurred during the web research: {str(e)}", + "feedback": f"Error during web research: {str(e)}", "documents": [] } + + async def _create_research_plan(self, prompt: str) -> Dict[str, Any]: + """ + Use AI to create a detailed research plan. + + Args: + prompt: The research query - + Returns: + Research plan dictionary + """ + research_prompt = f""" + Create a detailed web research plan for this task: "{prompt}" + + Analyze the request carefully and create a structured plan in JSON format with the following elements: + {{ + "requires_web_research": true/false, # Whether this genuinely requires web research + "research_questions": ["question1", "question2", ...], # 2-4 specific questions to answer + "search_terms": ["term1", "term2", ...], # Up to {self.max_search_terms} effective search terms + "direct_urls": ["url1", "url2", ...], # Any URLs directly mentioned in the request (up to {self.max_url}) + "expected_sources": ["type1", "type2", ...], # Types of sources that would be most valuable + "content_focus": "what specific content to extract or focus on", + "feedback": "explanation of how the research will be conducted" + }} + + Respond with ONLY the JSON object, no additional text or explanations. + """ + + try: + # Get research plan from AI + response = await self.ai_service.call_api([ + {"role": "system", "content": "You are a web research planning expert. Create precise research plans in JSON format only."}, + {"role": "user", "content": research_prompt} + ]) + + # Extract JSON + json_start = response.find('{') + json_end = response.rfind('}') + 1 + + if json_start >= 0 and json_end > json_start: + plan = json.loads(response[json_start:json_end]) + + # Ensure we have the expected fields with defaults if missing + if "search_terms" not in plan: + plan["search_terms"] = [prompt] + if "direct_urls" not in plan: + plan["direct_urls"] = [] + if "research_questions" not in plan: + plan["research_questions"] = ["What information can be found about this topic?"] + + return plan + else: + # Fallback plan + return { + "requires_web_research": True, + "research_questions": ["What information can be found about this topic?"], + "search_terms": [prompt], + "direct_urls": [], + "expected_sources": ["Web pages", "Articles"], + "content_focus": "Relevant information about the topic", + "feedback": f"I'll conduct web research on '{prompt}' and gather relevant information." + } + + except Exception as e: + logger.warning(f"Error creating research plan: {str(e)}") + # Simple fallback plan + return { + "requires_web_research": True, + "research_questions": ["What information can be found about this topic?"], + "search_terms": [prompt], + "direct_urls": [], + "expected_sources": ["Web pages", "Articles"], + "content_focus": "Relevant information about the topic", + "feedback": f"I'll conduct web research on '{prompt}' and gather relevant information." + } + + async def _gather_research_material(self, research_plan: Dict[str, Any]) -> List[Dict[str, Any]]: + """ + Gather research material based on the research plan. + + Args: + research_plan: Research plan dictionary + + Returns: + List of research results + """ + all_results = [] + + # Process direct URLs + direct_urls = research_plan.get("direct_urls", [])[:self.max_url] + for url in direct_urls: + logger.info(f"Processing direct URL: {url}") + try: + # Fetch and extract content + soup = self._read_url(url) + + if soup: + # Extract title and content + title = self._extract_title(soup, url) + content = self._extract_main_content(soup) + + # Add to results + all_results.append({ + "title": title, + "url": url, + "source_type": "direct_url", + "content": content, + "summary": "" # Will be filled later + }) + except Exception as e: + logger.warning(f"Error processing URL {url}: {str(e)}") + + # Process search terms + search_terms = research_plan.get("search_terms", [])[:self.max_search_terms] + for term in search_terms: + logger.info(f"Searching for: {term}") + try: + # Perform search + search_results = self._search_web(term) + + # Process each search result + for result in search_results: + # Check if URL is already in results + if not any(r["url"] == result["url"] for r in all_results): + all_results.append({ + "title": result["title"], + "url": result["url"], + "source_type": "search_result", + "content": result["data"], + "snippet": result["snippet"], + "summary": "" # Will be filled later + }) + + # Stop if we've reached the maximum results + if len(all_results) >= self.max_results: + break + except Exception as e: + logger.warning(f"Error searching for {term}: {str(e)}") + + # Stop if we've reached the maximum results + if len(all_results) >= self.max_results: + break + + # Create summaries in parallel for all results + all_results = await self._summarize_all_results(all_results, research_plan) + + return all_results + + async def _summarize_all_results(self, results: List[Dict[str, Any]], research_plan: Dict[str, Any]) -> List[Dict[str, Any]]: + """ + Create summaries for all research results. + + Args: + results: List of research results + research_plan: Research plan with questions and focus + + Returns: + Results with added summaries + """ + for i, result in enumerate(results): + logger.info(f"Summarizing result {i+1}/{len(results)}: {result['title'][:30]}...") + + try: + # Limit content length to avoid token issues + content = self._limit_text(result.get("content", ""), max_chars=8000) + research_questions = research_plan.get("research_questions", ["What relevant information does this page contain?"]) + content_focus = research_plan.get("content_focus", "Relevant information") + + # Create summary using AI + summary_prompt = f""" + Summarize this web page content based on these research questions: + {', '.join(research_questions)} + + Focus on: {content_focus} + + Web page: {result['url']} + Title: {result['title']} + + Content: + {content} + + Create a concise summary that: + 1. Directly answers the research questions if possible + 2. Extracts the most relevant information from the page + 3. Includes specific facts, figures, or quotes if available + 4. Is around 2000 characters long + + Only include information actually found in the content. No fabrications or assumptions. + """ + + if self.ai_service: + summary = await self.ai_service.call_api([ + {"role": "system", "content": "You summarize web content accurately and concisely, focusing only on what is actually in the content."}, + {"role": "user", "content": summary_prompt} + ]) + + # Store the summary + result["summary"] = summary + else: + # Fallback if no AI service + result["summary"] = f"Content from {result['url']} ({len(content)} characters)" + + except Exception as e: + logger.warning(f"Error summarizing result {i+1}: {str(e)}") + result["summary"] = f"Error creating summary: {str(e)}" + + return results + + async def _create_output_documents(self, prompt: str, results: List[Dict[str, Any]], + output_specs: List[Dict[str, Any]], research_plan: Dict[str, Any]) -> List[Dict[str, Any]]: + """ + Create output documents based on research results and specifications. + + Args: + prompt: Original research prompt + results: List of research results + output_specs: Output specifications + research_plan: Research plan + + Returns: + List of output documents + """ + # If no output specs provided, create default output + if not output_specs: + output_specs = [{ + "label": "web_research_results.md", + "description": "Comprehensive web research results" + }] + + # Generate documents + documents = [] + + # Process each output specification + for spec in output_specs: + output_label = spec.get("label", "") + output_description = spec.get("description", "") + + # Determine format based on file extension + format_type = self._determine_format_type(output_label) + + # Create appropriate document based on format + if format_type == "json": + # JSON output - structured data + document = await self._create_json_document(prompt, results, research_plan, output_label) + elif format_type == "csv": + # CSV output - tabular data + document = await self._create_csv_document(results, output_label) + else: + # Text-based output (markdown, html, text) - narrative report + document = await self._create_narrative_document( + prompt, results, research_plan, format_type, output_label, output_description + ) + + documents.append(document) + + return documents + + async def _create_narrative_document(self, prompt: str, results: List[Dict[str, Any]], + research_plan: Dict[str, Any], format_type: str, + output_label: str, output_description: str) -> Dict[str, Any]: + """ + Create a narrative document (markdown, html, text) from research results. + + Args: + prompt: Original research prompt + results: Research results + research_plan: Research plan + format_type: Output format (markdown, html, text) + output_label: Output filename + output_description: Output description + + Returns: + Document object + """ + # Create content based on format + if format_type == "markdown": + content_type = "text/markdown" + template_format = "markdown" + elif format_type == "html": + content_type = "text/html" + template_format = "html" + else: + content_type = "text/plain" + template_format = "text" + + # Prepare research context + research_questions = research_plan.get("research_questions", []) + search_terms = research_plan.get("search_terms", []) + + # Create document structure based on results + sources_summary = [] + for result in results: + sources_summary.append({ + "title": result.get("title", "Untitled"), + "url": result.get("url", ""), + "summary": result.get("summary", ""), + "snippet": result.get("snippet", "") + }) + + # Truncate content for prompt + sources_json = json.dumps(sources_summary, indent=2) + if len(sources_json) > 10000: + # Logic to truncate each summary while preserving structure + for i in range(len(sources_summary)): + if len(sources_json) <= 10000: + break + # Gradually truncate summaries + sources_summary[i]["summary"] = sources_summary[i]["summary"][:500] + "..." + sources_json = json.dumps(sources_summary, indent=2) + + # Create report prompt + report_prompt = f""" + Create a comprehensive {format_type} research report based on the following web research: + + TASK: {prompt} + + RESEARCH QUESTIONS: + {', '.join(research_questions)} + + SEARCH TERMS USED: + {', '.join(search_terms)} + + SOURCES AND FINDINGS: + {sources_json} + + REPORT DETAILS: + - Format: {template_format} + - Filename: {output_label} + - Description: {output_description} + + Create a well-structured report that: + 1. Includes an executive summary of key findings + 2. Addresses each research question directly + 3. Integrates information from all relevant sources + 4. Cites sources appropriately for each piece of information + 5. Provides a comprehensive synthesis of the research + 6. Is formatted professionally and appropriately for {template_format} + + The report should be scholarly, accurate, and focused on the original research task. + """ + + try: + # Generate report with AI + report_content = await self.ai_service.call_api([ + {"role": "system", "content": f"You create professional research reports in {template_format} format."}, + {"role": "user", "content": report_prompt} + ]) + + # Convert to HTML if needed + if format_type == "html" and not report_content.lower().startswith("An error occurred: {str(e)}
" + else: + content = f"WEB RESEARCH ERROR\n\nAn error occurred: {str(e)}" + + return { + "label": output_label, + "content": content, + "metadata": { + "content_type": content_type + } + } + + async def _create_json_document(self, prompt: str, results: List[Dict[str, Any]], + research_plan: Dict[str, Any], output_label: str) -> Dict[str, Any]: + """ + Create a JSON document from research results. + + Args: + prompt: Original research prompt + results: Research results + research_plan: Research plan + output_label: Output filename + + Returns: + Document object + """ + try: + # Create structured data + sources_data = [] + for result in results: + sources_data.append({ + "title": result.get("title", "Untitled"), + "url": result.get("url", ""), + "summary": result.get("summary", ""), + "snippet": result.get("snippet", ""), + "source_type": result.get("source_type", "") + }) + + # Create metadata + metadata = { + "query": prompt, + "timestamp": time.strftime("%Y-%m-%d %H:%M:%S"), + "research_questions": research_plan.get("research_questions", []), + "search_terms": research_plan.get("search_terms", []) + } + + # Compile complete report object + json_content = { + "metadata": metadata, + "summary": research_plan.get("feedback", "Web research results"), + "sources": sources_data + } + + # Convert to JSON string + content = json.dumps(json_content, indent=2) + + return { + "label": output_label, + "content": content, + "metadata": { + "content_type": "application/json" + } + } + + except Exception as e: + logger.error(f"Error creating JSON document: {str(e)}") + return { + "label": output_label, + "content": json.dumps({"error": str(e)}), + "metadata": { + "content_type": "application/json" + } + } + + async def _create_csv_document(self, results: List[Dict[str, Any]], output_label: str) -> Dict[str, Any]: + """ + Create a CSV document from research results. + + Args: + results: Research results + output_label: Output filename + + Returns: + Document object + """ + try: + # Create CSV header + csv_lines = ["Title,URL,Source Type,Snippet"] + + # Add results + for result in results: + # Escape CSV fields + title = result.get("title", "").replace('"', '""') + url = result.get("url", "").replace('"', '""') + source_type = result.get("source_type", "").replace('"', '""') + snippet = result.get("snippet", "").replace('"', '""') + + csv_lines.append(f'"{title}","{url}","{source_type}","{snippet}"') + + # Combine into CSV content + content = "\n".join(csv_lines) + + return { + "label": output_label, + "content": content, + "metadata": { + "content_type": "text/csv" + } + } + + except Exception as e: + logger.error(f"Error creating CSV document: {str(e)}") + return { + "label": output_label, + "content": "Error,Error\nFailed to create CSV,{0}".format(str(e)), + "metadata": { + "content_type": "text/csv" + } + } + def _determine_format_type(self, output_label: str) -> str: """ Determine the format type based on the filename. @@ -259,282 +621,6 @@ class AgentWebcrawler(AgentBase): # Default to markdown return "markdown" - def _format_results_as_markdown(self, results: List[Dict[str, Any]], - summary: str, headers: Dict[str, str]) -> str: - """ - Format research results as markdown. - - Args: - results: List of results - summary: Summary of all results - headers: Localized headers - - Returns: - Formatted markdown text - """ - md_content = f"# {headers['web_research_results']}\n\n" - - md_content += f"## {headers['summary']}\n\n{summary}\n\n" - - if results: - md_content += f"## {headers['detailed_results']}\n\n" - - for i, result in enumerate(results, 1): - md_content += f"### {i}. {result['title']}\n\n" - md_content += f"**{headers['url']}**: {result['url']}\n\n" - md_content += f"**{headers['snippet']}**: {result['snippet']}\n\n" - md_content += f"**{headers['content']}**: {result['summary']}\n\n" - - # Add separator between results (except for the last one) - if i < len(results): - md_content += "---\n\n" - - return md_content - - async def _is_web_research_request(self, prompt: str) -> bool: - """ - Use AI to determine if a request requires web research. - - Args: - prompt: The user request - - Returns: - True if it is explicitly a web research request, False otherwise - """ - if not self.ai_service: - # Fallback to simpler detection if no AI service is available - return self._simple_web_detection(prompt) - - try: - # Create prompt to analyze if this is a web research request - analysis_prompt = f""" - Analyze the following request and determine if it explicitly requires web research or online information. - - REQUEST: {prompt} - - A request requires web research if: - 1. It explicitly asks for searching information online - 2. It contains URLs or references to websites - 3. It requests current information that would be available on the web - 4. It asks for information from web sources - 5. It implicitly requires current information from the internet - - Reply ONLY with a single word - either "YES" if web research is required, or "NO" if not. - """ - - # Call AI for analysis - response = await self.ai_service.call_api([ - {"role": "system", "content": "You determine if a request requires web research. Always respond with just YES or NO."}, - {"role": "user", "content": analysis_prompt} - ]) - - # Clean response and check - response = response.strip().upper() - - return "YES" in response - - except Exception as e: - # Log error but don't fail, fallback to simpler detection - logger.warning(f"Error in AI detection of web research requests: {str(e)}") - return self._simple_web_detection(prompt) - - def _simple_web_detection(self, prompt: str) -> bool: - """ - Simpler fallback method for detecting web research requests based on URLs. - - Args: - prompt: The user request - - Returns: - True if there are clear URL indicators, False otherwise - """ - # URLs in the request strongly indicate web research - url_indicators = ["http://", "https://", "www.", ".com", ".org", ".net", ".edu", ".gov"] - web_terms = ["search", "find online", "look up", "web", "internet", "website"] - - # Check for URL patterns in the request - contains_url = any(indicator in prompt.lower() for indicator in url_indicators) - contains_web_term = any(term in prompt.lower() for term in web_terms) - - return contains_url or contains_web_term - - async def _create_search_strategy(self, prompt: str) -> Dict[str, List[str]]: - """ - Create a search strategy based on the request. - - Args: - prompt: The user request - - Returns: - Search strategy with URLs and search terms - """ - if not self.ai_service: - # Fallback to simple strategy - return {"skey": [prompt], "url": []} - - try: - # AI prompt to create a search strategy - strategy_prompt = f"""Create a comprehensive web research strategy for the following task: - '{prompt.replace("'","")}' - - Return the results as a Python dictionary with these specific keys: - - 'url': A list of up to {self.max_url} specific URLs extracted from the task. - - 'skey': A list of up to {self.max_key} key phrases to search for on the web. These should be precise, diverse, and targeted to get the most relevant information. - - If specific URLs are given and the task only requires analyzing these URLs, leave 'skey' empty. - - Format your response as a valid JSON object with these two keys. Don't add any explanatory text. - """ - - # Call AI for search strategy - content_text = await self.ai_service.call_api([ - {"role": "system", "content": "You are a web research expert who develops precise search strategies."}, - {"role": "user", "content": strategy_prompt} - ]) - - # Remove JSON code block markers if present - if content_text.startswith("```json"): - end_marker = "```" - end_index = content_text.rfind(end_marker) - if end_index != -1: - content_text = content_text[7:end_index].strip() - elif content_text.startswith("```"): - end_marker = "```" - end_index = content_text.rfind(end_marker) - if end_index != -1: - content_text = content_text[3:end_index].strip() - - # Extract only the JSON part (if surrounded by text) - json_match = re.search(r'(\{.*\})', content_text, re.DOTALL) - if json_match: - content_text = json_match.group(1) - - # Parse JSON and return - strategy = json.loads(content_text) - return strategy - - except Exception as e: - logger.error(f"Error creating search strategy: {str(e)}") - # Simple fallback strategy - return {"skey": [prompt], "url": []} - - async def _summarize_result(self, result_data: str, original_prompt: str) -> str: - """ - Create a summary of a search result using AI. - - Args: - result_data: The data to summarize - original_prompt: The original request - - Returns: - Summary of the result - """ - if not self.ai_service: - return f"Summary of {len(result_data)} characters not available (AI service not available)" - - try: - # Instructions for summarization - summary_prompt = f""" - Summarize this search result according to the original request in about 2000 characters. - - Original request = '{original_prompt.replace("'","")}' - - Focus on the most important findings and connect them to the original request. - Extract only relevant and high-quality information. - - Here's the search result: - {result_data} - """ - - # Call AI for summary - summary = await self.ai_service.call_api([ - {"role": "system", "content": "You are an information analyst who summarizes web content precisely and relevantly."}, - {"role": "user", "content": summary_prompt} - ]) - - # Limit to ~2000 characters - return summary[:2000] - - except Exception as e: - logger.error(f"Error summarizing result: {str(e)}") - return "Error creating summary" - - async def _get_localized_headers(self, text: str) -> Dict[str, str]: - """ - Determine localized headers for web research results based on detected language. - - Args: - text: Text for language detection - - Returns: - Dictionary with localized headers - """ - # Default English headers - headers = { - "web_research_results": "Web Research Results", - "summary": "Summary", - "detailed_results": "Detailed Results", - "url": "URL", - "snippet": "Snippet", - "content": "Content" - } - - if not self.ai_service: - return headers - - try: - # Detect language - language_prompt = f"What language is this text written in? Answer with just the language name: {text[:200]}" - language = await self.ai_service.call_api([ - {"role": "system", "content": "You determine the language of a text and return only the language name."}, - {"role": "user", "content": language_prompt} - ]) - - language = language.strip().lower() - - # English language or language detection failed, return default headers - if language in ["english", "en", ""]: - return headers - - # Translate headers if language recognized but no predefined translation - translation_prompt = f""" - Translate these web research result headers to {language}: - - Web Research Results - Summary - Detailed Results - URL - Snippet - Content - - Return a JSON object with these keys: - web_research_results, summary, detailed_results, url, snippet, content - """ - - # Call AI for translation - response = await self.ai_service.call_api([ - {"role": "system", "content": "You translate headers to the specified language and return them as JSON."}, - {"role": "user", "content": translation_prompt} - ]) - - # Extract JSON - json_match = re.search(r'\{.*\}', response, re.DOTALL) - - if json_match: - try: - translated_headers = json.loads(json_match.group(0)) - return translated_headers - except json.JSONDecodeError: - logger.warning(f"Error parsing translated headers JSON") - - except Exception as e: - # Log error but continue with English headers - logger.warning(f"Error translating headers: {str(e)}") - - return headers - def _search_web(self, query: str) -> List[Dict[str, str]]: """ Conduct a web search and return the results. @@ -546,10 +632,10 @@ class AgentWebcrawler(AgentBase): List of search results """ formatted_query = quote_plus(query) - url = f"{APP_CONFIG.get('Agent_Webcrawler_SEARCH_ENGINE', 'https://html.duckduckgo.com/html/?q=')}{formatted_query}" + url = f"{self.search_engine}{formatted_query}" search_results_soup = self._read_url(url) - if not isinstance(search_results_soup, BeautifulSoup) or not search_results_soup.select('.result'): + if not search_results_soup or not search_results_soup.select('.result'): logger.warning(f"No search results found for: {query}") return [] @@ -588,11 +674,13 @@ class AgentWebcrawler(AgentBase): snippet_element = result.select_one('.result__snippet') snippet = snippet_element.text.strip() if snippet_element else 'No description' - # Get actual page content for the data field - target_page_soup = self._read_url(extracted_url) - - # Use new content extraction method to limit content size - content = self._extract_main_content(target_page_soup) + # Get actual page content + try: + target_page_soup = self._read_url(extracted_url) + content = self._extract_main_content(target_page_soup) + except Exception as e: + logger.warning(f"Error extracting content from {extracted_url}: {str(e)}") + content = f"Error extracting content: {str(e)}" results.append({ 'title': title, @@ -601,8 +689,8 @@ class AgentWebcrawler(AgentBase): 'data': content }) - # Limit number of results if needed - if len(results) >= self.max_result: + # Limit number of results + if len(results) >= self.max_results: break return results @@ -615,10 +703,13 @@ class AgentWebcrawler(AgentBase): url: The URL to read Returns: - BeautifulSoup object with the content or empty on errors + BeautifulSoup object with the content or None on errors """ + if not url or not url.startswith(('http://', 'https://')): + return None + headers = { - 'User-Agent': APP_CONFIG.get("Agent_Webcrawler_USER_AGENT", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"), + 'User-Agent': self.user_agent, 'Accept': 'text/html,application/xhtml+xml,application/xml', 'Accept-Language': 'en-US,en;q=0.9', } @@ -629,18 +720,17 @@ class AgentWebcrawler(AgentBase): # Handling for status 202 if response.status_code == 202: - # Max 3 retries with increasing intervals + # Retry with backoff backoff_times = [0.5, 1.0, 2.0, 5.0] for wait_time in backoff_times: - time.sleep(wait_time) # Wait with increasing time + time.sleep(wait_time) response = requests.get(url, headers=headers, timeout=self.timeout) - # If no more 202, break if response.status_code != 202: break - # Raise for other error status codes + # Raise for error status codes response.raise_for_status() # Parse HTML @@ -648,8 +738,7 @@ class AgentWebcrawler(AgentBase): except Exception as e: logger.error(f"Error reading URL {url}: {str(e)}") - # Create empty BeautifulSoup object - return BeautifulSoup("", 'html.parser') + return None def _extract_title(self, soup: BeautifulSoup, url: str) -> str: """ @@ -662,7 +751,7 @@ class AgentWebcrawler(AgentBase): Returns: Extracted title """ - if not isinstance(soup, BeautifulSoup): + if not soup: return f"Error with {url}" # Extract title from title tag @@ -688,8 +777,8 @@ class AgentWebcrawler(AgentBase): Returns: Extracted main content as a string """ - if not isinstance(soup, BeautifulSoup): - return str(soup)[:max_chars] if soup else "" + if not soup: + return "" # Try to find main content elements in priority order main_content = None @@ -713,29 +802,6 @@ class AgentWebcrawler(AgentBase): # Limit to max_chars return text_content[:max_chars] - def _parse_result(self, soup: BeautifulSoup, title: str, url: str) -> Dict[str, str]: - """ - Parse a BeautifulSoup object into a result dictionary. - - Args: - soup: BeautifulSoup object of the webpage - title: Page title - url: Page URL - - Returns: - Dictionary with result data - """ - # Extract content - content = self._extract_main_content(soup) - - result = { - 'title': title, - 'url': url, - 'snippet': 'No description', # Default value - 'data': content - } - return result - def _limit_text(self, text: str, max_chars: int = 10000) -> str: """ Limit text to a maximum number of characters. @@ -760,10 +826,5 @@ class AgentWebcrawler(AgentBase): # Factory function for the Webcrawler agent def get_webcrawler_agent(): - """ - Factory function that returns an instance of the Webcrawler agent. - - Returns: - An instance of the Webcrawler agent - """ + """Returns an instance of the Webcrawler agent.""" return AgentWebcrawler() \ No newline at end of file diff --git a/notes/changelog.txt b/notes/changelog.txt index 3caac4e9..d3381032 100644 --- a/notes/changelog.txt +++ b/notes/changelog.txt @@ -25,6 +25,7 @@ streamline self.log_add --> to use in a standardized format and to reduce messag add connector to myoutlook +todo an agent for "code writing and editing" connected to the codebase, working in loops over each document...