From 5c066feb191fecb351d539c5946a096604bb3973 Mon Sep 17 00:00:00 2001
From: ValueOn AG
Date: Mon, 21 Apr 2025 18:16:13 +0200
Subject: [PATCH] all agents ai target-driven redesigned
---
modules/_backup_chat_agent_coder copy.py | 814 --------------
modules/_backup_lucydom_interface copy.py | 1183 ---------------------
modules/chat_agent_analyst.py | 1123 ++++++++++---------
modules/chat_agent_creative.py | 364 -------
modules/chat_agent_documentation.py | 720 +++++++------
modules/chat_agent_webcrawler.py | 1033 +++++++++---------
notes/changelog.txt | 1 +
7 files changed, 1495 insertions(+), 3743 deletions(-)
delete mode 100644 modules/_backup_chat_agent_coder copy.py
delete mode 100644 modules/_backup_lucydom_interface copy.py
delete mode 100644 modules/chat_agent_creative.py
diff --git a/modules/_backup_chat_agent_coder copy.py b/modules/_backup_chat_agent_coder copy.py
deleted file mode 100644
index 98b16a6a..00000000
--- a/modules/_backup_chat_agent_coder copy.py
+++ /dev/null
@@ -1,814 +0,0 @@
-"""
-Coder agent for development and execution of Python code.
-Optimized for the new task-based processing.
-"""
-
-import logging
-import json
-import re
-import uuid
-import os
-import subprocess
-import tempfile
-import shutil
-import sys
-from typing import Dict, Any, List, Optional, Tuple
-
-from modules.chat_registry import AgentBase
-
-logger = logging.getLogger(__name__)
-
-
-class AgentCoder(AgentBase):
- """Agent for development and execution of Python code"""
-
- def __init__(self):
- """Initialize the coder agent"""
- super().__init__()
- self.name = "coder"
- self.description = "Develops and executes Python code for data processing and automation"
- self.capabilities = [
- "code_development",
- "data_processing",
- "file_processing",
- "automation",
- "code_execution"
- ]
-
- # Executor settings
- self.executor_timeout = 60 # seconds
- self.executor_memory_limit = 512 # MB
-
- # AI service settings
- self.ai_temperature = 0.1 # Lower temperature for deterministic code generation
-
- # Auto-correction settings
- self.max_correction_attempts = 3 # Maximum number of correction attempts
-
- def set_dependencies(self, ai_service=None):
- """Set external dependencies for the agent."""
- self.ai_service = ai_service
-
- async def process_task(self, task: Dict[str, Any]) -> Dict[str, Any]:
- """
- Process a standardized task structure and perform code development/execution.
-
- Args:
- task: A dictionary containing:
- - task_id: Unique ID for this task
- - prompt: The main instruction for the agent
- - input_documents: List of documents to process
- - output_specifications: List of required output documents
- - context: Additional contextual information
-
- Returns:
- A dictionary containing:
- - feedback: Text response explaining the code execution
- - documents: List of created document objects
- """
- try:
- # Extract relevant task information
- prompt = task.get("prompt", "")
- input_documents = task.get("input_documents", [])
- output_specs = task.get("output_specifications", [])
- context_info = task.get("context", {})
-
- # Check if AI service is available
- if not self.ai_service:
- logger.error("No AI service configured for the Coder agent")
- return {
- "feedback": "The Coder agent is not properly configured.",
- "documents": []
- }
-
- # Extract context from input documents
- document_context = self._extract_document_context(input_documents)
-
- # Generate code based on the prompt and document context
- logger.info("Generating code based on the task")
- code_to_execute, requirements = await self._generate_code_from_prompt(prompt, document_context)
-
- if not code_to_execute:
- logger.warning("AI couldn't generate any code")
- return {
- "feedback": "I couldn't generate executable code based on the task. Please provide more detailed instructions.",
- "documents": []
- }
-
- logger.info(f"Code generated with AI ({len(code_to_execute)} characters)")
-
- # Collect created documents
- generated_documents = []
-
- # Add code as first document
- code_doc = {
- "label": "generated_code.py",
- "content": code_to_execute
- }
- generated_documents.append(code_doc)
-
- # Execute code with auto-correction loop
- execution_context = {
- "input_documents": input_documents,
- "task": task
- }
-
- # Enhanced execution with auto-correction
- result, attempts_info = await self._execute_with_auto_correction(
- code_to_execute,
- requirements,
- execution_context,
- prompt # Original prompt/message
- )
-
- # Create output documents based on execution result and output specifications
- if result.get("success", False):
- # Code execution successful
- output = result.get("output", "")
- execution_result = result.get("result")
- logger.info("Code executed successfully")
-
- # Determine output type of the result
- result_docs = self._generate_result_documents(
- attempts_info[-1]["code"], # Last successful code
- output,
- execution_result,
- output_specs
- )
-
- # Add result documents
- generated_documents.extend(result_docs)
-
- # Create feedback for successful execution
- feedback = f"I successfully executed the code and generated {len(result_docs)} output files."
- if attempts_info and len(attempts_info) > 1:
- feedback += f" (This required {len(attempts_info)-1} correction attempts)"
-
- else:
- # Code execution failed after all attempts
- error = result.get("error", "Unknown error")
- logger.error(f"Error in code execution after all correction attempts: {error}")
-
- # Add error log as additional document
- error_doc = {
- "label": "execution_error.txt",
- "content": f"Execution error:\n\n{error}"
- }
- generated_documents.append(error_doc)
-
- # Create feedback for failed execution
- feedback = f"An error occurred during code execution after {len(attempts_info)} correction attempts."
-
- # If no specific outputs requested, create standard outputs
- if not output_specs and result.get("success", False):
- # Add standard output document
- output_doc = {
- "label": "execution_output.txt",
- "content": output
- }
- generated_documents.append(output_doc)
-
- # If a result is available, also add as JSON document
- if execution_result:
- result_json = json.dumps(execution_result, indent=2) if isinstance(execution_result, (dict, list)) else str(execution_result)
- result_doc = {
- "label": "execution_result.json",
- "content": result_json
- }
- generated_documents.append(result_doc)
-
- return {
- "feedback": feedback,
- "documents": generated_documents
- }
-
- except Exception as e:
- error_msg = f"Error during processing by the Coder agent: {str(e)}"
- logger.error(error_msg)
- return {
- "feedback": f"An error occurred during code processing: {str(e)}",
- "documents": []
- }
-
- def _extract_document_context(self, documents: List[Dict[str, Any]]) -> str:
- """
- Extract context from input documents for code generation.
-
- Args:
- documents: List of document objects
-
- Returns:
- Extracted context as text
- """
- context_parts = []
-
- for doc in documents:
- doc_name = doc.get("name", "Unnamed document")
- context_parts.append(f"--- {doc_name} ---")
-
- for content in doc.get("contents", []):
- if content.get("metadata", {}).get("is_text", False):
- context_parts.append(content.get("data", ""))
-
- return "\n\n".join(context_parts)
-
- def _generate_result_documents(self, code: str, output: str, execution_result: Any,
- output_specs: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
- """
- Generate output documents based on execution results and specifications.
-
- Args:
- code: Executed code
- output: Text output of the execution
- execution_result: Result object from execution
- output_specs: Output specifications
-
- Returns:
- List of generated document objects
- """
- documents = []
-
- # If no specific outputs requested
- if not output_specs:
- return documents
-
- # Generate appropriate document for each requested output
- for spec in output_specs:
- output_label = spec.get("label", "")
- output_description = spec.get("description", "")
-
- # Determine output type based on file extension
- format_type = self._determine_format_type(output_label)
-
- # Generate document content based on format and output
- if "code" in output_label.lower() or format_type in ["py", "js", "html", "css"]:
- # Code document
- documents.append({
- "label": output_label,
- "content": code
- })
- elif "output" in output_label.lower() or format_type == "txt":
- # Output document
- documents.append({
- "label": output_label,
- "content": output
- })
- elif format_type in ["json", "yml", "yaml"] and execution_result:
- # JSON result document
- if isinstance(execution_result, (dict, list)):
- content = json.dumps(execution_result, indent=2)
- else:
- content = str(execution_result)
-
- documents.append({
- "label": output_label,
- "content": content
- })
- else:
- # Generic result document (fallback)
- result_str = ""
- if execution_result:
- if isinstance(execution_result, (dict, list)):
- result_str = json.dumps(execution_result, indent=2)
- else:
- result_str = str(execution_result)
-
- documents.append({
- "label": output_label,
- "content": f"Code output:\n\n{output}\n\nResult:\n\n{result_str}"
- })
-
- return documents
-
- def _determine_format_type(self, output_label: str) -> str:
- """
- Determine the format type based on the filename.
-
- Args:
- output_label: Output filename
-
- Returns:
- Format type (py, js, json, txt, etc.)
- """
- if not '.' in output_label:
- return "txt" # Default format
-
- extension = output_label.split('.')[-1].lower()
- return extension
-
- async def _execute_with_auto_correction(
- self,
- initial_code: str,
- requirements: List[str],
- context: Dict[str, Any],
- original_prompt: str
- ) -> Tuple[Dict[str, Any], List[Dict[str, Any]]]:
- """
- Execute code with automatic error correction and retry attempts.
-
- Args:
- initial_code: The initial Python code
- requirements: List of required packages
- context: Additional context for execution
- original_prompt: The original user request/prompt
-
- Returns:
- Tuple of (final execution result, list of attempt info dictionaries)
- """
- # Initialize tracking data
- current_code = initial_code
- current_requirements = requirements.copy() if requirements else []
- attempts_info = []
-
- # Execute with correction loop
- for attempt in range(1, self.max_correction_attempts + 1):
- if attempt == 1:
- logger.info(f"Executing code (attempt {attempt}/{self.max_correction_attempts})")
- else:
- logger.info(f"Executing corrected code (attempt {attempt}/{self.max_correction_attempts})")
-
- # Execute current code version
- result = await self._execute_code(current_code, current_requirements, context)
-
- # Record attempt information
- attempts_info.append({
- "attempt": attempt,
- "code": current_code,
- "error": result.get("error", ""),
- "success": result.get("success", False)
- })
-
- # Check if execution was successful
- if result.get("success", False):
- # Success! Return result and attempt info
- return result, attempts_info
-
- # Failed execution - check if max attempt limit reached
- if attempt >= self.max_correction_attempts:
- logger.warning(f"Maximum correction attempts ({self.max_correction_attempts}) reached")
- break
-
- # Correct code based on the error
- error_message = result.get("error", "Unknown error")
-
- logger.info(f"Attempting to fix code error: {error_message[:200]}...")
-
- # Generate corrected code
- corrected_code, new_requirements = await self._generate_code_correction(
- current_code,
- error_message,
- original_prompt,
- current_requirements
- )
-
- # Update for next attempt
- if corrected_code:
- current_code = corrected_code
-
- # Add new requirements
- if new_requirements:
- for req in new_requirements:
- if req not in current_requirements:
- current_requirements.append(req)
- logger.info(f"Added new requirement: {req}")
- else:
- # Correction couldn't be generated, end loop
- logger.warning("Couldn't generate code correction")
- break
-
- # If we reach here, all attempts failed - return last result and attempt info
- return result, attempts_info
-
- async def _generate_code_correction(
- self,
- code: str,
- error_message: str,
- original_prompt: str,
- current_requirements: List[str] = None
- ) -> Tuple[str, List[str]]:
- """
- Generate a corrected version of code based on error messages.
-
- Args:
- code: The code that generated errors
- error_message: The error message to fix
- original_prompt: The original task/requirements
- current_requirements: List of currently required packages
-
- Returns:
- Tuple of (corrected code, new requirements list)
- """
- try:
- # Create detailed prompt for code correction
- correction_prompt = f"""You need to fix an error in Python code. The code was written for this task:
-
-ORIGINAL TASK:
-{original_prompt}
-
-CURRENT CODE:
-```python
-{code}
-```
-
-ERROR MESSAGE:
-```
-{error_message}
-```
-
-CURRENT REQUIREMENTS: {', '.join(current_requirements) if current_requirements else "None"}
-
-Your task is to analyze the error and provide a corrected version of the code.
-Focus specifically on fixing the error while maintaining the original functionality.
-
-Common fixes include:
-- Fixing syntax errors (missing parentheses, indentation, etc.)
-- Solving import errors by adding appropriate requirements
-- Correcting file paths or handling "file not found" errors
-- Adding error handling for specific edge cases
-- Fixing logical errors in the code
-
-FORMATTING GUIDELINES:
-1. Provide ONLY the complete corrected Python code WITHOUT explanations
-2. Do NOT use code block markers like ```python or ```
-3. Do NOT explain what the code does before or after
-4. Do NOT add any text that isn't valid Python code
-5. Start your answer directly with valid Python code
-6. End your answer with valid Python code
-
-If you need to add new required packages, place them in a specially formatted comment at the beginning of your code as follows:
-# REQUIREMENTS: package1,package2,package3
-
-Your entire answer must be valid Python that can be executed without modifications.
-"""
-
- # Create messages for API
- messages = [
- {"role": "system", "content": "You are a Python debugging expert. You provide ONLY clean, error-free Python code, without explanations, markdown formatting, or text that isn't code."},
- {"role": "user", "content": correction_prompt}
- ]
-
- # Call API with very low temperature for deterministic corrections
- generated_content = await self.ai_service.call_api(
- messages,
- temperature=0.1
- )
-
- # Clean up the generated content to ensure it's only valid Python code
- fixed_code = self._clean_code(generated_content)
-
- # Extract requirements from special comment at beginning of code
- new_requirements = []
- for line in fixed_code.split('\n'):
- if line.strip().startswith("# REQUIREMENTS:"):
- req_str = line.replace("# REQUIREMENTS:", "").strip()
- new_requirements = [r.strip() for r in req_str.split(',') if r.strip()]
- break
-
- return fixed_code, new_requirements
-
- except Exception as e:
- logging.error(f"Error generating code correction: {str(e)}")
- # Return None to indicate failure
- return None, []
-
- def _clean_code(self, code: str) -> str:
- """
- Clean code by removing markdown code block markers and other formatting artifacts.
-
- Args:
- code: The code string to clean
-
- Returns:
- Cleaned code string
- """
- # Remove code block markers at beginning/end
- code = re.sub(r'^```(?:python)?\s*', '', code)
- code = re.sub(r'```\s*$', '', code)
-
- # Process lines in reverse order to start from the end
- lines = code.split('\n')
- clean_lines = []
- in_trailing_markdown = False
-
- for line in reversed(lines):
- stripped = line.strip()
-
- # Check if this line contains only backticks (``` or ` or ``)
- if re.match(r'^`{1,3}$', stripped):
- in_trailing_markdown = True
- continue
-
- # If we've reached actual code, no more trailing markdown consideration
- if stripped and not in_trailing_markdown:
- in_trailing_markdown = False
-
- # Add this line if it's not part of trailing markdown
- if not in_trailing_markdown:
- clean_lines.insert(0, line)
-
- # Rejoin lines
- clean_code = '\n'.join(clean_lines)
-
- # Final cleanup for any remaining backticks
- clean_code = re.sub(r'`{1,3}\s*', '', clean_code)
-
- return clean_code.strip()
-
- async def _generate_code_from_prompt(self, prompt: str, document_context: str) -> Tuple[str, List[str]]:
- """
- Generate Python code from a prompt using the AI service.
-
- Args:
- prompt: The prompt to generate code from
- document_context: Context extracted from documents
-
- Returns:
- Tuple of (generated Python code, required packages)
- """
- try:
- # Prepare prompt for code generation
- ai_prompt = f"""Generate Python code to solve the following task:
-
-TASK:
-{prompt}
-
-PROVIDED CONTEXT:
-{document_context if document_context else "No additional context available."}
-
-IMPORTANT REQUIREMENTS:
-1. Your code MUST define a 'result' variable to store the final result.
-2. At the end of your script, the result variable should be output.
-3. Make your 'result' variable a dictionary or other JSON-serializable data structure containing all relevant outputs.
-4. Comment your code well to explain important operations.
-5. Make your code complete and self-contained.
-6. Add appropriate error handling.
-
-FORMATTING INSTRUCTIONS:
-- Return ONLY the Python code, WITHOUT introduction, explanation, or conclusion text
-- Do NOT use code block markers like ```python or ```
-- Do NOT explain what the code does before or after
-- Do NOT add any text that isn't valid Python code
-- Start your answer directly with valid Python code
-- End your answer with valid Python code
-
-For required packages, place them in a specially formatted comment at the beginning of your code in one line as follows:
-# REQUIREMENTS: pandas,numpy,matplotlib,requests
-
-Your entire answer must be valid Python that can be executed without modifications.
-"""
-
- # Create messages for API
- messages = [
- {"role": "system", "content": "You are a Python code generator who provides ONLY clean, executable Python code with no explanations, markdown formatting, or non-code text."},
- {"role": "user", "content": ai_prompt}
- ]
-
- # Call API
- logging.info(f"Calling AI API to generate code")
- generated_content = await self.ai_service.call_api(messages, temperature=self.ai_temperature)
-
- # Clean up the generated content to ensure it's only valid Python code
- code = self._clean_code(generated_content)
-
- # Extract requirements from special comment at beginning of code
- requirements = []
- for line in code.split('\n'):
- if line.strip().startswith("# REQUIREMENTS:"):
- req_str = line.replace("# REQUIREMENTS:", "").strip()
- requirements = [r.strip() for r in req_str.split(',') if r.strip()]
- break
-
- return code, requirements
-
- except Exception as e:
- logging.error(f"Error generating code with AI: {str(e)}")
- # Return basic error handling code and no requirements
- error_str = str(e).replace('"', '\\"')
- return f"""
-# Error in code generation
-print(f"An error occurred during code generation: {error_str}")
-# Return error result
-result = {{"error": "Code generation failed", "message": "{error_str}"}}
-""", []
-
- async def _execute_code(self, code: str, requirements: List[str] = None, context: Dict[str, Any] = None) -> Dict[str, Any]:
- """
- Execute Python code in an isolated environment.
-
- Args:
- code: The Python code to execute
- requirements: List of required packages
- context: Additional context for execution
-
- Returns:
- Result of code execution
- """
- # Use virtual code executor for isolated execution
- try:
- executor = SimpleCodeExecutor(
- timeout=self.executor_timeout,
- max_memory_mb=self.executor_memory_limit,
- requirements=requirements,
- ai_service=self.ai_service
- )
-
- # Prepare input data for the code
- input_data = {"context": context} if context else {}
-
- # Execute code
- result = executor.execute_code(code, input_data)
-
- # Clean up environment
- executor.cleanup()
-
- return result
-
- except Exception as e:
- error_message = f"Error during code execution: {str(e)}"
- logger.error(error_message)
-
- return {
- "success": False,
- "output": "",
- "error": error_message,
- "result": None
- }
-
-
-class SimpleCodeExecutor:
- """
- A simplified executor that runs Python code in isolated virtual environments.
- """
-
- def __init__(self,
- timeout: int = 30,
- max_memory_mb: int = 512,
- requirements: List[str] = None,
- ai_service = None):
- """
- Initialize the SimpleCodeExecutor.
-
- Args:
- timeout: Maximum execution time in seconds
- max_memory_mb: Maximum memory in MB
- requirements: List of packages to install
- ai_service: Optional - AI service for further processing
- """
- self.timeout = timeout
- self.max_memory_mb = max_memory_mb
- self.temp_dir = None
- self.requirements = requirements or []
- self.blocked_packages = [
- "cryptography", "flask", "django", "tornado", # Security risks
- "tensorflow", "pytorch", "scikit-learn" # Resource-intensive packages
- ]
- self.ai_service = ai_service
-
- def _create_venv(self) -> str:
- """Create a virtual environment and return the path."""
- # Create new environment
- venv_parent_dir = tempfile.mkdtemp(prefix="code_exec_")
- self.temp_dir = venv_parent_dir
- venv_path = os.path.join(venv_parent_dir, "venv")
-
- try:
- # Create virtual environment
- subprocess.run([sys.executable, "-m", "venv", venv_path],
- check=True,
- capture_output=True)
-
- return venv_path
- except subprocess.CalledProcessError as e:
- logger.error(f"Error creating virtual environment: {e}")
- raise RuntimeError(f"Virtual environment could not be created: {e}")
-
- def _get_python_executable(self, venv_path: str) -> str:
- """Return the path to the Python executable in the virtual environment."""
- if os.name == 'nt': # Windows
- return os.path.join(venv_path, "Scripts", "python.exe")
- else: # Unix/Linux
- return os.path.join(venv_path, "bin", "python")
-
- def execute_code(self, code: str, input_data: Dict[str, Any] = None) -> Dict[str, Any]:
- """
- Execute Python code in an isolated environment.
-
- Args:
- code: Python code to execute
- input_data: Optional input data for the code
-
- Returns:
- Dictionary with execution results
- """
- logger.info("Executing code in isolated environment")
-
- # Create virtual environment
- venv_path = self._create_venv()
-
- # Create file for the code
- code_id = uuid.uuid4().hex[:8]
- code_file = os.path.join(self.temp_dir, f"code_{code_id}.py")
-
- # Write code
- with open(code_file, "w", encoding="utf-8") as f:
- f.write(code)
-
- # Get Python executable
- python_executable = self._get_python_executable(venv_path)
- logger.info(f"Using Python executable: {python_executable}")
-
- # Execute code
- try:
- # Execute code from root directory
- working_dir = os.path.dirname(code_file)
- process = subprocess.run(
- [python_executable, code_file],
- timeout=self.timeout,
- capture_output=True,
- text=True,
- cwd=working_dir
- )
-
- # Process output
- stdout = process.stdout
- stderr = process.stderr
-
- # Get result from stdout if available
- result_data = None
- if process.returncode == 0 and stdout:
- try:
- # Look for the last line that could be JSON
- for line in reversed(stdout.strip().split('\n')):
- line = line.strip()
- if line and line[0] in '{[' and line[-1] in '}]':
- try:
- result_data = json.loads(line)
- # Use successfully parsed JSON result
- break
- except json.JSONDecodeError:
- # Not valid JSON, continue with next line
- continue
- except Exception as e:
- logger.warning(f"Error parsing result from stdout: {str(e)}")
-
- # Create result dictionary
- execution_result = {
- "success": process.returncode == 0,
- "output": stdout,
- "error": stderr if process.returncode != 0 else "",
- "result": result_data,
- "exit_code": process.returncode
- }
-
- except subprocess.TimeoutExpired:
- logger.error(f"Execution timed out after {self.timeout} seconds")
- execution_result = {
- "success": False,
- "output": "",
- "error": f"Execution timed out (timeout after {self.timeout} seconds)",
- "result": None,
- "exit_code": -1
- }
- except Exception as e:
- logger.error(f"Execution error: {str(e)}")
- execution_result = {
- "success": False,
- "output": "",
- "error": f"Execution error: {str(e)}",
- "result": None,
- "exit_code": -1
- }
-
- # Clean up temporary code file
- try:
- if os.path.exists(code_file):
- os.remove(code_file)
- except Exception as e:
- logger.warning(f"Error cleaning up temporary code file: {e}")
-
- return execution_result
-
- def cleanup(self):
- """Clean up temporary resources."""
- # Clean up temporary directory
- if self.temp_dir and os.path.exists(self.temp_dir):
- try:
- shutil.rmtree(self.temp_dir)
- logger.info(f"Temporary directory deleted: {self.temp_dir}")
- except Exception as e:
- logger.warning(f"Temporary directory {self.temp_dir} could not be deleted: {e}")
-
- def __del__(self):
- """Cleanup during garbage collection."""
- self.cleanup()
-
-
-# Factory function for the Coder agent
-def get_coder_agent():
- """
- Factory function that returns an instance of the Coder agent.
-
- Returns:
- An instance of the Coder agent
- """
- return AgentCoder()
\ No newline at end of file
diff --git a/modules/_backup_lucydom_interface copy.py b/modules/_backup_lucydom_interface copy.py
deleted file mode 100644
index f8607580..00000000
--- a/modules/_backup_lucydom_interface copy.py
+++ /dev/null
@@ -1,1183 +0,0 @@
-import logging
-import uuid
-from datetime import datetime
-from typing import Dict, Any, List, Optional, Union
-
-import importlib
-import hashlib
-
-from connectors.connector_db_json import DatabaseConnector
-from modules.configuration import APP_CONFIG
-
-logger = logging.getLogger(__name__)
-
-# Custom exceptions for file handling
-class FileError(Exception):
- """Base class for file handling exceptions."""
- pass
-
-class FileNotFoundError(FileError):
- """Exception raised when a file is not found."""
- pass
-
-class FileStorageError(FileError):
- """Exception raised when there's an error storing a file."""
- pass
-
-class FilePermissionError(FileError):
- """Exception raised when there's a permission issue with a file."""
- pass
-
-class FileDeletionError(FileError):
- """Exception raised when there's an error deleting a file."""
- pass
-
-
-class LucyDOMInterface:
- """
- Interface zur LucyDOM-Datenbank.
- Verwendet den JSON-Konnektor für den Datenzugriff.
- """
-
- def __init__(self, mandate_id: int, user_id: int):
- """
- Initialisiert das LucyDOM-Interface mit Mandanten- und Benutzerkontext.
-
- Args:
- mandate_id: ID des aktuellen Mandanten
- user_id: ID des aktuellen Benutzers
- """
- self.mandate_id = mandate_id
- self.user_id = user_id
-
- # Datenmodell-Modul importieren
- try:
- self.model_module = importlib.import_module("modules.lucydom_model")
- logger.info("lucydom_model erfolgreich importiert")
- except ImportError as e:
- logger.error(f"Fehler beim Importieren von lucydom_model: {e}")
- raise
-
- # Datenbank initialisieren, falls nötig
- self._initialize_database()
-
- def _initialize_database(self):
- """
- Initialisiert die Datenbank mit minimalen Objekten für den angemeldeten Benutzer im Mandanten, falls sie noch nicht existiert.
- Ohne gültigen Benutzer keine Initialisierung.
- Erstellt für jede im Datenmodell definierte Tabelle einen initialen Datensatz.
- """
- effective_mandate_id = self.mandate_id
- effective_user_id = self.user_id
- if effective_mandate_id is None or effective_user_id is None:
- #data available
- return
-
- self.db = DatabaseConnector(
- db_host=APP_CONFIG.get("DB_LUCYDOM_HOST"),
- db_database=APP_CONFIG.get("DB_LUCYDOM_DATABASE"),
- db_user=APP_CONFIG.get("DB_LUCYDOM_USER"),
- db_password=APP_CONFIG.get("DB_LUCYDOM_PASSWORD_SECRET"),
- mandate_id=self.mandate_id,
- user_id=self.user_id
- )
-
- # Initialisierung von Standard-Prompts für verschiedene Bereiche
- prompts = self.db.get_recordset("prompts")
- if not prompts:
- logger.info("Erstelle Standard-Prompts")
-
- # Standard-Prompts definieren
- standard_prompts = [
- {
- "mandate_id": effective_mandate_id,
- "user_id": effective_user_id,
- "content": "Recherchiere die aktuellen Markttrends und Entwicklungen im Bereich [THEMA]. Sammle Informationen zu führenden Unternehmen, innovativen Produkten oder Dienstleistungen und aktuellen Herausforderungen. Präsentiere die Ergebnisse in einer strukturierten Übersicht mit relevanten Daten und Quellen.",
- "name": "Web Research: Marktforschung"
- },
- {
- "mandate_id": effective_mandate_id,
- "user_id": effective_user_id,
- "content": "Analysiere den beigefügten Datensatz zu [THEMA] und identifiziere die wichtigsten Trends, Muster und Auffälligkeiten. Führe statistische Berechnungen durch, um deine Erkenntnisse zu untermauern. Stelle die Ergebnisse in einer klar strukturierten Analyse dar und ziehe relevante Schlussfolgerungen.",
- "name": "Analyse: Datenanalyse"
- },
- {
- "mandate_id": effective_mandate_id,
- "user_id": effective_user_id,
- "content": "Erstelle ein detailliertes Protokoll unserer Besprechung zum Thema [THEMA]. Erfasse alle besprochenen Punkte, getroffenen Entscheidungen und vereinbarten Maßnahmen. Strukturiere das Protokoll übersichtlich mit Tagesordnungspunkten, Teilnehmerliste und klaren Verantwortlichkeiten für die Follow-up-Aktionen.",
- "name": "Protokoll: Besprechungsprotokoll"
- },
- {
- "mandate_id": effective_mandate_id,
- "user_id": effective_user_id,
- "content": "Entwickle ein UI/UX-Designkonzept für [ANWENDUNG/WEBSITE]. Berücksichtige die Zielgruppe, Hauptfunktionen und die Markenidentität. Beschreibe die visuelle Gestaltung, Navigation, Interaktionsmuster und Informationsarchitektur. Erläutere, wie das Design die Benutzerfreundlichkeit und das Nutzererlebnis optimiert.",
- "name": "Design: UI/UX Design"
- }
- ]
-
- # Prompts erstellen
- for prompt_data in standard_prompts:
- created_prompt = self.db.record_create("prompts", prompt_data)
- logger.info(f"Prompt '{prompt_data.get('name', 'Standard')}' wurde erstellt mit ID {created_prompt['id']}")
-
-
- # Utilities
-
- def get_initial_id(self, table: str) -> Optional[int]:
- """
- Gibt die initiale ID für eine Tabelle zurück.
-
- Args:
- table: Name der Tabelle
-
- Returns:
- Die initiale ID oder None, wenn nicht vorhanden
- """
- return self.db.get_initial_id(table)
-
- def _get_current_timestamp(self) -> str:
- """Gibt den aktuellen Zeitstempel im ISO-Format zurück"""
- return datetime.now().isoformat()
-
-
- # Prompt-Methoden
-
- def get_all_prompts(self) -> List[Dict[str, Any]]:
- """Gibt alle Prompts des aktuellen Mandanten zurück"""
- return self.db.get_recordset("prompts")
-
- def get_prompt(self, prompt_id: int) -> Optional[Dict[str, Any]]:
- """Gibt einen Prompt anhand seiner ID zurück"""
- prompts = self.db.get_recordset("prompts", record_filter={"id": prompt_id})
- if prompts:
- return prompts[0]
- return None
-
- def create_prompt(self, content: str, name: str) -> Dict[str, Any]:
- """Erstellt einen neuen Prompt"""
- prompt_data = {
- "mandate_id": self.mandate_id,
- "user_id": self.user_id,
- "content": content,
- "name": name,
- "created_at": self._get_current_timestamp()
- }
-
- return self.db.record_create("prompts", prompt_data)
-
- def update_prompt(self, prompt_id: int, content: str = None, name: str = None) -> Dict[str, Any]:
- """
- Aktualisiert einen vorhandenen Prompt
-
- Args:
- prompt_id: ID des zu aktualisierenden Prompts
- content: Neuer Inhalt des Prompts
-
- Returns:
- Das aktualisierte Prompt-Objekt
- """
- # Prüfen, ob der Prompt existiert
- prompt = self.get_prompt(prompt_id)
- if not prompt:
- return None
-
- # Daten für die Aktualisierung vorbereiten
- prompt_data = {}
-
- if content is not None:
- prompt_data["content"] = content
- if name is not None:
- prompt_data["name"] = name
-
- # Prompt aktualisieren
- return self.db.record_modify("prompts", prompt_id, prompt_data)
-
- def delete_prompt(self, prompt_id: int) -> bool:
- """
- Löscht einen Prompt aus der Datenbank
-
- Args:
- prompt_id: ID des zu löschenden Prompts
-
- Returns:
- True, wenn der Prompt erfolgreich gelöscht wurde, sonst False
- """
- return self.db.record_delete("prompts", prompt_id)
-
-
- # File Utilities
-
- def calculate_file_hash(self, file_content: bytes) -> str:
- """Berechnet einen SHA-256-Hash für den Dateiinhalt"""
- return hashlib.sha256(file_content).hexdigest()
-
- def check_for_duplicate_file(self, file_hash: str) -> Optional[Dict[str, Any]]:
- """Prüft, ob bereits eine Datei mit demselben Hash existiert"""
- files = self.db.get_recordset("files", record_filter={"file_hash": file_hash})
- if files:
- return files[0]
- return None
-
- def get_mime_type(self, filename: str) -> str:
- """Ermittelt den MIME-Typ basierend auf der Dateiendung"""
- import os
- ext = os.path.splitext(filename)[1].lower()[1:]
- extension_to_mime = {
- "pdf": "application/pdf",
- "docx": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
- "doc": "application/msword",
- "xlsx": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
- "xls": "application/vnd.ms-excel",
- "pptx": "application/vnd.openxmlformats-officedocument.presentationml.presentation",
- "ppt": "application/vnd.ms-powerpoint",
- "csv": "text/csv",
- "txt": "text/plain",
- "json": "application/json",
- "xml": "application/xml",
- "html": "text/html",
- "htm": "text/html",
- "jpg": "image/jpeg",
- "jpeg": "image/jpeg",
- "png": "image/png",
- "gif": "image/gif",
- "webp": "image/webp",
- "svg": "image/svg+xml",
- "py": "text/x-python",
- "js": "application/javascript",
- "css": "text/css"
- }
- return extension_to_mime.get(ext.lower(), "application/octet-stream")
-
-
- # File Methoden - Metadaten-basierte Operationen
-
- def get_all_files(self) -> List[Dict[str, Any]]:
- """
- Gibt alle Dateien des aktuellen Mandanten zurück ohne Binärdaten.
-
- Returns:
- Liste von FileItem-Objekten ohne Binärdaten
- """
- files = self.db.get_recordset("files")
- return files
-
- def get_file(self, file_id: int) -> Optional[Dict[str, Any]]:
- """
- Gibt eine Datei anhand ihrer ID zurück, ohne Binärdaten.
-
- Args:
- file_id: ID der gesuchten Datei
-
- Returns:
- FileItem ohne Binärdaten oder None, wenn nicht gefunden
- """
- files = self.db.get_recordset("files", record_filter={"id": file_id})
- if files:
- return files[0]
- return None
-
- def create_file(self, name: str, mime_type: str, size: int = None, file_hash: str = None) -> Dict[str, Any]:
- """
- Erstellt einen neuen Dateieintrag in der Datenbank ohne Inhalt.
- Der eigentliche Dateiinhalt wird separat in der FileData-Tabelle gespeichert.
-
- Args:
- name: Name der Datei
- mime_type: MIME-Typ der Datei
- size: Größe der Datei in Bytes
- file_hash: Hash-Wert der Datei für Deduplizierung
-
- Returns:
- Das erstellte FileItem-Objekt
- """
- file_data = {
- "mandate_id": self.mandate_id,
- "user_id": self.user_id,
- "name": name,
- "mime_type": mime_type,
- "size": size,
- "file_hash": file_hash,
- "creation_date": self._get_current_timestamp()
- }
- return self.db.record_create("files", file_data)
-
- def update_file(self, file_id: int, update_data: Dict[str, Any]) -> Dict[str, Any]:
- """
- Aktualisiert die Metadaten einer vorhandenen Datei ohne die Binärdaten zu beeinflussen.
-
- Args:
- file_id: ID der zu aktualisierenden Datei
- update_data: Dictionary mit zu aktualisierenden Feldern
-
- Returns:
- Das aktualisierte FileItem-Objekt
- """
- # Prüfen, ob die Datei existiert
- file = self.get_file(file_id)
- if not file:
- raise FileNotFoundError(f"Datei mit ID {file_id} nicht gefunden")
-
- # Datei aktualisieren
- return self.db.record_modify("files", file_id, update_data)
-
- def delete_file(self, file_id: int) -> bool:
- """
- Löscht eine Datei aus der Datenbank (Metadaten und Inhalt).
-
- Args:
- file_id: ID der Datei
-
- Returns:
- True bei Erfolg, False bei Fehler
- """
- try:
- # Suche die Datei in der Datenbank
- file = self.get_file(file_id)
-
- if not file:
- raise FileNotFoundError(f"Datei mit ID {file_id} nicht gefunden")
-
- # Prüfe, ob die Datei zum aktuellen Mandanten gehört
- if file.get("mandate_id") != self.mandate_id:
- raise FilePermissionError(f"Keine Berechtigung zum Löschen der Datei {file_id}")
-
- # Check for other references to this file (by hash)
- file_hash = file.get("file_hash")
- if file_hash:
- other_references = [f for f in self.db.get_recordset("files", record_filter={"file_hash": file_hash})
- if f.get("id") != file_id]
-
- # If other files reference this content, only delete the database entry for FileItem
- if other_references:
- logger.info(f"Andere Referenzen auf den Dateiinhalt gefunden, nur FileItem wird gelöscht: {file_id}")
- else:
- # Lösche auch den Dateiinhalt in der FileData-Tabelle
- try:
- file_data_entries = self.db.get_recordset("file_data", record_filter={"id": file_id})
- if file_data_entries:
- self.db.record_delete("file_data", file_id)
- logger.info(f"FileData für Datei {file_id} gelöscht")
- except Exception as e:
- logger.warning(f"Fehler beim Löschen des FileData für Datei {file_id}: {str(e)}")
-
- # Lösche den FileItem-Eintrag
- return self.db.record_delete("files", file_id)
-
- except FileNotFoundError as e:
- # Pass through FileNotFoundError
- raise
- except FilePermissionError as e:
- # Pass through FilePermissionError
- raise
- except Exception as e:
- logger.error(f"Fehler beim Löschen der Datei {file_id}: {str(e)}")
- raise FileDeletionError(f"Fehler beim Löschen der Datei: {str(e)}")
-
-
- # FileData Methoden - Binärdaten-basierte Operationen
-
- def create_file_data(self, file_id: int, data: bytes) -> bool:
- """
- Speichert die Binärdaten einer Datei in der Datenbank als Base64-String.
-
- Args:
- file_id: ID der zugehörigen Datei
- data: Binärdaten
-
- Returns:
- True bei Erfolg, False bei Fehler
- """
- try:
- import base64
-
- # Convert binary data to base64 string
- if isinstance(data, bytes):
- encoded_data = base64.b64encode(data).decode('utf-8')
- logger.debug(f"Converted {len(data)} bytes to base64 string of length {len(encoded_data)}")
- else:
- logger.warning(f"Data is not bytes, but {type(data)}. Attempting to handle...")
- # Try to convert to bytes if it's not already
- if isinstance(data, str):
- # Check if it might already be base64 encoded
- try:
- # See if it's valid base64
- base64.b64decode(data)
- # If no error, assume it's already encoded
- encoded_data = data
- logger.info(f"Data appears to be already base64 encoded, using as is")
- except:
- # Not base64, so encode the string as bytes then to base64
- encoded_data = base64.b64encode(data.encode('utf-8')).decode('utf-8')
- logger.info(f"Converted string to base64")
- else:
- # For other types, convert to string first
- encoded_data = base64.b64encode(str(data).encode('utf-8')).decode('utf-8')
- logger.warning(f"Converted non-standard type to base64")
-
- # Create the file_data record with encoded data
- file_data = {
- "id": file_id,
- "data": encoded_data
- }
-
- self.db.record_create("file_data", file_data)
- logger.info(f"Successfully stored encoded data for file {file_id}")
- return True
- except Exception as e:
- logger.error(f"Fehler beim Speichern der Binärdaten für Datei {file_id}: {str(e)}")
- return False
-
- def get_file_data(self, file_id: int) -> Optional[bytes]:
- """
- Gibt die Binärdaten einer Datei zurück.
- Konvertiert Base64-String aus der Datenbank zurück zu bytes.
-
- Args:
- file_id: ID der Datei
-
- Returns:
- Binärdaten oder None, wenn nicht gefunden
- """
- import base64
-
- file_data_entries = self.db.get_recordset("file_data", record_filter={"id": file_id})
- if file_data_entries and "data" in file_data_entries[0]:
- encoded_data = file_data_entries[0]["data"]
-
- try:
- # Check if it's a string (most likely base64)
- if isinstance(encoded_data, str):
- try:
- # Try to decode base64
- binary_data = base64.b64decode(encoded_data)
- logger.debug(f"Successfully decoded base64 string to {len(binary_data)} bytes")
- return binary_data
- except Exception as e:
- logger.error(f"Failed to decode base64 data: {str(e)}")
- # If it's not valid base64, return as bytes
- return encoded_data.encode('utf-8')
- # If it's already bytes (shouldn't happen with model change)
- elif isinstance(encoded_data, bytes):
- logger.warning(f"Data was already bytes, no conversion needed")
- return encoded_data
- else:
- logger.error(f"Unexpected data type in database: {type(encoded_data)}")
- return None
- except Exception as e:
- logger.error(f"Error processing file data: {str(e)}")
- return None
- else:
- logger.warning(f"No data found for file ID {file_id}")
- return None
-
- def update_file_data(self, file_id: int, data: Union[bytes, str]) -> bool:
- """
- Aktualisiert die Binärdaten einer Datei in der Datenbank.
- Konvertiert bytes zu Base64-String für die Speicherung.
-
- Args:
- file_id: ID der Datei
- data: Neue Binärdaten oder kodierte Daten
-
- Returns:
- True bei Erfolg, False bei Fehler
- """
- try:
- import base64
-
- # Convert data to base64 string if it's bytes
- if isinstance(data, bytes):
- encoded_data = base64.b64encode(data).decode('utf-8')
- logger.debug(f"Converted {len(data)} bytes to base64 string")
- elif isinstance(data, str):
- # Check if it might already be base64 encoded
- try:
- # See if it's valid base64
- base64.b64decode(data)
- # If no error, assume it's already encoded
- encoded_data = data
- logger.debug(f"Data appears to be already base64 encoded, using as is")
- except:
- # Not base64, so encode the string as bytes then to base64
- encoded_data = base64.b64encode(data.encode('utf-8')).decode('utf-8')
- logger.debug(f"Converted string to base64")
- else:
- # For other types, convert to string first
- encoded_data = base64.b64encode(str(data).encode('utf-8')).decode('utf-8')
- logger.warning(f"Converted non-standard type to base64")
-
- # Check if a record already exists
- file_data_entries = self.db.get_recordset("file_data", record_filter={"id": file_id})
-
- if file_data_entries:
- # Update the existing record
- self.db.record_modify("file_data", file_id, {"data": encoded_data})
- logger.info(f"Updated existing file data for file ID {file_id}")
- else:
- # Create a new record
- file_data = {
- "id": file_id,
- "data": encoded_data
- }
- self.db.record_create("file_data", file_data)
- logger.info(f"Created new file data for file ID {file_id}")
-
- return True
- except Exception as e:
- logger.error(f"Fehler beim Aktualisieren der Binärdaten für Datei {file_id}: {str(e)}")
- return False
-
- def save_uploaded_file(self, file_content: bytes, file_name: str) -> Dict[str, Any]:
- """
- Speichert eine hochgeladene Datei in der Datenbank.
- Metadaten werden in der 'files'-Tabelle gespeichert,
- Binärdaten in der 'file_data'-Tabelle als Base64-String.
-
- Args:
- file_content: Binärdaten der Datei
- file_name: Name der Datei
-
- Returns:
- Dictionary mit Metadaten der gespeicherten Datei
- """
- try:
- # Debug: Log the start of the file upload process
- logger.info(f"Starting upload process for file: {file_name}")
-
- # Debug: Check if file_content is valid bytes
- if not isinstance(file_content, bytes):
- logger.error(f"Invalid file_content type: {type(file_content)}")
- raise ValueError(f"file_content must be bytes, got {type(file_content)}")
-
- # Calculate file hash for deduplication
- file_hash = self.calculate_file_hash(file_content)
- logger.debug(f"Calculated file hash: {file_hash}")
-
- # Check for duplicate
- existing_file = self.check_for_duplicate_file(file_hash)
- if existing_file:
- # Simply return the existing file metadata
- logger.info(f"Duplikat gefunden für {file_name}: {existing_file['id']}")
- return existing_file
-
- # MIME-Typ bestimmen
- mime_type = self.get_mime_type(file_name)
-
- # Dateigröße bestimmen
- file_size = len(file_content)
-
- # 1. Speichere Metadaten in der 'files'-Tabelle
- logger.info(f"Saving file metadata to database for file: {file_name}")
- db_file = self.create_file(
- name=file_name,
- mime_type=mime_type,
- size=file_size,
- file_hash=file_hash
- )
-
- # 2. Speichere Binärdaten als Base64-String in der 'file_data'-Tabelle
- logger.info(f"Saving file content to database for file: {file_name}")
- self.create_file_data(db_file["id"], file_content)
-
- # Debug: Verify database record was created
- if not db_file:
- logger.warning(f"Database record for file {file_name} was not created properly")
- else:
- logger.info(f"Database record created for file {file_name}")
-
- logger.info(f"File upload process completed for: {file_name}")
- return db_file
-
- except Exception as e:
- logger.error(f"Error in save_uploaded_file for {file_name}: {str(e)}", exc_info=True)
- raise FileStorageError(f"Fehler beim Speichern der Datei: {str(e)}")
-
- def download_file(self, file_id: int) -> Optional[Dict[str, Any]]:
- """
- Gibt eine Datei zum Download zurück, einschließlich Binärdaten.
-
- Args:
- file_id: ID der Datei
-
- Returns:
- Dictionary mit Dateidaten und -metadaten oder None, wenn nicht gefunden
- """
- try:
- # 1. Metadaten aus der 'files'-Tabelle holen
- file = self.get_file(file_id)
-
- if not file:
- raise FileNotFoundError(f"Datei mit ID {file_id} nicht gefunden")
-
- # 2. Binärdaten aus der 'file_data'-Tabelle holen
- file_content = self.get_file_data(file_id)
-
- if file_content is None:
- raise FileNotFoundError(f"Binärdaten für Datei mit ID {file_id} nicht gefunden")
-
- return {
- "id": file_id,
- "name": file.get("name", f"file_{file_id}"),
- "content_type": file.get("mime_type", "application/octet-stream"),
- "size": file.get("size", len(file_content)),
- "content": file_content
- }
- except FileNotFoundError as e:
- # Re-raise FileNotFoundError as is
- raise
- except Exception as e:
- logger.error(f"Fehler beim Herunterladen der Datei {file_id}: {str(e)}")
- raise FileError(f"Fehler beim Herunterladen der Datei: {str(e)}")
-
-
- # Workflow Methoden
-
- def get_all_workflows(self) -> List[Dict[str, Any]]:
- """Gibt alle Workflows des aktuellen Mandanten zurück"""
- return self.db.get_recordset("workflows")
-
- def get_workflows_by_user(self, user_id: int) -> List[Dict[str, Any]]:
- """Gibt alle Workflows eines Benutzers zurück"""
- return self.db.get_recordset("workflows", record_filter={"user_id": user_id})
-
- def get_workflow(self, workflow_id: str) -> Optional[Dict[str, Any]]:
- """Gibt einen Workflow anhand seiner ID zurück"""
- workflows = self.db.get_recordset("workflows", record_filter={"id": workflow_id})
- if workflows:
- return workflows[0]
- return None
-
- def create_workflow(self, workflow_data: Dict[str, Any]) -> Dict[str, Any]:
- """Erstellt einen neuen Workflow in der Datenbank"""
- # Stellen Sie sicher, dass mandate_id und user_id gesetzt sind
- if "mandate_id" not in workflow_data:
- workflow_data["mandate_id"] = self.mandate_id
-
- if "user_id" not in workflow_data:
- workflow_data["user_id"] = self.user_id
-
- # Zeitstempel setzen, falls nicht vorhanden
- current_time = self._get_current_timestamp()
- if "started_at" not in workflow_data:
- workflow_data["started_at"] = current_time
-
- if "last_activity" not in workflow_data:
- workflow_data["last_activity"] = current_time
-
- # Stelle sicher, dass last_message_id gesetzt ist, falls nicht vorhanden
- if "last_message_id" not in workflow_data:
- workflow_data["last_message_id"] = ""
-
- return self.db.record_create("workflows", workflow_data)
-
- def update_workflow(self, workflow_id: str, workflow_data: Dict[str, Any]) -> Dict[str, Any]:
- """
- Aktualisiert einen vorhandenen Workflow.
-
- Args:
- workflow_id: ID des zu aktualisierenden Workflows
- workflow_data: Neue Daten für den Workflow
-
- Returns:
- Das aktualisierte Workflow-Objekt
- """
- # Prüfen, ob der Workflow existiert
- workflow = self.get_workflow(workflow_id)
- if not workflow:
- return None
-
- # Aktualisierungszeit setzen
- workflow_data["last_activity"] = self._get_current_timestamp()
-
- # Workflow aktualisieren
- return self.db.record_modify("workflows", workflow_id, workflow_data)
-
- def delete_workflow(self, workflow_id: str) -> bool:
- """
- Löscht einen Workflow aus der Datenbank.
-
- Args:
- workflow_id: ID des zu löschenden Workflows
-
- Returns:
- True bei Erfolg, False wenn der Workflow nicht existiert
- """
- # Prüfen, ob der Workflow existiert
- workflow = self.get_workflow(workflow_id)
- if not workflow:
- return False
-
- # Prüfen, ob der Benutzer der Eigentümer ist oder Admin-Rechte hat
- if workflow.get("user_id") != self.user_id:
- # Hier könnte eine Prüfung auf Admin-Rechte erfolgen
- return False
-
- # Workflow löschen
- return self.db.record_delete("workflows", workflow_id)
-
-
- # Workflow Messages
-
- def get_workflow_messages(self, workflow_id: str) -> List[Dict[str, Any]]:
- """Gibt alle Nachrichten eines Workflows zurück"""
- return self.db.get_recordset("workflow_messages", record_filter={"workflow_id": workflow_id})
-
- def create_workflow_message(self, message_data: Dict[str, Any]) -> Dict[str, Any]:
- """Erstellt eine neue Nachricht für einen Workflow
-
- Args:
- message_data: Die Nachrichtendaten
-
- Returns:
- Die erstellte Nachricht oder None bei Fehler
- """
- try:
- # Check if required fields are present
- required_fields = ["id", "workflow_id"]
- for field in required_fields:
- if field not in message_data:
- logger.error(f"Pflichtfeld '{field}' fehlt in message_data")
- raise ValueError(f"Pflichtfeld '{field}' fehlt in den Nachrichtendaten")
-
- # Validate that ID is not None
- if message_data["id"] is None:
- message_data["id"] = f"msg_{uuid.uuid4()}"
- logger.warning(f"Automatisch generierte ID für Workflow-Nachricht: {message_data['id']}")
-
- # Stellen Sie sicher, dass die benötigten Felder vorhanden sind
- if "started_at" not in message_data and "created_at" not in message_data:
- message_data["started_at"] = self._get_current_timestamp()
-
- # Wenn "created_at" vorhanden ist, übertrage es nach "started_at"
- if "created_at" in message_data and "started_at" not in message_data:
- message_data["started_at"] = message_data["created_at"]
- del message_data["created_at"]
-
- # Status setzen, falls nicht vorhanden
- if "status" not in message_data:
- message_data["status"] = "completed"
-
- # Sequenznummer setzen, falls nicht vorhanden
- if "sequence_no" not in message_data:
- # Hole aktuelle Nachrichten, um die nächste Sequenznummer zu bestimmen
- existing_messages = self.get_workflow_messages(message_data["workflow_id"])
- message_data["sequence_no"] = len(existing_messages) + 1
-
- # Debug-Log für die zu erstellenden Daten
- logger.debug(f"Erstelle Workflow-Nachricht mit Daten: {message_data}")
-
- return self.db.record_create("workflow_messages", message_data)
- except Exception as e:
- logger.error(f"Fehler beim Erstellen der Workflow-Nachricht: {str(e)}")
- # Return None instead of raising to avoid cascading failures
- return None
-
- def update_workflow_message(self, message_id: str, message_data: Dict[str, Any]) -> Dict[str, Any]:
- """
- Aktualisiert eine bestehende Workflow-Nachricht in der Datenbank
- with improved document handling.
-
- Args:
- message_id: ID der Nachricht
- message_data: Zu aktualisierende Daten
-
- Returns:
- Das aktualisierte Nachrichtenobjekt oder None bei Fehler
- """
- try:
- # Print debug info
- print(f"Updating message {message_id} in database")
-
- # Ensure message_id is provided
- if not message_id:
- logger.error("No message_id provided for update_workflow_message")
- raise ValueError("message_id cannot be empty")
-
- # Check if message exists in database
- messages = self.db.get_recordset("workflow_messages", record_filter={"id": message_id})
- if not messages:
- logger.warning(f"Message with ID {message_id} does not exist in database")
-
- # If message doesn't exist but we have workflow_id, create it
- if "workflow_id" in message_data:
- logger.info(f"Creating new message with ID {message_id} for workflow {message_data.get('workflow_id')}")
- return self.db.record_create("workflow_messages", message_data)
- else:
- logger.error(f"Workflow ID missing for new message {message_id}")
- return None
-
- # Ensure documents array is handled properly
- if "documents" in message_data:
- logger.info(f"Message {message_id} has {len(message_data['documents'])} documents")
-
- # Make sure we're not storing huge content in the database
- # For each document, ensure content size is reasonable
- documents_to_store = []
- for doc in message_data["documents"]:
- doc_copy = doc.copy()
-
- # Process contents array if it exists
- if "contents" in doc_copy:
- # Ensure contents is not too large - limit text size
- for content in doc_copy["contents"]:
- if content.get("type") == "text" and "text" in content:
- text = content["text"]
- if len(text) > 1000: # Limit text preview to 1000 chars
- content["text"] = text[:1000] + "... [truncated]"
-
- documents_to_store.append(doc_copy)
-
- # Replace with the processed documents
- message_data["documents"] = documents_to_store
-
- # Log the update data size for debugging
- update_data_size = len(str(message_data))
- logger.debug(f"Update data size: {update_data_size} bytes")
-
- # Ensure ID is in the dataset
- if 'id' not in message_data:
- message_data['id'] = message_id
-
- # Konvertiere created_at zu started_at falls nötig
- if "created_at" in message_data and "started_at" not in message_data:
- message_data["started_at"] = message_data["created_at"]
- del message_data["created_at"]
-
- # Update the message
- updated_message = self.db.record_modify("workflow_messages", message_id, message_data)
- if updated_message:
- logger.info(f"Message {message_id} updated successfully")
- else:
- logger.warning(f"Failed to update message {message_id}")
-
- return updated_message
- except Exception as e:
- logger.error(f"Error updating message {message_id}: {str(e)}", exc_info=True)
- # Re-raise with full information
- raise ValueError(f"Error updating message {message_id}: {str(e)}")
-
- def delete_workflow_message(self, workflow_id: str, message_id: str) -> bool:
- """
- Löscht eine Nachricht aus einem Workflow in der Datenbank.
-
- Args:
- workflow_id: ID des zugehörigen Workflows
- message_id: ID der zu löschenden Nachricht
-
- Returns:
- True bei Erfolg, False bei Fehler
- """
- try:
- # Prüfen, ob die Nachricht existiert
- messages = self.get_workflow_messages(workflow_id)
- message = next((m for m in messages if m.get("id") == message_id), None)
-
- if not message:
- logger.warning(f"Nachricht {message_id} für Workflow {workflow_id} nicht gefunden")
- return False
-
- # Nachricht aus der Datenbank löschen
- return self.db.record_delete("workflow_messages", message_id)
- except Exception as e:
- logger.error(f"Fehler beim Löschen der Nachricht {message_id}: {str(e)}")
- return False
-
- def delete_file_from_message(self, workflow_id: str, message_id: str, file_id: int) -> bool:
- """
- Entfernt eine Dateireferenz aus einer Nachricht.
- Die Datei selbst wird nicht gelöscht, nur die Referenz in der Nachricht.
- Enhanced version with improved file matching.
-
- Args:
- workflow_id: ID des zugehörigen Workflows
- message_id: ID der Nachricht
- file_id: ID der zu entfernenden Datei
-
- Returns:
- True bei Erfolg, False bei Fehler
- """
- try:
- # Log operation
- logger.info(f"Removing file {file_id} from message {message_id} in workflow {workflow_id}")
-
- # Get all workflow messages
- all_messages = self.get_workflow_messages(workflow_id)
- logger.debug(f"Workflow {workflow_id} has {len(all_messages)} messages")
-
- # Try different approaches to find the message
- message = None
-
- # Exact match
- message = next((m for m in all_messages if m.get("id") == message_id), None)
-
- # Case-insensitive match
- if not message and isinstance(message_id, str):
- message = next((m for m in all_messages
- if isinstance(m.get("id"), str) and m.get("id").lower() == message_id.lower()), None)
-
- # Partial match (starts with)
- if not message and isinstance(message_id, str):
- message = next((m for m in all_messages
- if isinstance(m.get("id"), str) and m.get("id").startswith(message_id)), None)
-
- if not message:
- logger.warning(f"Message {message_id} not found in workflow {workflow_id}")
- return False
-
- # Log the found message
- logger.info(f"Found message: {message.get('id')}")
-
- # Check if message has documents
- if "documents" not in message or not message["documents"]:
- logger.warning(f"No documents in message {message_id}")
- return False
-
- # Log existing documents
- documents = message.get("documents", [])
- logger.debug(f"Message has {len(documents)} documents")
- for i, doc in enumerate(documents):
- doc_id = doc.get("id", "unknown")
- file_id_value = doc.get("file_id", "unknown")
- logger.debug(f"Document {i}: doc_id={doc_id}, file_id={file_id_value}")
-
- # Create a new list of documents without the one to delete
- updated_documents = []
- removed = False
-
- for doc in documents:
- doc_id = doc.get("id")
- file_id_value = doc.get("file_id")
-
- # Flexible matching approach
- should_remove = (
- (doc_id == file_id) or
- (file_id_value == file_id) or
- (isinstance(doc_id, str) and str(file_id) in doc_id) or
- (isinstance(file_id_value, str) and str(file_id) in file_id_value)
- )
-
- if should_remove:
- removed = True
- logger.info(f"Found file to remove: doc_id={doc_id}, file_id={file_id_value}")
- else:
- updated_documents.append(doc)
-
- if not removed:
- logger.warning(f"No matching file {file_id} found in message {message_id}")
- return False
-
- # Update message with modified documents array
- message_update = {
- "documents": updated_documents
- }
-
- # Apply the update directly to the database
- updated = self.db.record_modify("workflow_messages", message["id"], message_update)
-
- if updated:
- logger.info(f"Successfully removed file {file_id} from message {message_id}")
- return True
- else:
- logger.warning(f"Failed to update message {message_id} in database")
- return False
-
- except Exception as e:
- logger.error(f"Error removing file {file_id} from message {message_id}: {str(e)}")
- return False
-
-
- # Workflow Logs
-
- def get_workflow_logs(self, workflow_id: str) -> List[Dict[str, Any]]:
- """Gibt alle Log-Einträge eines Workflows zurück"""
- return self.db.get_recordset("workflow_logs", record_filter={"workflow_id": workflow_id})
-
- def create_workflow_log(self, log_data: Dict[str, Any]) -> Dict[str, Any]:
- """Erstellt einen neuen Log-Eintrag für einen Workflow"""
- # Stellen Sie sicher, dass die benötigten Felder vorhanden sind
- if "timestamp" not in log_data:
- log_data["timestamp"] = self._get_current_timestamp()
-
- return self.db.record_create("workflow_logs", log_data)
-
-
- # Workflow Management
-
- def save_workflow_state(self, workflow: Dict[str, Any], save_messages: bool = True, save_logs: bool = True) -> bool:
- """
- Speichert den kompletten Zustand eines Workflows in der Datenbank.
- Dies umfasst den Workflow selbst, Nachrichten und Logs.
-
- Args:
- workflow: Das vollständige Workflow-Objekt
- save_messages: Flag, ob Nachrichten gespeichert werden sollen
- save_logs: Flag, ob Logs gespeichert werden sollen
-
- Returns:
- True bei Erfolg, False bei Fehler
- """
- try:
- workflow_id = workflow.get("id")
- if not workflow_id:
- return False
-
- # Extrahiere nur die für die Datenbank relevanten Workflow-Felder
- workflow_db_data = {
- "id": workflow_id,
- "mandate_id": workflow.get("mandate_id", self.mandate_id),
- "user_id": workflow.get("user_id", self.user_id),
- "name": workflow.get("name", f"Workflow {workflow_id}"),
- "status": workflow.get("status", "unknown"),
- "started_at": workflow.get("started_at", self._get_current_timestamp()),
- "last_activity": workflow.get("last_activity", self._get_current_timestamp()),
- "last_message_id": workflow.get("last_message_id", ""),
- "data_stats": workflow.get("data_stats", {})
- }
-
- # Prüfen, ob der Workflow bereits existiert
- existing_workflow = self.get_workflow(workflow_id)
- if existing_workflow:
- self.update_workflow(workflow_id, workflow_db_data)
- else:
- self.create_workflow(workflow_db_data)
-
-
- # Nachrichten speichern
- if save_messages and "messages" in workflow:
- # Bestehende Nachrichten abrufen
- existing_messages = {msg["id"]: msg for msg in self.get_workflow_messages(workflow_id)}
-
- for message in workflow["messages"]:
- message_id = message.get("id")
- if not message_id:
- continue
-
- # Nur relevante Daten für die Datenbank extrahieren
- message_data = {
- "id": message_id,
- "workflow_id": workflow_id,
- "sequence_no": message.get("sequence_no", 0),
- "role": message.get("role", "unknown"),
- "content": message.get("content"),
- "agent_name": message.get("agent_name"),
- "status": message.get("status", "completed"),
- "started_at": message.get("started_at", self._get_current_timestamp()),
- "finished_at": message.get("finished_at"),
- "parent_message_id": message.get("parent_message_id"),
- # IMPORTANT: Include documents field to persist file attachments
- "documents": message.get("documents", [])
- }
-
- # Debug logging for documents
- doc_count = len(message.get("documents", []))
- if doc_count > 0:
- logger.info(f"Message {message_id} has {doc_count} documents to save")
-
- # Nachricht erstellen oder aktualisieren
- if message_id in existing_messages:
- self.db.record_modify("workflow_messages", message_id, message_data)
- else:
- self.db.record_create("workflow_messages", message_data)
-
- # Logs speichern
- if save_logs and "logs" in workflow:
- # Bestehende Logs abrufen
- existing_logs = {log["id"]: log for log in self.get_workflow_logs(workflow_id)}
-
- for log in workflow["logs"]:
- log_id = log.get("id")
- if not log_id:
- continue
-
- # Nur relevante Daten für die Datenbank extrahieren
- log_data = {
- "id": log_id,
- "workflow_id": workflow_id,
- "message": log.get("message", ""),
- "type": log.get("type", "info"),
- "timestamp": log.get("timestamp", self._get_current_timestamp()),
- "agent_id": log.get("agent_id"),
- "agent_name": log.get("agent_name")
- }
-
- # Log erstellen oder aktualisieren
- if log_id in existing_logs:
- self.db.record_modify("workflow_logs", log_id, log_data)
- else:
- self.db.record_create("workflow_logs", log_id, log_data)
-
- return True
- except Exception as e:
- logger.error(f"Fehler beim Speichern des Workflow-Zustands: {str(e)}")
- return False
-
- def load_workflow_state(self, workflow_id: str) -> Optional[Dict[str, Any]]:
- """
- Lädt den kompletten Zustand eines Workflows aus der Datenbank.
- Dies umfasst den Workflow selbst, Nachrichten und Logs.
-
- Args:
- workflow_id: ID des zu ladenden Workflows
-
- Returns:
- Das vollständige Workflow-Objekt oder None bei Fehler
- """
- try:
- # Basis-Workflow laden
- workflow = self.get_workflow(workflow_id)
- if not workflow:
- return None
-
- # Log the workflow base retrieval
- logger.debug(f"Loaded base workflow {workflow_id} from database")
-
- # Nachrichten laden
- messages = self.get_workflow_messages(workflow_id)
- # Nach Sequenznummer sortieren
- messages.sort(key=lambda x: x.get("sequence_no", 0))
-
- # Debug log for messages and document counts
- message_count = len(messages)
- logger.debug(f"Loaded {message_count} messages for workflow {workflow_id}")
-
- # Log document counts for each message
- for msg in messages:
- doc_count = len(msg.get("documents", []))
- if doc_count > 0:
- logger.info(f"Message {msg.get('id')} has {doc_count} documents loaded from database")
- # Log document details for debugging
- for i, doc in enumerate(msg.get("documents", [])):
- file_id = doc.get("file_id", "unknown")
- logger.debug(f"Document {i+1}: file_id={file_id}")
-
- # Logs laden
- logs = self.get_workflow_logs(workflow_id)
- # Nach Zeitstempel sortieren
- logs.sort(key=lambda x: x.get("timestamp", ""))
-
- # Vollständiges Workflow-Objekt zusammenbauen
- complete_workflow = workflow.copy()
- complete_workflow["messages"] = messages
- complete_workflow["logs"] = logs
-
- return complete_workflow
- except Exception as e:
- logger.error(f"Fehler beim Laden des Workflow-Zustands: {str(e)}")
- return None
-
-
-# Singleton-Factory für LucyDOMInterface-Instanzen pro Kontext
-_lucydom_interfaces = {}
-
-def get_lucydom_interface(mandate_id: int = 0, user_id: int = 0) -> LucyDOMInterface:
- """
- Gibt eine LucyDOMInterface-Instanz für den angegebenen Kontext zurück.
- Wiederverwendet bestehende Instanzen.
- """
- context_key = f"{mandate_id}_{user_id}"
- if context_key not in _lucydom_interfaces:
- _lucydom_interfaces[context_key] = LucyDOMInterface(mandate_id, user_id)
- return _lucydom_interfaces[context_key]
-
-# Init
-get_lucydom_interface()
\ No newline at end of file
diff --git a/modules/chat_agent_analyst.py b/modules/chat_agent_analyst.py
index d28cfb43..39bf6520 100644
--- a/modules/chat_agent_analyst.py
+++ b/modules/chat_agent_analyst.py
@@ -1,17 +1,14 @@
"""
Data analyst agent for analysis and interpretation of data.
-Optimized for the new task-based processing.
+Focuses on output-first design with AI-powered analysis.
"""
import logging
import json
-import re
-import uuid
import io
import base64
-from typing import Dict, Any, List, Optional
+from typing import Dict, Any, List
import pandas as pd
-import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
@@ -20,26 +17,23 @@ from modules.chat_registry import AgentBase
logger = logging.getLogger(__name__)
class AgentAnalyst(AgentBase):
- """Agent for analysis and interpretation of data"""
+ """AI-driven agent for data analysis and visualization"""
def __init__(self):
"""Initialize the data analysis agent"""
super().__init__()
self.name = "analyst"
- self.description = "Analyzes and interprets data using statistical methods and visualizations"
+ self.description = "Analyzes data using AI-powered insights and visualizations, produce diagrams and visualizations"
self.capabilities = [
"data_analysis",
- "pattern_recognition",
"statistics",
"visualization",
- "data_interpretation"
+ "data_interpretation",
+ "report_generation"
]
- # Visualization settings
- self.plt_style = 'seaborn-v0_8-whitegrid'
- self.default_figsize = (10, 6)
- self.chart_dpi = 100
- plt.style.use(self.plt_style)
+ # Set default visualization settings
+ plt.style.use('seaborn-v0_8-whitegrid')
def set_dependencies(self, ai_service=None):
"""Set external dependencies for the agent."""
@@ -47,616 +41,645 @@ class AgentAnalyst(AgentBase):
async def process_task(self, task: Dict[str, Any]) -> Dict[str, Any]:
"""
- Process a standardized task structure and perform data analysis.
+ Process a task by focusing on required outputs and using AI to generate them.
Args:
- task: A dictionary containing:
- - task_id: Unique ID for this task
- - prompt: The main instruction for the agent
- - input_documents: List of documents to process
- - output_specifications: List of required output documents
- - context: Additional contextual information
-
+ task: Task dictionary with prompt, input_documents, output_specifications
+
Returns:
- A dictionary containing:
- - feedback: Text response explaining the analysis results
- - documents: List of created document objects
+ Dictionary with feedback and documents
"""
try:
- # Extract relevant task information
+ # Extract task information
prompt = task.get("prompt", "")
input_documents = task.get("input_documents", [])
output_specs = task.get("output_specifications", [])
- # Check if AI service is available
+ # Check AI service
if not self.ai_service:
- logger.error("No AI service configured for the Analyst agent")
return {
- "feedback": "The Analyst agent is not properly configured.",
+ "feedback": "The Analyst agent requires an AI service to function.",
"documents": []
}
- # Extract data from input documents
- data_frames, document_context = self._extract_data_from_documents(input_documents)
+ # Extract data from documents - focusing only on data_extracted
+ datasets, document_context = self._extract_data(input_documents)
- # Check if we have analyzable content
- have_analyzable_content = len(data_frames) > 0 or (prompt and len(prompt.strip()) > 10)
+ # Generate task analysis to understand what's needed
+ analysis_plan = await self._analyze_task(prompt, document_context, datasets, output_specs)
- if not have_analyzable_content:
- # Warning if no analyzable content available
- logger.warning("No analyzable content found")
- feedback = "I couldn't find any processable data in the provided documents."
- return {
- "feedback": feedback,
- "documents": []
- }
+ # Generate all required output documents
+ documents = []
- # Determine analysis type
- analysis_type = self._determine_analysis_type(prompt)
- logger.info(f"Performing {analysis_type} analysis")
+ # If no output specs provided, create default analysis outputs
+ if not output_specs:
+ output_specs = []
- # Store generated documents
- generated_documents = []
-
- # Extract data insights if DataFrames are available
- data_insights = ""
- if data_frames:
- data_insights = self._extract_data_insights(data_frames)
- logger.info(f"Extracted insights from {len(data_frames)} datasets")
-
- # Generate an appropriate document for each requested output
+ # Process each output specification
for spec in output_specs:
output_label = spec.get("label", "")
output_description = spec.get("description", "")
- # Determine format based on file extension
- format_type = self._determine_format_type(output_label)
+ # Determine type based on file extension
+ output_type = output_label.split('.')[-1].lower() if '.' in output_label else "txt"
- # Special handling for visualizations if required
- if "chart" in output_label.lower() or "plot" in output_label.lower() or "visualization" in output_label.lower() or format_type in ["png", "jpg", "svg"]:
- # Generate visualization document if data available
- if data_frames:
- viz_document = self._generate_visualization_document(data_frames, analysis_type, prompt, output_label)
- generated_documents.append(viz_document)
- else:
- # Fallback if no data
- generated_documents.append({
- "label": output_label,
- "content": "No data available for visualization."
- })
- else:
- # Create text-based analysis
- content = await self._generate_analysis_document(
- prompt,
- document_context,
- data_insights,
- analysis_type,
- format_type,
- output_label,
- output_description
+ # Generate appropriate content based on output type
+ if output_type in ['png', 'jpg', 'jpeg', 'svg']:
+ # Create visualization
+ document = await self._create_visualization(
+ datasets, prompt, output_label, analysis_plan, output_description
)
-
- generated_documents.append({
- "label": output_label,
- "content": content
- })
+ documents.append(document)
+ elif output_type in ['csv', 'json', 'xlsx']:
+ # Create data document
+ document = await self._create_data_document(
+ datasets, prompt, output_label, analysis_plan, output_description
+ )
+ documents.append(document)
+ else:
+ # Create text document (report, analysis, etc.)
+ document = await self._create_text_document(
+ datasets, document_context, prompt, output_label,
+ output_type, analysis_plan, output_description
+ )
+ documents.append(document)
- # If no specific outputs requested, create standard documents
- if not output_specs:
- # Standard analysis
- analysis_content = await self._generate_analysis_document(
- prompt,
- document_context,
- data_insights,
- analysis_type,
- "markdown",
- "analysis_report.md",
- "Analysis report"
- )
-
- generated_documents.append({
- "label": "analysis_report.md",
- "content": analysis_content
- })
-
- # Add visualization if data available
- if data_frames:
- viz_document = self._generate_visualization_document(data_frames, analysis_type, prompt, "data_visualization.png")
- generated_documents.append(viz_document)
-
- # Create feedback
- if data_frames:
- feedback = f"I analyzed {len(data_frames)} datasets and created {len(generated_documents)} documents with the results."
- else:
- feedback = f"I performed a text analysis and created {len(generated_documents)} documents with the results."
+ # Generate feedback
+ feedback = f"Analysis complete. Created {len(documents)} documents based on your requirements."
+ if analysis_plan.get("key_insights"):
+ feedback += f"\n\nKey insights: {analysis_plan.get('key_insights')}"
return {
"feedback": feedback,
- "documents": generated_documents
+ "documents": documents
}
except Exception as e:
- error_msg = f"Error during data analysis: {str(e)}"
- logger.error(error_msg)
+ logger.error(f"Error in analysis: {str(e)}", exc_info=True)
return {
- "feedback": f"An error occurred during data analysis: {str(e)}",
+ "feedback": f"Error during analysis: {str(e)}",
"documents": []
}
- def _extract_data_from_documents(self, documents: List[Dict[str, Any]]) -> tuple:
+ def _extract_data(self, documents: List[Dict[str, Any]]) -> tuple:
"""
- Extract data from input documents.
+ Extract data from documents, focusing on data_extracted fields.
Args:
documents: List of input documents
Returns:
- Tuple of (Dictionary of DataFrames, Document context text)
+ Tuple of (datasets dictionary, document context text)
"""
- data_frames = {}
+ datasets = {}
document_context = ""
+ # Process each document
for doc in documents:
doc_name = doc.get("name", "unnamed")
+ if doc.get("ext"):
+ doc_name = f"{doc_name}.{doc.get('ext')}"
+
document_context += f"\n\n--- {doc_name} ---\n"
+ # Process contents
for content in doc.get("contents", []):
- # Extract text content and add to context
- if content.get("metadata", {}).get("is_text", False):
- document_context += content.get("data", "")
+ # Focus only on data_extracted
+ if content.get("data_extracted"):
+ extracted_text = content.get("data_extracted", "")
+ document_context += extracted_text
- # Try to parse CSV, JSON, or other data files from text
- if doc_name.lower().endswith('.csv'):
+ # Try to parse as structured data if appropriate
+ if doc_name.lower().endswith(('.csv', '.tsv')):
try:
- df = pd.read_csv(io.StringIO(content.get("data", "")))
- df = self._preprocess_dataframe(df)
- data_frames[doc_name] = df
- logger.info(f"Extracted CSV data from {doc_name}: {df.shape}")
- except Exception as e:
- logger.warning(f"Error parsing CSV {doc_name}: {str(e)}")
-
+ df = pd.read_csv(io.StringIO(extracted_text))
+ datasets[doc_name] = df
+ except:
+ pass
elif doc_name.lower().endswith('.json'):
try:
- json_data = json.loads(content.get("data", ""))
+ json_data = json.loads(extracted_text)
if isinstance(json_data, list):
df = pd.DataFrame(json_data)
+ datasets[doc_name] = df
elif isinstance(json_data, dict):
- # Convert nested JSON to DataFrame
+ # Handle nested JSON structures
if any(isinstance(v, list) for v in json_data.values()):
- # If lists present, try to use them
for key, value in json_data.items():
if isinstance(value, list) and len(value) > 0:
df = pd.DataFrame(value)
- break
- else:
- continue
+ datasets[f"{doc_name}:{key}"] = df
else:
df = pd.DataFrame([json_data])
- else:
- continue
-
- df = self._preprocess_dataframe(df)
- data_frames[doc_name] = df
- logger.info(f"Extracted JSON data from {doc_name}: {df.shape}")
- except Exception as e:
- logger.warning(f"Error parsing JSON {doc_name}: {str(e)}")
+ datasets[doc_name] = df
+ except:
+ pass
+
+ # Try to detect tabular data in text content
+ if doc_name not in datasets and len(extracted_text.splitlines()) > 2:
+ lines = extracted_text.splitlines()
+ if any(',' in line for line in lines[:5]):
+ try:
+ df = pd.read_csv(io.StringIO(extracted_text))
+ if len(df.columns) > 1:
+ datasets[doc_name] = df
+ except:
+ pass
+ elif any('\t' in line for line in lines[:5]):
+ try:
+ df = pd.read_csv(io.StringIO(extracted_text), sep='\t')
+ if len(df.columns) > 1:
+ datasets[doc_name] = df
+ except:
+ pass
- return data_frames, document_context
+ return datasets, document_context
- def _determine_format_type(self, output_label: str) -> str:
+ async def _analyze_task(self, prompt: str, context: str, datasets: Dict, output_specs: List) -> Dict:
"""
- Determine the format type based on the filename.
+ Use AI to analyze the task and create a plan for analysis.
Args:
- output_label: Output filename
+ prompt: The task prompt
+ context: Document context text
+ datasets: Dictionary of extracted datasets
+ output_specs: Output specifications
Returns:
- Format type (markdown, html, text, png, etc.)
+ Analysis plan dictionary
"""
- output_label_lower = output_label.lower()
+ # Prepare dataset information
+ dataset_info = {}
+ for name, df in datasets.items():
+ try:
+ dataset_info[name] = {
+ "shape": df.shape,
+ "columns": df.columns.tolist(),
+ "dtypes": {col: str(df[col].dtype) for col in df.columns},
+ "sample": df.head(3).to_dict(orient='records')
+ }
+ except:
+ dataset_info[name] = {"error": "Could not process dataset"}
- if output_label_lower.endswith('.md'):
- return "markdown"
- elif output_label_lower.endswith('.html'):
- return "html"
- elif output_label_lower.endswith('.txt'):
- return "text"
- elif output_label_lower.endswith('.json'):
- return "json"
- elif output_label_lower.endswith('.csv'):
- return "csv"
- elif output_label_lower.endswith('.png'):
- return "png"
- elif output_label_lower.endswith('.jpg') or output_label_lower.endswith('.jpeg'):
- return "jpg"
- elif output_label_lower.endswith('.svg'):
- return "svg"
- else:
- # Default to markdown
- return "markdown"
-
- def _preprocess_dataframe(self, df: pd.DataFrame) -> pd.DataFrame:
- """Perform basic preprocessing for a DataFrame"""
- if df.empty:
- return df
+ analysis_prompt = f"""
+ Analyze this data analysis task and create a plan.
- # Remove completely empty rows and columns
- df = df.dropna(how='all')
- df = df.dropna(axis=1, how='all')
+ TASK: {prompt}
- # String conversion to numeric values where appropriate
- for col in df.columns:
- # Skip if already numeric
- if pd.api.types.is_numeric_dtype(df[col]):
- continue
+ AVAILABLE DATA:
+ {json.dumps(dataset_info, indent=2)}
+
+ DOCUMENT CONTEXT:
+ {context[:1000]}... (truncated)
+
+ OUTPUT REQUIREMENTS:
+ {json.dumps(output_specs, indent=2)}
+
+ Create a detailed analysis plan in JSON format with the following structure:
+ {{
+ "analysis_type": "statistical|trend|comparative|predictive|cluster|general",
+ "key_questions": ["question1", "question2"],
+ "recommended_visualizations": [{{
+ "type": "chart_type",
+ "data_source": "dataset_name",
+ "variables": ["col1", "col2"],
+ "purpose": "explanation"
+ }}],
+ "key_insights": "brief summary of initial insights",
+ "analysis_approach": "brief description of recommended approach"
+ }}
+
+ Only return valid JSON. No preamble or explanations.
+ """
+
+ try:
+ response = await self.ai_service.call_api([
+ {"role": "system", "content": "You are a data analysis expert. Respond with valid JSON only."},
+ {"role": "user", "content": analysis_prompt}
+ ])
- # Skip if predominantly non-numeric strings
- if df[col].dtype == 'object':
- # Check if more than 80% of non-NA values could be numeric
- non_na_values = df[col].dropna()
- if len(non_na_values) == 0:
- continue
+ # Extract JSON from response
+ json_start = response.find('{')
+ json_end = response.rfind('}') + 1
+
+ if json_start >= 0 and json_end > json_start:
+ plan = json.loads(response[json_start:json_end])
+ return plan
+ else:
+ # Fallback if JSON not found
+ return {
+ "analysis_type": "general",
+ "key_questions": ["What insights can be extracted from this data?"],
+ "recommended_visualizations": [],
+ "key_insights": "Analysis plan could not be created",
+ "analysis_approach": "General exploratory analysis"
+ }
- # Attempt conversion to numeric values
- numeric_count = pd.to_numeric(non_na_values, errors='coerce').notna().sum()
- if numeric_count / len(non_na_values) > 0.8:
- # More than 80% can be converted to numeric values
- df[col] = pd.to_numeric(df[col], errors='coerce')
-
- return df
+ except Exception as e:
+ logger.warning(f"Error creating analysis plan: {str(e)}")
+ return {
+ "analysis_type": "general",
+ "key_questions": ["What insights can be extracted from this data?"],
+ "recommended_visualizations": [],
+ "key_insights": "Analysis plan could not be created",
+ "analysis_approach": "General exploratory analysis"
+ }
- def _determine_analysis_type(self, task: str) -> str:
+ async def _create_visualization(self, datasets: Dict, prompt: str, output_label: str,
+ analysis_plan: Dict, description: str) -> Dict:
"""
- Determine the analysis type based on the task.
+ Create visualization document using AI guidance.
Args:
- task: The analysis task
-
- Returns:
- Analysis type
- """
- # Using universal patterns rather than language-specific keywords
- task_lower = task.lower()
-
- # Check for statistical analysis
- if "statistical" in task_lower or "stats" in task_lower:
- return "statistical"
-
- # Check for trend analysis
- elif "trend" in task_lower or "time series" in task_lower:
- return "trend"
-
- # Check for comparative analysis
- elif "compare" in task_lower or "comparison" in task_lower or "vs" in task_lower:
- return "comparative"
-
- # Check for predictive analysis
- elif "predict" in task_lower or "forecast" in task_lower:
- return "predictive"
-
- # Check for clustering or categorization
- elif "cluster" in task_lower or "segment" in task_lower or "classify" in task_lower:
- return "clustering"
-
- # Default: general analysis
- else:
- return "general"
-
- def _extract_data_insights(self, data_frames: Dict[str, pd.DataFrame]) -> str:
- """
- Extract basic insights from DataFrames.
-
- Args:
- data_frames: Dictionary of DataFrames
-
- Returns:
- Extracted insights as text
- """
- insights = []
-
- for name, df in data_frames.items():
- if df.empty:
- continue
-
- insight = f"Dataset: {name}\n"
- insight += f"Shape: {df.shape[0]} rows, {df.shape[1]} columns\n"
- insight += f"Columns: {', '.join(df.columns.tolist())}\n"
-
- # Basic statistics for numeric columns
- numeric_cols = df.select_dtypes(include=['number']).columns
- if len(numeric_cols) > 0:
- insight += "Statistics for numeric columns:\n"
- for col in numeric_cols[:5]: # Limit to first 5 columns
- stats = df[col].describe()
- insight += f" {col}: min={stats['min']:.2f}, max={stats['max']:.2f}, mean={stats['mean']:.2f}, median={df[col].median():.2f}\n"
-
- # Categorical column values
- cat_cols = df.select_dtypes(include=['object', 'category']).columns
- if len(cat_cols) > 0:
- insight += "Categorical columns:\n"
- for col in cat_cols[:3]: # Limit to first 3 columns
- # Get top 3 values
- top_values = df[col].value_counts().head(3)
- vals_str = ", ".join([f"{val} ({count})" for val, count in top_values.items()])
- insight += f" {col}: {df[col].nunique()} unique values. Most common values: {vals_str}\n"
-
- insights.append(insight)
-
- return "\n\n".join(insights)
-
- def _generate_visualization_document(self, data_frames: Dict[str, pd.DataFrame],
- analysis_type: str, prompt: str,
- output_label: str) -> Dict[str, Any]:
- """
- Generate a visualization document based on the data and analysis type.
-
- Args:
- data_frames: Dictionary of DataFrames
- analysis_type: Analysis type
- prompt: Original task description
+ datasets: Dictionary of datasets
+ prompt: Original task prompt
output_label: Output filename
+ analysis_plan: Analysis plan from AI
+ description: Output description
Returns:
Visualization document
"""
# Determine format from filename
- format_type = output_label.split('.')[-1].lower() if '.' in output_label else 'png'
-
- # Set default format if unknown
+ format_type = output_label.split('.')[-1].lower()
if format_type not in ['png', 'jpg', 'jpeg', 'svg']:
format_type = 'png'
-
- # Use first DataFrame for visualization
- if not data_frames:
+
+ # If no datasets available, create error message image
+ if not datasets:
+ plt.figure(figsize=(10, 6))
+ plt.text(0.5, 0.5, "No data available for visualization",
+ ha='center', va='center', fontsize=14)
+ plt.tight_layout()
+ img_data = self._get_image_base64(format_type)
+ plt.close()
+
return {
"label": output_label,
- "content": "No data available for visualization."
+ "content": img_data,
+ "metadata": {
+ "content_type": f"image/{format_type}"
+ }
}
- # Get name and DataFrame of first dataset
- name, df = next(iter(data_frames.items()))
+ # Get recommended visualization from plan
+ recommended_viz = analysis_plan.get("recommended_visualizations", [])
- # Create different visualization types based on analysis type and data
- plt.figure(figsize=self.default_figsize)
+ # Prepare dataset info for the first dataset if none specified
+ if not recommended_viz and datasets:
+ name, df = next(iter(datasets.items()))
+ recommended_viz = [{
+ "type": "auto",
+ "data_source": name,
+ "variables": df.columns.tolist()[:5],
+ "purpose": "general analysis"
+ }]
- if analysis_type == "statistical":
- # Statistical visualization
- self._create_statistical_visualization(df, name)
- elif analysis_type == "trend":
- # Trend visualization
- self._create_trend_visualization(df, name)
- elif analysis_type == "comparative":
- # Comparative visualization
- self._create_comparative_visualization(df, name)
- elif analysis_type == "predictive":
- # Predictive visualization (simple example)
- self._create_predictive_visualization(df, name)
- elif analysis_type == "clustering":
- # Clustering visualization
- self._create_clustering_visualization(df, name)
- else:
- # General visualization
- self._create_general_visualization(df, name)
+ # Create visualization code prompt
+ viz_prompt = f"""
+ Generate Python matplotlib/seaborn code to create a visualization for:
- # Save figure as Base64 string
- img_data = self._get_figure_as_base64(format_type)
- plt.close()
+ TASK: {prompt}
- # Prepare content for document based on format
- if format_type in ['png', 'jpg', 'jpeg']:
- content_str = img_data
- elif format_type == 'svg':
- # SVG content as text
- buffer = io.StringIO()
- plt.savefig(buffer, format='svg')
- content_str = buffer.getvalue()
- buffer.close()
- else:
- # Fallback to PNG
- content_str = img_data
+ VISUALIZATION REQUIREMENTS:
+ - Output format: {format_type}
+ - Filename: {output_label}
+ - Description: {description}
- return {
- "label": output_label,
- "content": content_str
- }
-
- def _create_statistical_visualization(self, df: pd.DataFrame, name: str):
- """Create a statistical visualization for a DataFrame"""
- # Choose numeric columns for display
- numeric_cols = df.select_dtypes(include=['number']).columns[:4] # Limit to first 4
+ RECOMMENDED VISUALIZATION:
+ {json.dumps(recommended_viz, indent=2)}
- if len(numeric_cols) == 0:
- plt.text(0.5, 0.5, "No numeric data found for statistical visualization",
- ha='center', va='center', fontsize=12)
- return
+ AVAILABLE DATASETS:
+ """
- # Visualize distribution of first numeric column
- main_col = numeric_cols[0]
+ # Add dataset info for recommended sources
+ for viz in recommended_viz:
+ data_source = viz.get("data_source")
+ if data_source in datasets:
+ df = datasets[data_source]
+ viz_prompt += f"\nDataset '{data_source}':\n"
+ viz_prompt += f"- Shape: {df.shape}\n"
+ viz_prompt += f"- Columns: {df.columns.tolist()}\n"
+ viz_prompt += f"- Sample data: {df.head(3).to_dict(orient='records')}\n"
- # Create histogram with KDE
- sns.histplot(df[main_col].dropna(), kde=True)
- plt.title(f'Distribution of {main_col} - {name}')
- plt.xlabel(main_col)
- plt.ylabel('Frequency')
- plt.tight_layout()
-
- def _create_trend_visualization(self, df: pd.DataFrame, name: str):
- """Create a trend visualization for a DataFrame"""
- # Choose numeric columns for display
- numeric_cols = df.select_dtypes(include=['number']).columns[:3] # Limit to first 3
+ viz_prompt += """
+ Generate ONLY Python code that:
+ 1. Uses matplotlib and/or seaborn to create a clear visualization
+ 2. Sets figure size to (10, 6)
+ 3. Includes appropriate titles, labels, and legend
+ 4. Uses professional color schemes
+ 5. Handles any missing data gracefully
- if len(numeric_cols) == 0:
- plt.text(0.5, 0.5, "No numeric data found for trend visualization",
- ha='center', va='center', fontsize=12)
- return
+ Return ONLY executable Python code, no explanations or markdown.
+ """
- # Look for date index or use running index
- date_col = None
- for col in df.columns:
- if pd.api.types.is_datetime64_dtype(df[col]) or 'date' in col.lower() or 'time' in col.lower():
- date_col = col
- break
-
- # Use date column as X-axis if available
- if date_col:
- for col in numeric_cols:
- plt.plot(df[date_col], df[col], marker='o', linestyle='-', label=col)
- else:
- # Otherwise use index numbers
- for col in numeric_cols:
- plt.plot(range(len(df)), df[col], marker='o', linestyle='-', label=col)
-
- plt.title(f'Trend Analysis - {name}')
- plt.legend()
- plt.grid(True)
- plt.tight_layout()
-
- def _create_comparative_visualization(self, df: pd.DataFrame, name: str):
- """Create a comparative visualization for a DataFrame"""
- # Choose numeric columns for display
- numeric_cols = df.select_dtypes(include=['number']).columns[:4] # Limit to first 4
-
- if len(numeric_cols) == 0:
- plt.text(0.5, 0.5, "No numeric data found for comparative visualization",
- ha='center', va='center', fontsize=12)
- return
-
- # Find categorical column for grouping
- categorical_cols = df.select_dtypes(include=['object', 'category']).columns
- if len(categorical_cols) > 0:
- category_col = categorical_cols[0]
+ try:
+ # Get visualization code from AI
+ viz_code = await self.ai_service.call_api([
+ {"role": "system", "content": "You are a data visualization expert. Provide only executable Python code."},
+ {"role": "user", "content": viz_prompt}
+ ])
- # Display maximum of first 7 categories
- top_categories = df[category_col].value_counts().head(7).index
- filtered_df = df[df[category_col].isin(top_categories)]
+ # Clean code
+ viz_code = viz_code.replace("```python", "").replace("```", "").strip()
- # Create grouped bar chart
- numeric_col = numeric_cols[0]
- sns.barplot(x=category_col, y=numeric_col, data=filtered_df)
- plt.title(f'Comparison of {numeric_col} by {category_col} - {name}')
- plt.xticks(rotation=45)
+ # Execute visualization code
+ plt.figure(figsize=(10, 6))
+
+ # Make local variables available to the code
+ local_vars = {
+ "plt": plt,
+ "sns": sns,
+ "pd": pd,
+ "np": __import__('numpy')
+ }
+
+ # Add datasets to local variables
+ for name, df in datasets.items():
+ # Create a sanitized variable name
+ var_name = ''.join(c if c.isalnum() else '_' for c in name)
+ local_vars[var_name] = df
+
+ # Also add with standard names for simpler code
+ if "df" not in local_vars:
+ local_vars["df"] = df
+ elif "df2" not in local_vars:
+ local_vars["df2"] = df
+
+ # Execute the visualization code
+ exec(viz_code, globals(), local_vars)
+
+ # Capture the image
+ img_data = self._get_image_base64(format_type)
+ plt.close()
+
+ return {
+ "label": output_label,
+ "content": img_data,
+ "metadata": {
+ "content_type": f"image/{format_type}"
+ }
+ }
+
+ except Exception as e:
+ logger.error(f"Error creating visualization: {str(e)}", exc_info=True)
+
+ # Create error message image
+ plt.figure(figsize=(10, 6))
+ plt.text(0.5, 0.5, f"Visualization error: {str(e)}",
+ ha='center', va='center', fontsize=12)
plt.tight_layout()
- else:
- # Comparative visualization for numeric columns without categories
- if len(numeric_cols) >= 2:
- # Scatter plot for first two numeric columns
- sns.scatterplot(x=numeric_cols[0], y=numeric_cols[1], data=df)
- plt.title(f'Comparison of {numeric_cols[0]} vs {numeric_cols[1]} - {name}')
- plt.tight_layout()
+ img_data = self._get_image_base64(format_type)
+ plt.close()
+
+ return {
+ "label": output_label,
+ "content": img_data,
+ "metadata": {
+ "content_type": f"image/{format_type}"
+ }
+ }
+
+ async def _create_data_document(self, datasets: Dict, prompt: str, output_label: str,
+ analysis_plan: Dict, description: str) -> Dict:
+ """
+ Create a data document (e.g., CSV, JSON) based on analysis.
+
+ Args:
+ datasets: Dictionary of datasets
+ prompt: Original task prompt
+ output_label: Output filename
+ analysis_plan: Analysis plan from AI
+ description: Output description
+
+ Returns:
+ Data document
+ """
+ # Determine format from filename
+ format_type = output_label.split('.')[-1].lower()
+
+ # If no datasets available, return error message
+ if not datasets:
+ return {
+ "label": output_label,
+ "content": f"No data available for processing into {format_type} format.",
+ "metadata": {
+ "content_type": "text/plain"
+ }
+ }
+
+ # Generate data processing instructions
+ data_prompt = f"""
+ Create Python code to process datasets and generate a {format_type} file for:
+
+ TASK: {prompt}
+
+ OUTPUT REQUIREMENTS:
+ - Format: {format_type}
+ - Filename: {output_label}
+ - Description: {description}
+
+ ANALYSIS CONTEXT:
+ {json.dumps(analysis_plan, indent=2)}
+
+ AVAILABLE DATASETS:
+ """
+
+ # Add dataset info
+ for name, df in datasets.items():
+ data_prompt += f"\nDataset '{name}':\n"
+ data_prompt += f"- Shape: {df.shape}\n"
+ data_prompt += f"- Columns: {df.columns.tolist()}\n"
+ data_prompt += f"- Sample data: {df.head(3).to_dict(orient='records')}\n"
+
+ data_prompt += """
+ Generate Python code that:
+ 1. Processes the available dataset(s)
+ 2. Performs necessary transformations, aggregations, or calculations
+ 3. Outputs the result in the requested format
+ 4. Returns the content as a string variable named 'result'
+
+ Return ONLY executable Python code, no explanations or markdown.
+ """
+
+ try:
+ # Get data processing code from AI
+ data_code = await self.ai_service.call_api([
+ {"role": "system", "content": "You are a data processing expert. Provide only executable Python code."},
+ {"role": "user", "content": data_prompt}
+ ])
+
+ # Clean code
+ data_code = data_code.replace("```python", "").replace("```", "").strip()
+
+ # Setup execution environment
+ local_vars = {"pd": pd, "np": __import__('numpy'), "io": io}
+
+ # Add datasets to local variables
+ for name, df in datasets.items():
+ # Create a sanitized variable name
+ var_name = ''.join(c if c.isalnum() else '_' for c in name)
+ local_vars[var_name] = df
+
+ # Also add with standard names for simpler code
+ if "df" not in local_vars:
+ local_vars["df"] = df
+ elif "df2" not in local_vars:
+ local_vars["df2"] = df
+
+ # Execute the code
+ exec(data_code, globals(), local_vars)
+
+ # Get the result
+ result = local_vars.get("result", "No output was generated.")
+
+ # Determine content type
+ content_type = "text/csv" if format_type == "csv" else \
+ "application/json" if format_type == "json" else \
+ "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" if format_type == "xlsx" else \
+ "text/plain"
+
+ return {
+ "label": output_label,
+ "content": result,
+ "metadata": {
+ "content_type": content_type
+ }
+ }
+
+ except Exception as e:
+ logger.error(f"Error creating data document: {str(e)}", exc_info=True)
+
+ return {
+ "label": output_label,
+ "content": f"Error generating {format_type} document: {str(e)}",
+ "metadata": {
+ "content_type": "text/plain"
+ }
+ }
+
+ async def _create_text_document(self, datasets: Dict, context: str, prompt: str,
+ output_label: str, format_type: str,
+ analysis_plan: Dict, description: str) -> Dict:
+ """
+ Create a text document (report, analysis, etc.) based on analysis.
+
+ Args:
+ datasets: Dictionary of datasets
+ context: Document context text
+ prompt: Original task prompt
+ output_label: Output filename
+ format_type: Output format type
+ analysis_plan: Analysis plan from AI
+ description: Output description
+
+ Returns:
+ Text document
+ """
+ # Create dataset summaries
+ dataset_summaries = []
+ for name, df in datasets.items():
+ summary = f"Dataset: {name}\n"
+ summary += f"- Shape: {df.shape[0]} rows, {df.shape[1]} columns\n"
+ summary += f"- Columns: {', '.join(df.columns.tolist())}\n"
+
+ # Basic statistics for numeric columns
+ numeric_cols = df.select_dtypes(include=['number']).columns
+ if len(numeric_cols) > 0:
+ summary += "- Numeric Columns Stats:\n"
+ for col in numeric_cols[:3]: # Limit to first 3
+ stats = df[col].describe()
+ summary += f" - {col}: min={stats['min']:.2f}, max={stats['max']:.2f}, mean={stats['mean']:.2f}\n"
+
+ dataset_summaries.append(summary)
+
+ # Determine content type based on format
+ content_type = "text/markdown" if format_type in ["md", "markdown"] else \
+ "text/html" if format_type == "html" else \
+ "text/plain"
+
+ # Generate analysis prompt
+ analysis_prompt = f"""
+ Create a detailed {format_type} document for:
+
+ TASK: {prompt}
+
+ OUTPUT REQUIREMENTS:
+ - Format: {format_type}
+ - Filename: {output_label}
+ - Description: {description}
+
+ ANALYSIS CONTEXT:
+ {json.dumps(analysis_plan, indent=2)}
+
+ DATASET SUMMARIES:
+ {"".join(dataset_summaries)}
+
+ DOCUMENT CONTEXT:
+ {context[:2000]}... (truncated)
+
+ Create a comprehensive, professional analysis document that addresses the task requirements.
+ The document should:
+ 1. Have a clear structure with headings and sections
+ 2. Include relevant data findings and insights
+ 3. Provide appropriate interpretations and recommendations
+ 4. Format the content according to the required output format
+
+ Your response should be the complete document content in the specified format.
+ """
+
+ try:
+ # Get document content from AI
+ document_content = await self.ai_service.call_api([
+ {"role": "system", "content": f"You are a data analysis expert creating a {format_type} document."},
+ {"role": "user", "content": analysis_prompt}
+ ])
+
+ # Clean HTML or Markdown if needed
+ if format_type in ["md", "markdown"] and not document_content.strip().startswith("#"):
+ document_content = f"# Analysis Report\n\n{document_content}"
+            elif format_type == "html" and "<html" not in document_content.lower():
+                document_content = f"<html><body>{document_content}</body></html>"
+
+ return {
+ "label": output_label,
+ "content": document_content,
+ "metadata": {
+ "content_type": content_type
+ }
+ }
+
+ except Exception as e:
+ logger.error(f"Error creating text document: {str(e)}", exc_info=True)
+
+ # Create a simple error document
+ if format_type in ["md", "markdown"]:
+ content = f"# Error in Analysis\n\nThere was an error generating the analysis: {str(e)}"
+ elif format_type == "html":
+                content = f"<h1>Error in Analysis</h1><p>There was an error generating the analysis: {str(e)}</p>"
else:
- # Simple bar chart for a single numeric column
- plt.bar(range(min(20, len(df))), df[numeric_cols[0]].head(20))
- plt.title(f'Top 20 Values for {numeric_cols[0]} - {name}')
- plt.tight_layout()
-
- def _create_predictive_visualization(self, df: pd.DataFrame, name: str):
- """Create a simple predictive visualization for a DataFrame"""
- # Choose numeric columns for display
- numeric_cols = df.select_dtypes(include=['number']).columns[:2] # Limit to first 2
-
- if len(numeric_cols) < 2:
- plt.text(0.5, 0.5, "At least 2 numeric columns required for predictive visualization",
- ha='center', va='center', fontsize=12)
- return
-
- # Simple scatter plot with trend line
- x = df[numeric_cols[0]].values
- y = df[numeric_cols[1]].values
-
- # Linear regression with NumPy
- valid_indices = ~(np.isnan(x) | np.isnan(y))
- if np.sum(valid_indices) > 1: # At least 2 valid data points
- x_valid = x[valid_indices].reshape(-1, 1)
- y_valid = y[valid_indices]
+ content = f"Error in Analysis\n\nThere was an error generating the analysis: {str(e)}"
- # Linear regression with NumPy polyfit
- if len(x_valid) > 1:
- coeffs = np.polyfit(x_valid.flatten(), y_valid, 1)
- poly_func = np.poly1d(coeffs)
-
- # Create prediction line
- x_line = np.linspace(np.min(x_valid), np.max(x_valid), 100).reshape(-1, 1)
- y_pred = poly_func(x_line)
-
- # Create scatter plot with trend line
- plt.scatter(x_valid, y_valid, alpha=0.7)
- plt.plot(x_line, y_pred, 'r-', linewidth=2)
- plt.title(f'Linear Regression: {numeric_cols[1]} vs {numeric_cols[0]} - {name}')
- plt.xlabel(numeric_cols[0])
- plt.ylabel(numeric_cols[1])
- plt.tight_layout()
- else:
- plt.text(0.5, 0.5, "Insufficient data for predictive analysis",
- ha='center', va='center', fontsize=12)
+ return {
+ "label": output_label,
+ "content": content,
+ "metadata": {
+ "content_type": content_type
+ }
+ }
- def _create_clustering_visualization(self, df: pd.DataFrame, name: str):
- """Create a clustering visualization for a DataFrame"""
- # Choose numeric columns for display
- numeric_cols = df.select_dtypes(include=['number']).columns[:2] # Limit to first 2
-
- if len(numeric_cols) < 2:
- plt.text(0.5, 0.5, "At least 2 numeric columns required for clustering visualization",
- ha='center', va='center', fontsize=12)
- return
-
- # Extract data for first two numeric columns
- x = df[numeric_cols[0]].values
- y = df[numeric_cols[1]].values
-
- # Find categorical column for color coding
- categorical_cols = df.select_dtypes(include=['object', 'category']).columns
-
- if len(categorical_cols) > 0:
- # Use first categorical column for color coding
- category_col = categorical_cols[0]
- categories = df[category_col].astype('category').cat.codes
-
- # Create scatter plot with color coding by category
- plt.scatter(x, y, c=categories, cmap='viridis', alpha=0.7)
- plt.colorbar(label=category_col)
- else:
- # Simple scatter plot without color coding
- plt.scatter(x, y, alpha=0.7)
-
- plt.title(f'Clustering Visualization: {numeric_cols[1]} vs {numeric_cols[0]} - {name}')
- plt.xlabel(numeric_cols[0])
- plt.ylabel(numeric_cols[1])
- plt.tight_layout()
-
- def _create_general_visualization(self, df: pd.DataFrame, name: str):
- """Create a general visualization for a DataFrame"""
- # Choose numeric columns for display
- numeric_cols = df.select_dtypes(include=['number']).columns
-
- if len(numeric_cols) == 0:
- plt.text(0.5, 0.5, "No numeric data found for visualization",
- ha='center', va='center', fontsize=12)
- return
-
- # Create correlation matrix if multiple numeric columns available
- if len(numeric_cols) >= 2:
- corr_matrix = df[numeric_cols].corr()
- sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', vmin=-1, vmax=1)
- plt.title(f'Correlation Matrix - {name}')
- else:
- # Simple distribution for a single numeric column
- sns.histplot(df[numeric_cols[0]].dropna(), kde=True)
- plt.title(f'Distribution of {numeric_cols[0]} - {name}')
-
- plt.tight_layout()
-
- def _get_figure_as_base64(self, format_type: str = 'png') -> str:
+ def _get_image_base64(self, format_type: str = 'png') -> str:
"""
Convert current matplotlib figure to base64 string.
Args:
- format_type: Image format (png, jpg, svg)
+ format_type: Image format
Returns:
- Base64 encoded string of the figure
+ Base64 encoded string of the image
"""
buffer = io.BytesIO()
- plt.savefig(buffer, format=format_type, dpi=self.chart_dpi)
+ plt.savefig(buffer, format=format_type, dpi=100)
buffer.seek(0)
image_data = buffer.getvalue()
buffer.close()
@@ -664,89 +687,9 @@ class AgentAnalyst(AgentBase):
# Convert to base64
image_base64 = base64.b64encode(image_data).decode('utf-8')
return image_base64
-
- async def _generate_analysis_document(self, prompt: str, context: str, data_insights: str,
- analysis_type: str, format_type: str,
- output_label: str, output_description: str) -> str:
- """
- Generate an analysis document based on the data and prompt.
-
- Args:
- prompt: Task description
- context: Document context as text
- data_insights: Insights from the data
- analysis_type: Analysis type
- format_type: Output format
- output_label: Output filename
- output_description: Description of desired output
-
- Returns:
- Generated document content
- """
- if not self.ai_service:
- return f"# Data Analysis ({analysis_type})\n\nAnalysis could not be generated: AI service not available."
-
- # Create specialized prompt based on analysis type
- system_prompt = f"""
- You are a specialized data analyst focused on {analysis_type} analyses.
-
- Create a detailed analysis of the provided data and/or text content.
- Your analysis should include:
- 1. A summary of the data/content
- 2. Key findings and insights
- 3. Supporting evidence and calculations
- 4. Clear conclusions
- 5. Recommendations where appropriate
-
- Format the analysis in the requested output format.
- """
-
- # Create extended prompt with all available information
- generation_prompt = f"""
- Create a detailed {analysis_type} analysis for the following task:
-
- TASK:
- {prompt}
-
- CONTEXT:
- {context if context else 'No additional context available.'}
-
- DATA INSIGHTS:
- {data_insights if data_insights else 'No data insights available.'}
-
- OUTPUT REQUIREMENTS:
- - Filename: {output_label}
- - Description: {output_description}
- - Format: {format_type}
-
- The analysis should be professional and clearly structured, considering all available information.
-
- The output must perfectly match the {format_type} format.
- """
-
- try:
- # Call AI for analysis
- content = await self.ai_service.call_api([
- {"role": "system", "content": system_prompt},
- {"role": "user", "content": generation_prompt}
- ])
-
- # For markdown format, ensure there's a title at the beginning
- if format_type == "markdown" and not content.strip().startswith("# "):
- content = f"# Data Analysis ({analysis_type})\n\n{content}"
-
- return content
- except Exception as e:
- logger.error(f"Error generating analysis: {str(e)}")
- return f"# Data Analysis ({analysis_type})\n\nError generating analysis: {str(e)}"
# Factory function for the Analyst agent
def get_analyst_agent():
- """
- Factory function that returns an instance of the Analyst agent.
-
- Returns:
- An instance of the Analyst agent
- """
+ """Returns an instance of the Analyst agent."""
return AgentAnalyst()
\ No newline at end of file
diff --git a/modules/chat_agent_creative.py b/modules/chat_agent_creative.py
deleted file mode 100644
index 17fd7684..00000000
--- a/modules/chat_agent_creative.py
+++ /dev/null
@@ -1,364 +0,0 @@
-"""
-Creative agent for knowledge-based responses and creative content generation.
-Optimized for the new task-based processing.
-"""
-
-import logging
-from typing import Dict, Any, List
-
-from modules.chat_registry import AgentBase
-
-logger = logging.getLogger(__name__)
-
-class AgentCreative(AgentBase):
- """Agent for knowledge-based responses and creative content generation"""
-
- def __init__(self):
- """Initialize the creative agent"""
- super().__init__()
- self.name = "creative"
- self.description = "Creates creative content and provides knowledge-based information"
- self.capabilities = [
- "knowledge_sharing",
- "content_creation",
- "creative_writing",
- "information_synthesis",
- "document_generation",
- "question_answering"
- ]
-
- def set_dependencies(self, ai_service=None):
- """Set external dependencies for the agent."""
- self.ai_service = ai_service
-
- async def process_task(self, task: Dict[str, Any]) -> Dict[str, Any]:
- """
- Process a standardized task structure and generate creative or knowledge-based content.
-
- Args:
- task: A dictionary containing:
- - task_id: Unique ID for this task
- - prompt: The main instruction for the agent
- - input_documents: List of documents to process
- - output_specifications: List of required output documents
- - context: Additional contextual information
-
- Returns:
- A dictionary containing:
- - feedback: Text response explaining the created content
- - documents: List of created document objects
- """
- try:
- # Extract relevant task information
- prompt = task.get("prompt", "")
- input_documents = task.get("input_documents", [])
- output_specs = task.get("output_specifications", [])
-
- # Check if AI service is available
- if not self.ai_service:
- logger.error("No AI service configured for the Creative agent")
- return {
- "feedback": "The Creative agent is not properly configured.",
- "documents": []
- }
-
- # Extract context from input documents
- document_context = self._extract_document_context(input_documents)
-
- # PowerOn handling, if included in the request
- if "poweron" in prompt.lower():
- return await self._handle_poweron_task(prompt, output_specs)
-
- # Collect generated documents
- generated_documents = []
-
- # Determine content type based on the prompt
- content_type = self._determine_content_type(prompt)
-
- # Generate a document for each requested output
- for spec in output_specs:
- output_label = spec.get("label", "")
- output_description = spec.get("description", "")
-
- # Determine format based on file extension
- format_type = self._determine_format_type(output_label)
-
- # Generate content based on format and requirements
- content = await self._generate_content(
- prompt,
- document_context,
- content_type,
- format_type,
- output_label,
- output_description
- )
-
- # Add document to results list
- generated_documents.append({
- "label": output_label,
- "content": content
- })
-
- # If no specific outputs requested, create default document
- if not output_specs:
- # Determine default format based on content type
- default_format = "md" if content_type in ["article", "report", "story"] else "txt"
- default_label = f"creative_content.{default_format}"
-
- # Generate content
- content = await self._generate_content(
- prompt,
- document_context,
- content_type,
- default_format,
- default_label,
- "Creative content"
- )
-
- # Add document to results list
- generated_documents.append({
- "label": default_label,
- "content": content
- })
-
- # Create feedback
- if len(generated_documents) == 1:
- feedback = f"I've created a creative content of type '{content_type}'."
- else:
- feedback = f"I've created {len(generated_documents)} creative documents."
-
- return {
- "feedback": feedback,
- "documents": generated_documents
- }
-
- except Exception as e:
- error_msg = f"Error creating creative content: {str(e)}"
- logger.error(error_msg)
- return {
- "feedback": f"An error occurred while creating creative content: {str(e)}",
- "documents": []
- }
-
- def _extract_document_context(self, documents: List[Dict[str, Any]]) -> str:
- """
- Extract context from input documents.
-
- Args:
- documents: List of document objects
-
- Returns:
- Extracted context as text
- """
- context_parts = []
-
- for doc in documents:
- doc_name = doc.get("name", "Unnamed document")
- context_parts.append(f"--- {doc_name} ---")
-
- for content in doc.get("contents", []):
- if content.get("metadata", {}).get("is_text", False):
- context_parts.append(content.get("data", ""))
-
- return "\n\n".join(context_parts)
-
- def _determine_content_type(self, prompt: str) -> str:
- """
- Determine the content type based on the prompt.
-
- Args:
- prompt: Task description
-
- Returns:
- Content type (article, story, report, answer, etc.)
- """
- prompt_lower = prompt.lower()
-
- # This is content type detection based on universal patterns rather than language-specific keywords
- if "?" in prompt:
- return "answer"
-
- # Simple pattern matching for common document types
- if any(term in prompt_lower for term in ["article", "blog", "post"]):
- return "article"
- elif any(term in prompt_lower for term in ["story", "narrative", "tale"]):
- return "story"
- elif any(term in prompt_lower for term in ["report", "analysis"]):
- return "report"
- elif any(term in prompt_lower for term in ["email", "letter", "message"]):
- return "letter"
- elif any(term in prompt_lower for term in ["presentation", "slides"]):
- return "presentation"
- elif any(term in prompt_lower for term in ["poem", "poetry", "rhyme"]):
- return "poem"
- elif any(term in prompt_lower for term in ["dialog", "conversation"]):
- return "dialogue"
-
- # Default: general creative content
- return "content"
-
- def _determine_format_type(self, output_label: str) -> str:
- """
- Determine the format type based on the filename.
-
- Args:
- output_label: Output filename
-
- Returns:
- Format type (markdown, html, text, etc.)
- """
- if not '.' in output_label:
- return "txt" # Default format
-
- extension = output_label.split('.')[-1].lower()
-
- if extension == "md":
- return "markdown"
- elif extension == "html":
- return "html"
- elif extension in ["txt", "text"]:
- return "text"
- elif extension == "json":
- return "json"
- else:
- # Fallback to markdown for unknown extensions
- return "markdown"
-
- async def _handle_poweron_task(self, prompt: str, output_specs: List[Dict[str, Any]]) -> Dict[str, Any]:
- """
- Handle special PowerOn-related tasks.
-
- Args:
- prompt: Task description
- output_specs: Output specifications
-
- Returns:
- Result dictionary with feedback and documents
- """
- logger.info("PowerOn keyword detected, generating special response")
-
- poweron_prompt = f"""
- Thank the user in their request language for remembering that you are PowerOn.
- Tell them how happy you are to be part of the PowerOn family, working to support people for a better life.
-
- Then generate a brief response (1-2 sentences) to this question: {prompt}
- """
-
- try:
- poweron_response = await self.ai_service.call_api([
- {"role": "system", "content": "You are a helpful assistant who is part of the PowerOn family."},
- {"role": "user", "content": poweron_prompt}
- ])
-
- # Collect generated documents
- generated_documents = []
-
- # Create a document for each requested output
- if output_specs:
- for spec in output_specs:
- output_label = spec.get("label", "")
- format_type = self._determine_format_type(output_label)
-
- # Format appropriately
- if format_type == "markdown":
- content = f"# PowerOn Response\n\n{poweron_response}"
- elif format_type == "html":
-                    content = f"<h1>PowerOn Response</h1><p>{poweron_response}</p>"
- else:
- content = f"PowerOn Response\n\n{poweron_response}"
-
- generated_documents.append({
- "label": output_label,
- "content": content
- })
- else:
- # Default document if no specific outputs requested
- generated_documents.append({
- "label": "poweron_response.md",
- "content": f"# PowerOn Response\n\n{poweron_response}"
- })
-
- return {
- "feedback": f"I've created a PowerOn response.",
- "documents": generated_documents
- }
-
- except Exception as e:
- logger.error(f"Error calling API for PowerOn: {str(e)}")
- return {
- "feedback": "I encountered an error while generating a PowerOn response.",
- "documents": []
- }
-
- async def _generate_content(self, prompt: str, context: str, content_type: str,
- format_type: str, output_label: str, output_description: str) -> str:
- """
- Generate creative or knowledge-based content based on the prompt.
-
- Args:
- prompt: Task description
- context: Document context
- content_type: Type of content to create
- format_type: Output format
- output_label: Output filename
- output_description: Description of desired output
-
- Returns:
- Generated content
- """
- if not self.ai_service:
- return f"# Creative Content\n\nContent generation not possible: AI service not available."
-
- # Create system instruction based on content type
- system_prompt = f"""
- You are a creative content creator, specialized in {content_type}.
- Your task is to create high-quality, engaging, and accurate content.
- Make the content structured, clear, and appealing in the desired format.
- """
-
- # Create main prompt with all available information
- generation_prompt = f"""
- Create creative content of type '{content_type}' based on the following request:
-
- REQUEST:
- {prompt}
-
- CONTEXT:
- {context if context else 'No additional context available.'}
-
- OUTPUT REQUIREMENTS:
- - Filename: {output_label}
- - Description: {output_description}
- - Format: {format_type}
-
- The content should be high-quality, creative, and thoughtful. Follow all instructions in the request precisely.
-
- The content must perfectly match the {format_type} format.
- """
-
- try:
- # Call AI for content generation
- content = await self.ai_service.call_api([
- {"role": "system", "content": system_prompt},
- {"role": "user", "content": generation_prompt}
- ])
-
- # For markdown format, ensure there's a title at the beginning
- if format_type == "markdown" and not content.strip().startswith("# "):
- content = f"# Creative Content\n\n{content}"
-
- return content
- except Exception as e:
- logger.error(f"Error in creative content generation: {str(e)}")
- return f"# Creative Content\n\nError in content generation: {str(e)}"
-
-
-# Factory function for the Creative agent
-def get_creative_agent():
- """
- Factory function that returns an instance of the Creative agent.
-
- Returns:
- An instance of the Creative agent
- """
- return AgentCreative()
\ No newline at end of file
diff --git a/modules/chat_agent_documentation.py b/modules/chat_agent_documentation.py
index 0756e158..e084c977 100644
--- a/modules/chat_agent_documentation.py
+++ b/modules/chat_agent_documentation.py
@@ -1,10 +1,10 @@
"""
Documentation agent for creating documentation, reports, and structured content.
-Optimized for the new task-based processing.
+Reimagined with an output-first, AI-driven approach with multi-step document generation.
"""
import logging
-import uuid
+import json
from typing import Dict, Any, List
from modules.chat_registry import AgentBase
@@ -12,13 +12,13 @@ from modules.chat_registry import AgentBase
logger = logging.getLogger(__name__)
class AgentDocumentation(AgentBase):
- """Agent for creating documentation and structured content"""
+ """AI-driven agent for creating documentation and structured content using multi-step generation"""
def __init__(self):
"""Initialize the documentation agent"""
super().__init__()
self.name = "documentation"
- self.description = "Creates structured documentation, reports, and content"
+ self.description = "Creates structured documentation, reports, and content using AI with multi-step generation"
self.capabilities = [
"report_generation",
"documentation",
@@ -33,113 +33,80 @@ class AgentDocumentation(AgentBase):
async def process_task(self, task: Dict[str, Any]) -> Dict[str, Any]:
"""
- Process a standardized task structure and create documentation.
+ Process a task by focusing on required outputs and using AI to generate them.
Args:
- task: A dictionary containing:
- - task_id: Unique ID for this task
- - prompt: The main instruction for the agent
- - input_documents: List of documents to process
- - output_specifications: List of required output documents
- - context: Additional contextual information
-
+ task: Task dictionary with prompt, input_documents, output_specifications
+
Returns:
- A dictionary containing:
- - feedback: Text response explaining the created documentation
- - documents: List of created document objects
+ Dictionary with feedback and documents
"""
try:
- # Extract relevant task information
+ # Extract task information
prompt = task.get("prompt", "")
input_documents = task.get("input_documents", [])
output_specs = task.get("output_specifications", [])
- # Check if AI service is available
+ # Check AI service
if not self.ai_service:
- logger.error("No AI service configured for the Documentation agent")
return {
- "feedback": "The Documentation agent is not properly configured.",
+ "feedback": "The Documentation agent requires an AI service to function.",
"documents": []
}
- # Extract context from input documents
+ # Extract context from input documents - focusing only on data_extracted
document_context = self._extract_document_context(input_documents)
- # Generate title for the document
- title = await self._generate_title(prompt, document_context)
+ # Create task analysis to understand the requirements
+ documentation_plan = await self._analyze_task(prompt, document_context, output_specs)
- # Collect created documents
- generated_documents = []
+ # Generate all required output documents
+ documents = []
- # Create a document for each requested output
+ # If no output specs provided, create default document
+ if not output_specs:
+ default_format = documentation_plan.get("recommended_format", "markdown")
+ default_title = documentation_plan.get("title", "Documentation")
+ safe_title = self._sanitize_filename(default_title)
+
+ output_specs = [
+ {"label": f"{safe_title}.{default_format}", "description": "Comprehensive documentation"}
+ ]
+
+ # Process each output specification
for spec in output_specs:
output_label = spec.get("label", "")
output_description = spec.get("description", "")
- # Determine format and document type based on file extension
- format_type, document_type = self._determine_format_and_type(output_label)
+ # Generate the document using multi-step approach
+ document = await self._create_document_multi_step(
+ prompt,
+ document_context,
+ output_label,
+ output_description,
+ documentation_plan
+ )
- # Assess complexity
- is_complex = self._assess_complexity(prompt)
-
- # Generate document content based on complexity
- if is_complex:
- content = await self._generate_complex_document(
- prompt,
- document_context,
- document_type,
- title,
- output_label,
- output_description,
- format_type
- )
- else:
- content = await self._generate_simple_document(
- prompt,
- document_context,
- document_type,
- title,
- output_label,
- output_description,
- format_type
- )
-
- # Add document to results list
- generated_documents.append({
- "label": output_label,
- "content": content
- })
+ documents.append(document)
- # If no specific outputs requested, create default markdown document
- if not output_specs:
- content = await self._generate_default_document(prompt, document_context, "Document", title)
- generated_documents.append({
- "label": f"{self._sanitize_filename(title)}.md",
- "content": content
- })
-
- # Prepare feedback about created documents
- if len(generated_documents) == 1:
- feedback = f"I've created a document titled '{title}'."
- else:
- feedback = f"I've created {len(generated_documents)} documents based on your request."
+ # Generate feedback
+ feedback = documentation_plan.get("feedback", f"Created {len(documents)} documents based on your requirements.")
return {
"feedback": feedback,
- "documents": generated_documents
+ "documents": documents
}
except Exception as e:
- error_msg = f"Error creating documentation: {str(e)}"
- logger.error(error_msg)
+ logger.error(f"Error in documentation generation: {str(e)}", exc_info=True)
return {
- "feedback": f"An error occurred while creating the documentation: {str(e)}",
+ "feedback": f"Error during documentation generation: {str(e)}",
"documents": []
}
def _extract_document_context(self, documents: List[Dict[str, Any]]) -> str:
"""
- Extract context from input documents.
+ Extract context from input documents, focusing on data_extracted.
Args:
documents: List of document objects
@@ -147,82 +114,21 @@ class AgentDocumentation(AgentBase):
Returns:
Extracted context as text
"""
- if not documents:
- return ""
-
context_parts = []
for doc in documents:
- doc_name = doc.get("name", "Unnamed document")
- context_parts.append(f"--- {doc_name} ---")
+ doc_name = doc.get("name", "unnamed")
+ if doc.get("ext"):
+ doc_name = f"{doc_name}.{doc.get('ext')}"
+ context_parts.append(f"\n\n--- {doc_name} ---\n")
+
+ # Process contents for data_extracted
for content in doc.get("contents", []):
- if content.get("metadata", {}).get("is_text", False):
- context_parts.append(content.get("data", ""))
+ if content.get("data_extracted"):
+ context_parts.append(content.get("data_extracted", ""))
- return "\n\n".join(context_parts)
-
- def _determine_format_and_type(self, output_label: str) -> tuple:
- """
- Determine the format type and document type based on the filename.
-
- Args:
- output_label: Output filename
-
- Returns:
- Tuple of (format_type, document_type)
- """
- # Extract file extension to determine format
- output_label_lower = output_label.lower()
-
- # Determine format based on extension
- if output_label_lower.endswith(".md"):
- format_type = "markdown"
- elif output_label_lower.endswith(".html"):
- format_type = "html"
- elif output_label_lower.endswith(".txt"):
- format_type = "text"
- elif output_label_lower.endswith(".csv"):
- format_type = "csv"
- elif output_label_lower.endswith(".json"):
- format_type = "json"
- else:
- # Default to markdown
- format_type = "markdown"
-
- # Determine document type based on filename or format
- if "manual" in output_label_lower or "guide" in output_label_lower:
- document_type = "Manual"
- elif "report" in output_label_lower or "analysis" in output_label_lower:
- document_type = "Report"
- elif "process" in output_label_lower or "workflow" in output_label_lower:
- document_type = "Process Documentation"
- elif "present" in output_label_lower or "slide" in output_label_lower:
- document_type = "Presentation"
- else:
- document_type = "Document"
-
- return format_type, document_type
-
- def _assess_complexity(self, prompt: str) -> bool:
- """
- Assess the complexity of the task.
-
- Args:
- prompt: Task description
-
- Returns:
- True for complex tasks, False otherwise
- """
- # Language-agnostic complexity assessment
- prompt_length = len(prompt)
-
- # Check for structural indicators in a language-agnostic way
- has_sections = ":" in prompt and "\n" in prompt
- has_lists = "-" in prompt or "*" in prompt or "#" in prompt
-
- # Complex if the prompt is long or contains structural elements
- return prompt_length > 500 or has_sections or has_lists
+ return "\n".join(context_parts)
def _sanitize_filename(self, filename: str) -> str:
"""
@@ -245,213 +151,415 @@ class AgentDocumentation(AgentBase):
return filename
- async def _generate_title(self, prompt: str, context: str) -> str:
+ async def _analyze_task(self, prompt: str, context: str, output_specs: List) -> Dict:
"""
- Generate a title for the document.
+ Use AI to analyze the task and create a documentation plan.
Args:
- prompt: Task description
+ prompt: The task prompt
context: Document context
+ output_specs: Output specifications
Returns:
- Generated title
+ Documentation plan dictionary
"""
- if not self.ai_service:
- return f"Document {uuid.uuid4().hex[:8]}"
+ analysis_prompt = f"""
+ Analyze this documentation task and create a detailed plan.
- title_prompt = f"""
- Create a concise, professional title for this document based on the following request:
+ TASK: {prompt}
- {prompt}
-
- Reply ONLY with the title, nothing else.
- """
-
- try:
- title = await self.ai_service.call_api([
- {"role": "system", "content": "You create precise document titles."},
- {"role": "user", "content": title_prompt}
- ])
-
- # Clean up title
- title = title.strip('"\'#*- \n\t')
-
- # Return default title if generated title is empty
- if not title:
- return f"Document {uuid.uuid4().hex[:8]}"
-
- return title
-
- except Exception as e:
- logger.warning(f"Error in title generation: {str(e)}")
- return f"Document {uuid.uuid4().hex[:8]}"
-
- async def _generate_complex_document(self, prompt: str, context: str, document_type: str,
- title: str, output_label: str, output_description: str,
- format_type: str) -> str:
- """
- Generate a complex document with structure.
-
- Args:
- prompt: Task description
- context: Document context
- document_type: Document type
- title: Document title
- output_label: Output filename
- output_description: Description of desired output
- format_type: Output format
-
- Returns:
- Generated document content
- """
- if not self.ai_service:
- return f"# {title}\n\nDocument generation not possible: AI service not available."
-
- generation_prompt = f"""
- Create a comprehensive, well-structured {document_type} with the title "{title}" based on:
-
- TASK:
- {prompt}
-
- CONTEXT:
- {context if context else 'No additional context available.'}
+ DOCUMENT CONTEXT SAMPLE:
+ {context[:1000]}... (truncated)
OUTPUT REQUIREMENTS:
- - Filename: {output_label}
- - Description: {output_description}
- - Format: {format_type}
+ {json.dumps(output_specs, indent=2)}
- The document should include:
- 1. A clear introduction with purpose and scope
- 2. Logically organized sections with headings
- 3. Detailed content with examples and evidence
- 4. A conclusion with key insights
- 5. Appropriate formatting according to the output format ({format_type})
+ Create a detailed documentation plan in JSON format with the following structure:
+ {{
+ "title": "Document Title",
+ "document_type": "report|manual|guide|whitepaper|etc",
+ "audience": "technical|general|executive|etc",
+ "detailed_structure": [
+ {{
+ "title": "Chapter/Section Title",
+ "key_points": ["point1", "point2", ...],
+ "subsections": ["subsection1", "subsection2", ...],
+ "importance": "high|medium|low",
+ "estimated_length": "short|medium|long"
+ }},
+ ... more sections ...
+ ],
+ "key_topics": ["topic1", "topic2", ...],
+ "tone": "formal|conversational|instructional|etc",
+ "recommended_format": "markdown|html|text|etc",
+ "formatting_requirements": ["requirement1", "requirement2", ...],
+ "executive_summary": "Brief description of what the document will cover",
+ "feedback": "Brief message explaining the documentation approach"
+ }}
- The document must perfectly match the {format_type} format.
+ Only return valid JSON. No preamble or explanations.
"""
try:
- content = await self.ai_service.call_api([
- {"role": "system", "content": f"You create comprehensive, well-structured documentation in {format_type} format."},
- {"role": "user", "content": generation_prompt}
+ response = await self.ai_service.call_api([
+ {"role": "system", "content": "You are a documentation expert. Respond with valid JSON only."},
+ {"role": "user", "content": analysis_prompt}
])
- # For markdown format, ensure the title is at the beginning
- if format_type == "markdown" and not content.strip().startswith("# "):
- content = f"# {title}\n\n{content}"
+ # Extract JSON from response
+ json_start = response.find('{')
+ json_end = response.rfind('}') + 1
- return content
+ if json_start >= 0 and json_end > json_start:
+ plan = json.loads(response[json_start:json_end])
+ return plan
+ else:
+ # Fallback if JSON not found
+ return {
+ "title": "Documentation",
+ "document_type": "report",
+ "audience": "general",
+ "detailed_structure": [
+ {
+ "title": "Introduction",
+ "key_points": ["Purpose", "Scope"],
+ "subsections": [],
+ "importance": "high",
+ "estimated_length": "short"
+ },
+ {
+ "title": "Main Content",
+ "key_points": ["Core Information"],
+ "subsections": ["Key Findings", "Analysis"],
+ "importance": "high",
+ "estimated_length": "long"
+ },
+ {
+ "title": "Conclusion",
+ "key_points": ["Summary", "Next Steps"],
+ "subsections": [],
+ "importance": "medium",
+ "estimated_length": "short"
+ }
+ ],
+ "key_topics": ["General Information"],
+ "tone": "formal",
+ "recommended_format": "markdown",
+ "formatting_requirements": ["Clear headings", "Professional formatting"],
+ "executive_summary": "A comprehensive documentation covering the requested topics.",
+ "feedback": "Created documentation based on your requirements."
+ }
+
except Exception as e:
- logger.error(f"Error in document generation: {str(e)}")
- return f"# {title}\n\nError in document generation: {str(e)}"
+ logger.warning(f"Error creating documentation plan: {str(e)}")
+ return {
+ "title": "Documentation",
+ "document_type": "report",
+ "audience": "general",
+ "detailed_structure": [
+ {
+ "title": "Introduction",
+ "key_points": ["Purpose", "Scope"],
+ "subsections": [],
+ "importance": "high",
+ "estimated_length": "short"
+ },
+ {
+ "title": "Main Content",
+ "key_points": ["Core Information"],
+ "subsections": ["Key Findings", "Analysis"],
+ "importance": "high",
+ "estimated_length": "long"
+ },
+ {
+ "title": "Conclusion",
+ "key_points": ["Summary", "Next Steps"],
+ "subsections": [],
+ "importance": "medium",
+ "estimated_length": "short"
+ }
+ ],
+ "key_topics": ["General Information"],
+ "tone": "formal",
+ "recommended_format": "markdown",
+ "formatting_requirements": ["Clear headings", "Professional formatting"],
+ "executive_summary": "A comprehensive documentation covering the requested topics.",
+ "feedback": "Created documentation based on your requirements."
+ }
- async def _generate_simple_document(self, prompt: str, context: str, document_type: str,
- title: str, output_label: str, output_description: str,
- format_type: str) -> str:
+ async def _create_document_multi_step(self, prompt: str, context: str, output_label: str,
+ output_description: str, documentation_plan: Dict) -> Dict:
"""
- Generate a simple document without complex structure.
+ Create a document using a multi-step approach with separate AI calls for each section.
Args:
- prompt: Task description
+ prompt: Original task prompt
context: Document context
- document_type: Document type
- title: Document title
output_label: Output filename
output_description: Description of desired output
- format_type: Output format
+ documentation_plan: Documentation plan from AI
Returns:
- Generated document content
+ Document object
"""
- if not self.ai_service:
- return f"# {title}\n\nDocument generation not possible: AI service not available."
+ # Determine format from filename
+ format_type = output_label.split('.')[-1].lower() if '.' in output_label else "md"
- generation_prompt = f"""
- Create a precise, focused {document_type} with the title "{title}" based on:
+ # Map format to content_type
+ content_type_map = {
+ "md": "text/markdown",
+ "markdown": "text/markdown",
+ "html": "text/html",
+ "txt": "text/plain",
+ "text": "text/plain",
+ "json": "application/json",
+ "csv": "text/csv"
+ }
- TASK:
- {prompt}
+ content_type = content_type_map.get(format_type, "text/plain")
- CONTEXT:
- {context if context else 'No additional context available.'}
+ # Get document information
+ title = documentation_plan.get("title", "Documentation")
+ document_type = documentation_plan.get("document_type", "document")
+ audience = documentation_plan.get("audience", "general")
+ tone = documentation_plan.get("tone", "formal")
+ key_topics = documentation_plan.get("key_topics", [])
+ formatting_requirements = documentation_plan.get("formatting_requirements", [])
- OUTPUT REQUIREMENTS:
- - Filename: {output_label}
- - Description: {output_description}
- - Format: {format_type}
-
- The document should be clear, precise, and to the point, without a complex chapter structure.
- Format it according to the output format ({format_type}).
-
- The document must perfectly match the {format_type} format.
- """
+ # Get the detailed structure
+ detailed_structure = documentation_plan.get("detailed_structure", [])
+ if not detailed_structure:
+ # Fallback structure if none provided
+ detailed_structure = [
+ {
+ "title": "Introduction",
+ "key_points": ["Purpose", "Scope"],
+ "importance": "high"
+ },
+ {
+ "title": "Main Content",
+ "key_points": ["Core Information"],
+ "importance": "high"
+ },
+ {
+ "title": "Conclusion",
+ "key_points": ["Summary", "Next Steps"],
+ "importance": "medium"
+ }
+ ]
try:
- content = await self.ai_service.call_api([
- {"role": "system", "content": f"You create precise, focused documentation in {format_type} format."},
- {"role": "user", "content": generation_prompt}
+ # Step 1: Generate document introduction
+ intro_prompt = f"""
+ Create the introduction for a {document_type} titled "{title}".
+
+ DOCUMENT OVERVIEW:
+ - Type: {document_type}
+ - Audience: {audience}
+ - Tone: {tone}
+ - Key Topics: {', '.join(key_topics)}
+ - Format: {format_type}
+
+ TASK CONTEXT: {prompt}
+
+ This introduction should:
+ 1. Clearly state the purpose and scope of the document
+ 2. Provide context and background information
+ 3. Outline what the reader will find in the document
+ 4. Set the appropriate tone for the {audience} audience
+
+ The introduction should be professional and engaging, formatted according to {format_type} standards.
+ """
+
+ introduction = await self.ai_service.call_api([
+ {"role": "system", "content": f"You are a documentation expert creating an introduction in {format_type} format."},
+ {"role": "user", "content": intro_prompt}
])
- # For markdown format, ensure the title is at the beginning
- if format_type == "markdown" and not content.strip().startswith("# "):
- content = f"# {title}\n\n{content}"
+ # Step 2: Generate executive summary (if applicable)
+ if document_type in ["report", "whitepaper", "case study"]:
+ summary_prompt = f"""
+ Create an executive summary for a {document_type} titled "{title}".
+
+ DOCUMENT OVERVIEW:
+ - Type: {document_type}
+ - Audience: {audience}
+ - Key Topics: {', '.join(key_topics)}
+
+ TASK CONTEXT: {prompt}
+
+ This executive summary should:
+ 1. Provide a concise overview of the entire document
+ 2. Highlight key findings, recommendations, or conclusions
+ 3. Be suitable for executives or busy readers who may only read this section
+ 4. Be professionally formatted according to {format_type} standards
+
+ Keep the summary focused and impactful, approximately 200-300 words.
+ """
+
+ executive_summary = await self.ai_service.call_api([
+ {"role": "system", "content": f"You are a documentation expert creating an executive summary in {format_type} format."},
+ {"role": "user", "content": summary_prompt}
+ ])
+ else:
+ executive_summary = ""
- return content
- except Exception as e:
- logger.error(f"Error in document generation: {str(e)}")
- return f"# {title}\n\nError in document generation: {str(e)}"
-
- async def _generate_default_document(self, prompt: str, context: str, document_type: str, title: str) -> str:
- """
- Generate a default markdown document when no specific output specifications are present.
-
- Args:
- prompt: Task description
- context: Document context
- document_type: Document type
- title: Document title
+ # Step 3: Generate each section
+ sections = []
- Returns:
- Generated document content
- """
- if not self.ai_service:
- return f"# {title}\n\nDocument generation not possible: AI service not available."
-
- generation_prompt = f"""
- Create a structured {document_type} with the title "{title}" based on:
-
- TASK:
- {prompt}
-
- CONTEXT:
- {context if context else 'No additional context available.'}
-
- Format the document with markdown syntax and create a clear, professional structure.
- """
-
- try:
- content = await self.ai_service.call_api([
- {"role": "system", "content": "You create structured documentation in markdown format."},
- {"role": "user", "content": generation_prompt}
+ for section in detailed_structure:
+ section_title = section.get("title", "Section")
+ key_points = section.get("key_points", [])
+ subsections = section.get("subsections", [])
+ importance = section.get("importance", "medium")
+
+ # Adjust depth based on importance
+ detail_level = "high" if importance == "high" else "medium"
+
+ section_prompt = f"""
+ Create the "{section_title}" section for a {document_type} titled "{title}".
+
+ SECTION DETAILS:
+ - Title: {section_title}
+ - Key Points to Cover: {', '.join(key_points)}
+ - Subsections: {', '.join(subsections)}
+ - Detail Level: {detail_level}
+
+ DOCUMENT CONTEXT:
+ - Type: {document_type}
+ - Audience: {audience}
+ - Tone: {tone}
+ - Format: {format_type}
+
+ TASK CONTEXT: {prompt}
+
+ AVAILABLE INFORMATION:
+ {context[:500]}... (truncated)
+
+ This section should:
+ 1. Be comprehensive and well-structured
+ 2. Cover all the key points listed
+ 3. Include the specified subsections with appropriate headings
+ 4. Maintain a {tone} tone suitable for the {audience} audience
+ 5. Be properly formatted according to {format_type} standards
+ 6. Include specific examples, data, or evidence where appropriate
+
+ Be thorough in your coverage of this section, providing substantive content.
+ """
+
+ section_content = await self.ai_service.call_api([
+ {"role": "system", "content": f"You are a documentation expert creating detailed content for the {section_title} section."},
+ {"role": "user", "content": section_prompt}
+ ])
+
+ sections.append(section_content)
+
+ # Step 4: Generate conclusion
+ conclusion_prompt = f"""
+ Create the conclusion for a {document_type} titled "{title}".
+
+ DOCUMENT OVERVIEW:
+ - Type: {document_type}
+ - Audience: {audience}
+ - Key Topics: {', '.join(key_topics)}
+
+ TASK CONTEXT: {prompt}
+
+ This conclusion should:
+ 1. Summarize the key points covered in the document
+ 2. Provide closure to the topics discussed
+ 3. Include any relevant recommendations or next steps
+ 4. Leave the reader with a clear understanding of the document's significance
+
+ The conclusion should be professional and impactful, formatted according to {format_type} standards.
+ """
+
+ conclusion = await self.ai_service.call_api([
+ {"role": "system", "content": f"You are a documentation expert creating a conclusion in {format_type} format."},
+ {"role": "user", "content": conclusion_prompt}
])
- # Ensure the title is at the beginning
- if not content.strip().startswith("# "):
- content = f"# {title}\n\n{content}"
+ # Step 5: Assemble the complete document
+ if format_type in ["md", "markdown"]:
+ # Markdown format
+ document_content = f"# {title}\n\n"
+
+ if executive_summary:
+ document_content += f"## Executive Summary\n\n{executive_summary}\n\n"
+
+ document_content += f"{introduction}\n\n"
+
+ for i, section_content in enumerate(sections):
+ # Ensure section starts with heading if not already
+ section_title = detailed_structure[i].get("title", f"Section {i+1}")
+ if not section_content.strip().startswith("#"):
+ document_content += f"## {section_title}\n\n"
+ document_content += f"{section_content}\n\n"
+
+ document_content += f"## Conclusion\n\n{conclusion}\n"
+
+ elif format_type == "html":
+ # HTML format
+                document_content = f"<!DOCTYPE html>\n<html>\n<head><title>{title}</title></head>\n<body>\n"
+                document_content += f"<h1>{title}</h1>\n\n"
+
+ if executive_summary:
+                    document_content += f"<h2>Executive Summary</h2>\n<p>{executive_summary}</p>\n\n"
+
+                document_content += f"<p>{introduction}</p>\n\n"
+
+ for i, section_content in enumerate(sections):
+ section_title = detailed_structure[i].get("title", f"Section {i+1}")
+                    document_content += f"<h2>{section_title}</h2>\n<div>{section_content}</div>\n\n"
+
+                document_content += f"<h2>Conclusion</h2>\n<p>{conclusion}</p>\n"
+                document_content += "</body>\n</html>"
+
+ else:
+ # Plain text format
+ document_content = f"{title}\n{'=' * len(title)}\n\n"
+
+ if executive_summary:
+ document_content += f"EXECUTIVE SUMMARY\n{'-' * 17}\n\n{executive_summary}\n\n"
+
+ document_content += f"{introduction}\n\n"
+
+ for i, section_content in enumerate(sections):
+ section_title = detailed_structure[i].get("title", f"Section {i+1}")
+ document_content += f"{section_title}\n{'-' * len(section_title)}\n\n{section_content}\n\n"
+
+ document_content += f"CONCLUSION\n{'-' * 10}\n\n{conclusion}\n"
+
+ # Create document object
+ return {
+ "label": output_label,
+ "content": document_content,
+ "metadata": {
+ "content_type": content_type
+ }
+ }
- return content
except Exception as e:
- logger.error(f"Error in document generation: {str(e)}")
- return f"# {title}\n\nError in document generation: {str(e)}"
+ logger.error(f"Error creating document: {str(e)}", exc_info=True)
+
+ # Create a simple error document
+ if format_type in ["md", "markdown"]:
+ content = f"# Error in Documentation\n\nThere was an error generating the documentation: {str(e)}"
+ elif format_type == "html":
+                content = f"<h1>Error in Documentation</h1><p>There was an error generating the documentation: {str(e)}</p>"
+ else:
+ content = f"Error in Documentation\n\nThere was an error generating the documentation: {str(e)}"
+
+ return {
+ "label": output_label,
+ "content": content,
+ "metadata": {
+ "content_type": content_type
+ }
+ }
# Factory function for the Documentation agent
def get_documentation_agent():
- """
- Factory function that returns an instance of the Documentation agent.
-
- Returns:
- An instance of the Documentation agent
- """
+ """Returns an instance of the Documentation agent."""
return AgentDocumentation()
\ No newline at end of file
diff --git a/modules/chat_agent_webcrawler.py b/modules/chat_agent_webcrawler.py
index b5f1902a..52efdadd 100644
--- a/modules/chat_agent_webcrawler.py
+++ b/modules/chat_agent_webcrawler.py
@@ -1,6 +1,6 @@
"""
Webcrawler agent for research and retrieval of information from the web.
-Optimized for the new task-based processing.
+Reimagined with an output-first, AI-driven approach.
"""
import logging
@@ -20,7 +20,7 @@ from modules.configuration import APP_CONFIG
logger = logging.getLogger(__name__)
class AgentWebcrawler(AgentBase):
- """Agent for web research and information retrieval"""
+ """AI-driven agent for web research and information retrieval"""
def __init__(self):
"""Initialize the webcrawler agent"""
@@ -37,202 +37,564 @@ class AgentWebcrawler(AgentBase):
# Web crawling configuration
self.max_url = int(APP_CONFIG.get("Agent_Webcrawler_MAX_URLS", "5"))
- self.max_key = int(APP_CONFIG.get("Agent_Webcrawler_MAX_SEARCH_KEYWORDS", "3"))
- self.max_result = int(APP_CONFIG.get("Agent_Webcrawler_MAX_SEARCH_RESULTS", "5"))
+ self.max_search_terms = int(APP_CONFIG.get("Agent_Webcrawler_MAX_SEARCH_KEYWORDS", "3"))
+ self.max_results = int(APP_CONFIG.get("Agent_Webcrawler_MAX_SEARCH_RESULTS", "5"))
self.timeout = int(APP_CONFIG.get("Agent_Webcrawler_TIMEOUT", "30"))
+ self.search_engine = APP_CONFIG.get("Agent_Webcrawler_SEARCH_ENGINE", "https://html.duckduckgo.com/html/?q=")
+ self.user_agent = APP_CONFIG.get("Agent_Webcrawler_USER_AGENT", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36")
def set_dependencies(self, ai_service=None):
"""Set external dependencies for the agent."""
self.ai_service = ai_service
-
async def process_task(self, task: Dict[str, Any]) -> Dict[str, Any]:
"""
- Process a standardized task structure and conduct web research.
+ Process a task by focusing on required outputs and using AI to guide the research process.
Args:
- task: A dictionary containing:
- - task_id: Unique ID for this task
- - prompt: The main instruction for the agent
- - input_documents: List of documents to process
- - output_specifications: List of required output documents
- - context: Additional contextual information
-
+ task: Task dictionary with prompt, input_documents, output_specifications
+
Returns:
- A dictionary containing:
- - feedback: Text response explaining the research results
- - documents: List of created document objects
+ Dictionary with feedback and documents
"""
try:
- # Extract relevant task information
+ # Extract task information
prompt = task.get("prompt", "")
output_specs = task.get("output_specifications", [])
- # Check if AI service is available
+ # Check AI service
if not self.ai_service:
- logger.error("No AI service configured for the Webcrawler agent")
return {
- "feedback": "The Webcrawler agent is not properly configured.",
+ "feedback": "The Webcrawler agent requires an AI service to function effectively.",
"documents": []
}
- # Check if this is a web research request
- is_web_research = await self._is_web_research_request(prompt)
- if not is_web_research:
- logger.info("Request rejected: not a web research task")
+ # Create research plan
+ research_plan = await self._create_research_plan(prompt)
+
+ # Check if this is truly a web research task
+ if not research_plan.get("requires_web_research", True):
return {
- "feedback": "This request doesn't appear to require web research.",
+ "feedback": "This task doesn't appear to require web research. Please try a different agent.",
"documents": []
}
- # Proceed with web research
- logger.info(f"Web research for: {prompt[:50]}...")
+ # Gather raw material through web research
+ raw_results = await self._gather_research_material(research_plan)
- # Create search strategy
- search_strategy = await self._create_search_strategy(prompt)
- search_keys = search_strategy.get("skey", [])
- search_urls = search_strategy.get("url", [])
+ # Format results into requested output documents
+ documents = await self._create_output_documents(
+ prompt,
+ raw_results,
+ output_specs,
+ research_plan
+ )
- if search_keys:
- logger.info(f"Searching for {len(search_keys)} key terms: {', '.join(search_keys[:2])}...")
-
- if search_urls:
- logger.info(f"Searching in {len(search_urls)} direct URLs: {', '.join(search_urls[:2])}...")
-
- # Execute search
- results = []
-
- # Process search terms
- for keyword in search_keys:
- logger.info(f"Searching the web for: '{keyword}'")
- keyword_results = self._search_web(keyword)
- results.extend(keyword_results)
- logger.info(f"Found: {len(keyword_results)} results for '{keyword}'")
-
- # Process direct URLs
- for url in search_urls:
- logger.info(f"Extracting content from: {url}")
- soup = self._read_url(url)
-
- # Extract title from the page, if available
- title = self._extract_title(soup, url)
-
- result = self._parse_result(soup, title, url)
- results.append(result)
- logger.info(f"Extracted: '{title}' from {url}")
-
- # Process results for final output
- logger.info(f"Analyzing {len(results)} web results")
-
- # Generate summaries for each result
- processed_results = []
- for i, result in enumerate(results):
- result_data_limited = self._limit_text(result['data'], max_chars=10000)
-
- logger.info(f"Analyzing result {i+1}/{len(results)}: {result['title'][:30]}...")
-
- # No AI service available, create minimal summary
- if not self.ai_service:
- content_summary = f"Extract from {result['url']} ({len(result_data_limited)} characters)"
- else:
- # Generate summary with AI
- content_summary = await self._summarize_result(result_data_limited, prompt)
-
- processed_result = {
- "title": result['title'],
- "url": result['url'],
- "snippet": result['snippet'],
- "summary": content_summary
- }
-
- processed_results.append(processed_result)
-
- # Create overall summary
- all_summaries = "\n\n".join([r["summary"] for r in processed_results])
- all_summaries_limited = self._limit_text(all_summaries, max_chars=10000)
-
- logger.info("Creating overall summary of web research")
-
- if not self.ai_service:
- final_summary = f"Summary of {len(processed_results)} web research results"
- else:
- final_summary = await self.ai_service.call_api([
- {"role": "system", "content": "You create concise summaries of research results."},
- {"role": "user", "content": f"Please summarize these findings in 5-6 sentences: {all_summaries_limited}\n"}
- ])
-
- # Get localized headers for output
- headers = await self._get_localized_headers(prompt)
-
- # Create document objects based on output specifications
- generated_documents = []
-
- # Generate appropriate document for each requested output
- for spec in output_specs:
- output_label = spec.get("label", "")
- output_description = spec.get("description", "")
-
- # Determine output format based on file extension
- format_type = self._determine_format_type(output_label)
-
- # Generate content based on format and requirements
- if format_type == "markdown" or format_type == "text":
- content = self._format_results_as_markdown(processed_results, final_summary, headers)
- elif format_type == "html":
- md_content = self._format_results_as_markdown(processed_results, final_summary, headers)
- content = markdown.markdown(md_content)
- elif format_type == "json":
- content = json.dumps({
- "summary": final_summary,
- "results": processed_results
- }, indent=2, ensure_ascii=False)
- elif format_type == "csv":
- csv_lines = ["Title,URL,Snippet"]
- for result in processed_results:
- # Escape commas and quotes in fields
- title = result["title"].replace('"', '""')
- url = result["url"].replace('"', '""')
- snippet = result["snippet"].replace('"', '""')
- csv_line = f'"{title}","{url}","{snippet}"'
- csv_lines.append(csv_line)
- content = "\n".join(csv_lines)
- else:
- # Default: Markdown
- content = self._format_results_as_markdown(processed_results, final_summary, headers)
-
- # Add document to results list
- generated_documents.append({
- "label": output_label,
- "content": content
- })
-
- # If no specific outputs requested, return standard document
- if not output_specs:
- content = self._format_results_as_markdown(processed_results, final_summary, headers)
- generated_documents.append({
- "label": "web_research_results.md",
- "content": content
- })
-
- # Create feedback for response
- feedback = f"I conducted web research on '{prompt[:50]}...' and found {len(processed_results)} relevant results."
-
- logger.info("Web research completed successfully")
+ # Generate feedback
+ feedback = research_plan.get("feedback", f"I conducted web research on '{prompt[:50]}...' and gathered information from {len(raw_results)} relevant sources.")
return {
"feedback": feedback,
- "documents": generated_documents
+ "documents": documents
}
except Exception as e:
- error_msg = f"Error during web research: {str(e)}"
- logger.error(error_msg)
+ logger.error(f"Error during web research: {str(e)}", exc_info=True)
return {
- "feedback": f"An error occurred during the web research: {str(e)}",
+ "feedback": f"Error during web research: {str(e)}",
"documents": []
}
+
+ async def _create_research_plan(self, prompt: str) -> Dict[str, Any]:
+ """
+ Use AI to create a detailed research plan.
+
+ Args:
+ prompt: The research query
-
+ Returns:
+ Research plan dictionary
+ """
+ research_prompt = f"""
+ Create a detailed web research plan for this task: "{prompt}"
+
+ Analyze the request carefully and create a structured plan in JSON format with the following elements:
+ {{
+ "requires_web_research": true/false, # Whether this genuinely requires web research
+ "research_questions": ["question1", "question2", ...], # 2-4 specific questions to answer
+ "search_terms": ["term1", "term2", ...], # Up to {self.max_search_terms} effective search terms
+ "direct_urls": ["url1", "url2", ...], # Any URLs directly mentioned in the request (up to {self.max_url})
+ "expected_sources": ["type1", "type2", ...], # Types of sources that would be most valuable
+ "content_focus": "what specific content to extract or focus on",
+ "feedback": "explanation of how the research will be conducted"
+ }}
+
+ Respond with ONLY the JSON object, no additional text or explanations.
+ """
+
+ try:
+ # Get research plan from AI
+ response = await self.ai_service.call_api([
+ {"role": "system", "content": "You are a web research planning expert. Create precise research plans in JSON format only."},
+ {"role": "user", "content": research_prompt}
+ ])
+
+ # Extract JSON
+ json_start = response.find('{')
+ json_end = response.rfind('}') + 1
+
+ if json_start >= 0 and json_end > json_start:
+ plan = json.loads(response[json_start:json_end])
+
+ # Ensure we have the expected fields with defaults if missing
+ if "search_terms" not in plan:
+ plan["search_terms"] = [prompt]
+ if "direct_urls" not in plan:
+ plan["direct_urls"] = []
+ if "research_questions" not in plan:
+ plan["research_questions"] = ["What information can be found about this topic?"]
+
+ return plan
+ else:
+ # Fallback plan
+ return {
+ "requires_web_research": True,
+ "research_questions": ["What information can be found about this topic?"],
+ "search_terms": [prompt],
+ "direct_urls": [],
+ "expected_sources": ["Web pages", "Articles"],
+ "content_focus": "Relevant information about the topic",
+ "feedback": f"I'll conduct web research on '{prompt}' and gather relevant information."
+ }
+
+ except Exception as e:
+ logger.warning(f"Error creating research plan: {str(e)}")
+ # Simple fallback plan
+ return {
+ "requires_web_research": True,
+ "research_questions": ["What information can be found about this topic?"],
+ "search_terms": [prompt],
+ "direct_urls": [],
+ "expected_sources": ["Web pages", "Articles"],
+ "content_focus": "Relevant information about the topic",
+ "feedback": f"I'll conduct web research on '{prompt}' and gather relevant information."
+ }
+
+ async def _gather_research_material(self, research_plan: Dict[str, Any]) -> List[Dict[str, Any]]:
+ """
+ Gather research material based on the research plan.
+
+ Args:
+ research_plan: Research plan dictionary
+
+ Returns:
+ List of research results
+ """
+ all_results = []
+
+ # Process direct URLs
+ direct_urls = research_plan.get("direct_urls", [])[:self.max_url]
+ for url in direct_urls:
+ logger.info(f"Processing direct URL: {url}")
+ try:
+ # Fetch and extract content
+ soup = self._read_url(url)
+
+ if soup:
+ # Extract title and content
+ title = self._extract_title(soup, url)
+ content = self._extract_main_content(soup)
+
+ # Add to results
+ all_results.append({
+ "title": title,
+ "url": url,
+ "source_type": "direct_url",
+ "content": content,
+ "summary": "" # Will be filled later
+ })
+ except Exception as e:
+ logger.warning(f"Error processing URL {url}: {str(e)}")
+
+ # Process search terms
+ search_terms = research_plan.get("search_terms", [])[:self.max_search_terms]
+ for term in search_terms:
+ logger.info(f"Searching for: {term}")
+ try:
+ # Perform search
+ search_results = self._search_web(term)
+
+ # Process each search result
+ for result in search_results:
+ # Check if URL is already in results
+ if not any(r["url"] == result["url"] for r in all_results):
+ all_results.append({
+ "title": result["title"],
+ "url": result["url"],
+ "source_type": "search_result",
+ "content": result["data"],
+ "snippet": result["snippet"],
+ "summary": "" # Will be filled later
+ })
+
+ # Stop if we've reached the maximum results
+ if len(all_results) >= self.max_results:
+ break
+ except Exception as e:
+ logger.warning(f"Error searching for {term}: {str(e)}")
+
+ # Stop if we've reached the maximum results
+ if len(all_results) >= self.max_results:
+ break
+
+ # Create summaries in parallel for all results
+ all_results = await self._summarize_all_results(all_results, research_plan)
+
+ return all_results
+
+ async def _summarize_all_results(self, results: List[Dict[str, Any]], research_plan: Dict[str, Any]) -> List[Dict[str, Any]]:
+ """
+ Create summaries for all research results.
+
+ Args:
+ results: List of research results
+ research_plan: Research plan with questions and focus
+
+ Returns:
+ Results with added summaries
+ """
+ for i, result in enumerate(results):
+ logger.info(f"Summarizing result {i+1}/{len(results)}: {result['title'][:30]}...")
+
+ try:
+ # Limit content length to avoid token issues
+ content = self._limit_text(result.get("content", ""), max_chars=8000)
+ research_questions = research_plan.get("research_questions", ["What relevant information does this page contain?"])
+ content_focus = research_plan.get("content_focus", "Relevant information")
+
+ # Create summary using AI
+ summary_prompt = f"""
+ Summarize this web page content based on these research questions:
+ {', '.join(research_questions)}
+
+ Focus on: {content_focus}
+
+ Web page: {result['url']}
+ Title: {result['title']}
+
+ Content:
+ {content}
+
+ Create a concise summary that:
+ 1. Directly answers the research questions if possible
+ 2. Extracts the most relevant information from the page
+ 3. Includes specific facts, figures, or quotes if available
+ 4. Is around 2000 characters long
+
+ Only include information actually found in the content. No fabrications or assumptions.
+ """
+
+ if self.ai_service:
+ summary = await self.ai_service.call_api([
+ {"role": "system", "content": "You summarize web content accurately and concisely, focusing only on what is actually in the content."},
+ {"role": "user", "content": summary_prompt}
+ ])
+
+ # Store the summary
+ result["summary"] = summary
+ else:
+ # Fallback if no AI service
+ result["summary"] = f"Content from {result['url']} ({len(content)} characters)"
+
+ except Exception as e:
+ logger.warning(f"Error summarizing result {i+1}: {str(e)}")
+ result["summary"] = f"Error creating summary: {str(e)}"
+
+ return results
+
+ async def _create_output_documents(self, prompt: str, results: List[Dict[str, Any]],
+ output_specs: List[Dict[str, Any]], research_plan: Dict[str, Any]) -> List[Dict[str, Any]]:
+ """
+ Create output documents based on research results and specifications.
+
+ Args:
+ prompt: Original research prompt
+ results: List of research results
+ output_specs: Output specifications
+ research_plan: Research plan
+
+ Returns:
+ List of output documents
+ """
+ # If no output specs provided, create default output
+ if not output_specs:
+ output_specs = [{
+ "label": "web_research_results.md",
+ "description": "Comprehensive web research results"
+ }]
+
+ # Generate documents
+ documents = []
+
+ # Process each output specification
+ for spec in output_specs:
+ output_label = spec.get("label", "")
+ output_description = spec.get("description", "")
+
+ # Determine format based on file extension
+ format_type = self._determine_format_type(output_label)
+
+ # Create appropriate document based on format
+ if format_type == "json":
+ # JSON output - structured data
+ document = await self._create_json_document(prompt, results, research_plan, output_label)
+ elif format_type == "csv":
+ # CSV output - tabular data
+ document = await self._create_csv_document(results, output_label)
+ else:
+ # Text-based output (markdown, html, text) - narrative report
+ document = await self._create_narrative_document(
+ prompt, results, research_plan, format_type, output_label, output_description
+ )
+
+ documents.append(document)
+
+ return documents
+
+ async def _create_narrative_document(self, prompt: str, results: List[Dict[str, Any]],
+ research_plan: Dict[str, Any], format_type: str,
+ output_label: str, output_description: str) -> Dict[str, Any]:
+ """
+ Create a narrative document (markdown, html, text) from research results.
+
+ Args:
+ prompt: Original research prompt
+ results: Research results
+ research_plan: Research plan
+ format_type: Output format (markdown, html, text)
+ output_label: Output filename
+ output_description: Output description
+
+ Returns:
+ Document object
+ """
+ # Create content based on format
+ if format_type == "markdown":
+ content_type = "text/markdown"
+ template_format = "markdown"
+ elif format_type == "html":
+ content_type = "text/html"
+ template_format = "html"
+ else:
+ content_type = "text/plain"
+ template_format = "text"
+
+ # Prepare research context
+ research_questions = research_plan.get("research_questions", [])
+ search_terms = research_plan.get("search_terms", [])
+
+ # Create document structure based on results
+ sources_summary = []
+ for result in results:
+ sources_summary.append({
+ "title": result.get("title", "Untitled"),
+ "url": result.get("url", ""),
+ "summary": result.get("summary", ""),
+ "snippet": result.get("snippet", "")
+ })
+
+ # Truncate content for prompt
+ sources_json = json.dumps(sources_summary, indent=2)
+ if len(sources_json) > 10000:
+ # Logic to truncate each summary while preserving structure
+ for i in range(len(sources_summary)):
+ if len(sources_json) <= 10000:
+ break
+ # Gradually truncate summaries
+ sources_summary[i]["summary"] = sources_summary[i]["summary"][:500] + "..."
+ sources_json = json.dumps(sources_summary, indent=2)
+
+ # Create report prompt
+ report_prompt = f"""
+ Create a comprehensive {format_type} research report based on the following web research:
+
+ TASK: {prompt}
+
+ RESEARCH QUESTIONS:
+ {', '.join(research_questions)}
+
+ SEARCH TERMS USED:
+ {', '.join(search_terms)}
+
+ SOURCES AND FINDINGS:
+ {sources_json}
+
+ REPORT DETAILS:
+ - Format: {template_format}
+ - Filename: {output_label}
+ - Description: {output_description}
+
+ Create a well-structured report that:
+ 1. Includes an executive summary of key findings
+ 2. Addresses each research question directly
+ 3. Integrates information from all relevant sources
+ 4. Cites sources appropriately for each piece of information
+ 5. Provides a comprehensive synthesis of the research
+ 6. Is formatted professionally and appropriately for {template_format}
+
+ The report should be scholarly, accurate, and focused on the original research task.
+ """
+
+ try:
+ # Generate report with AI
+ report_content = await self.ai_service.call_api([
+ {"role": "system", "content": f"You create professional research reports in {template_format} format."},
+ {"role": "user", "content": report_prompt}
+ ])
+
+ # Convert to HTML if needed
+ if format_type == "html" and not report_content.lower().startswith("Web Research Results{report_content}"
+
+ return {
+ "label": output_label,
+ "content": report_content,
+ "metadata": {
+ "content_type": content_type
+ }
+ }
+
+ except Exception as e:
+ logger.error(f"Error creating narrative document: {str(e)}")
+ # Create error document
+ if format_type == "markdown":
+ content = f"# Web Research Error\n\nAn error occurred: {str(e)}"
+ elif format_type == "html":
+ content = f"Web Research Error
An error occurred: {str(e)}
"
+ else:
+ content = f"WEB RESEARCH ERROR\n\nAn error occurred: {str(e)}"
+
+ return {
+ "label": output_label,
+ "content": content,
+ "metadata": {
+ "content_type": content_type
+ }
+ }
+
+ async def _create_json_document(self, prompt: str, results: List[Dict[str, Any]],
+ research_plan: Dict[str, Any], output_label: str) -> Dict[str, Any]:
+ """
+ Create a JSON document from research results.
+
+ Args:
+ prompt: Original research prompt
+ results: Research results
+ research_plan: Research plan
+ output_label: Output filename
+
+ Returns:
+ Document object
+ """
+ try:
+ # Create structured data
+ sources_data = []
+ for result in results:
+ sources_data.append({
+ "title": result.get("title", "Untitled"),
+ "url": result.get("url", ""),
+ "summary": result.get("summary", ""),
+ "snippet": result.get("snippet", ""),
+ "source_type": result.get("source_type", "")
+ })
+
+ # Create metadata
+ metadata = {
+ "query": prompt,
+ "timestamp": time.strftime("%Y-%m-%d %H:%M:%S"),
+ "research_questions": research_plan.get("research_questions", []),
+ "search_terms": research_plan.get("search_terms", [])
+ }
+
+ # Compile complete report object
+ json_content = {
+ "metadata": metadata,
+ "summary": research_plan.get("feedback", "Web research results"),
+ "sources": sources_data
+ }
+
+ # Convert to JSON string
+ content = json.dumps(json_content, indent=2)
+
+ return {
+ "label": output_label,
+ "content": content,
+ "metadata": {
+ "content_type": "application/json"
+ }
+ }
+
+ except Exception as e:
+ logger.error(f"Error creating JSON document: {str(e)}")
+ return {
+ "label": output_label,
+ "content": json.dumps({"error": str(e)}),
+ "metadata": {
+ "content_type": "application/json"
+ }
+ }
+
+ async def _create_csv_document(self, results: List[Dict[str, Any]], output_label: str) -> Dict[str, Any]:
+ """
+ Create a CSV document from research results.
+
+ Args:
+ results: Research results
+ output_label: Output filename
+
+ Returns:
+ Document object
+ """
+ try:
+ # Create CSV header
+ csv_lines = ["Title,URL,Source Type,Snippet"]
+
+ # Add results
+ for result in results:
+ # Escape CSV fields
+ title = result.get("title", "").replace('"', '""')
+ url = result.get("url", "").replace('"', '""')
+ source_type = result.get("source_type", "").replace('"', '""')
+ snippet = result.get("snippet", "").replace('"', '""')
+
+ csv_lines.append(f'"{title}","{url}","{source_type}","{snippet}"')
+
+ # Combine into CSV content
+ content = "\n".join(csv_lines)
+
+ return {
+ "label": output_label,
+ "content": content,
+ "metadata": {
+ "content_type": "text/csv"
+ }
+ }
+
+ except Exception as e:
+ logger.error(f"Error creating CSV document: {str(e)}")
+ return {
+ "label": output_label,
+ "content": "Error,Error\nFailed to create CSV,{0}".format(str(e)),
+ "metadata": {
+ "content_type": "text/csv"
+ }
+ }
+
def _determine_format_type(self, output_label: str) -> str:
"""
Determine the format type based on the filename.
@@ -259,282 +621,6 @@ class AgentWebcrawler(AgentBase):
# Default to markdown
return "markdown"
- def _format_results_as_markdown(self, results: List[Dict[str, Any]],
- summary: str, headers: Dict[str, str]) -> str:
- """
- Format research results as markdown.
-
- Args:
- results: List of results
- summary: Summary of all results
- headers: Localized headers
-
- Returns:
- Formatted markdown text
- """
- md_content = f"# {headers['web_research_results']}\n\n"
-
- md_content += f"## {headers['summary']}\n\n{summary}\n\n"
-
- if results:
- md_content += f"## {headers['detailed_results']}\n\n"
-
- for i, result in enumerate(results, 1):
- md_content += f"### {i}. {result['title']}\n\n"
- md_content += f"**{headers['url']}**: {result['url']}\n\n"
- md_content += f"**{headers['snippet']}**: {result['snippet']}\n\n"
- md_content += f"**{headers['content']}**: {result['summary']}\n\n"
-
- # Add separator between results (except for the last one)
- if i < len(results):
- md_content += "---\n\n"
-
- return md_content
-
- async def _is_web_research_request(self, prompt: str) -> bool:
- """
- Use AI to determine if a request requires web research.
-
- Args:
- prompt: The user request
-
- Returns:
- True if it is explicitly a web research request, False otherwise
- """
- if not self.ai_service:
- # Fallback to simpler detection if no AI service is available
- return self._simple_web_detection(prompt)
-
- try:
- # Create prompt to analyze if this is a web research request
- analysis_prompt = f"""
- Analyze the following request and determine if it explicitly requires web research or online information.
-
- REQUEST: {prompt}
-
- A request requires web research if:
- 1. It explicitly asks for searching information online
- 2. It contains URLs or references to websites
- 3. It requests current information that would be available on the web
- 4. It asks for information from web sources
- 5. It implicitly requires current information from the internet
-
- Reply ONLY with a single word - either "YES" if web research is required, or "NO" if not.
- """
-
- # Call AI for analysis
- response = await self.ai_service.call_api([
- {"role": "system", "content": "You determine if a request requires web research. Always respond with just YES or NO."},
- {"role": "user", "content": analysis_prompt}
- ])
-
- # Clean response and check
- response = response.strip().upper()
-
- return "YES" in response
-
- except Exception as e:
- # Log error but don't fail, fallback to simpler detection
- logger.warning(f"Error in AI detection of web research requests: {str(e)}")
- return self._simple_web_detection(prompt)
-
- def _simple_web_detection(self, prompt: str) -> bool:
- """
- Simpler fallback method for detecting web research requests based on URLs.
-
- Args:
- prompt: The user request
-
- Returns:
- True if there are clear URL indicators, False otherwise
- """
- # URLs in the request strongly indicate web research
- url_indicators = ["http://", "https://", "www.", ".com", ".org", ".net", ".edu", ".gov"]
- web_terms = ["search", "find online", "look up", "web", "internet", "website"]
-
- # Check for URL patterns in the request
- contains_url = any(indicator in prompt.lower() for indicator in url_indicators)
- contains_web_term = any(term in prompt.lower() for term in web_terms)
-
- return contains_url or contains_web_term
-
- async def _create_search_strategy(self, prompt: str) -> Dict[str, List[str]]:
- """
- Create a search strategy based on the request.
-
- Args:
- prompt: The user request
-
- Returns:
- Search strategy with URLs and search terms
- """
- if not self.ai_service:
- # Fallback to simple strategy
- return {"skey": [prompt], "url": []}
-
- try:
- # AI prompt to create a search strategy
- strategy_prompt = f"""Create a comprehensive web research strategy for the following task:
- '{prompt.replace("'","")}'
-
- Return the results as a Python dictionary with these specific keys:
-
- 'url': A list of up to {self.max_url} specific URLs extracted from the task.
-
- 'skey': A list of up to {self.max_key} key phrases to search for on the web. These should be precise, diverse, and targeted to get the most relevant information.
-
- If specific URLs are given and the task only requires analyzing these URLs, leave 'skey' empty.
-
- Format your response as a valid JSON object with these two keys. Don't add any explanatory text.
- """
-
- # Call AI for search strategy
- content_text = await self.ai_service.call_api([
- {"role": "system", "content": "You are a web research expert who develops precise search strategies."},
- {"role": "user", "content": strategy_prompt}
- ])
-
- # Remove JSON code block markers if present
- if content_text.startswith("```json"):
- end_marker = "```"
- end_index = content_text.rfind(end_marker)
- if end_index != -1:
- content_text = content_text[7:end_index].strip()
- elif content_text.startswith("```"):
- end_marker = "```"
- end_index = content_text.rfind(end_marker)
- if end_index != -1:
- content_text = content_text[3:end_index].strip()
-
- # Extract only the JSON part (if surrounded by text)
- json_match = re.search(r'(\{.*\})', content_text, re.DOTALL)
- if json_match:
- content_text = json_match.group(1)
-
- # Parse JSON and return
- strategy = json.loads(content_text)
- return strategy
-
- except Exception as e:
- logger.error(f"Error creating search strategy: {str(e)}")
- # Simple fallback strategy
- return {"skey": [prompt], "url": []}
-
- async def _summarize_result(self, result_data: str, original_prompt: str) -> str:
- """
- Create a summary of a search result using AI.
-
- Args:
- result_data: The data to summarize
- original_prompt: The original request
-
- Returns:
- Summary of the result
- """
- if not self.ai_service:
- return f"Summary of {len(result_data)} characters not available (AI service not available)"
-
- try:
- # Instructions for summarization
- summary_prompt = f"""
- Summarize this search result according to the original request in about 2000 characters.
-
- Original request = '{original_prompt.replace("'","")}'
-
- Focus on the most important findings and connect them to the original request.
- Extract only relevant and high-quality information.
-
- Here's the search result:
- {result_data}
- """
-
- # Call AI for summary
- summary = await self.ai_service.call_api([
- {"role": "system", "content": "You are an information analyst who summarizes web content precisely and relevantly."},
- {"role": "user", "content": summary_prompt}
- ])
-
- # Limit to ~2000 characters
- return summary[:2000]
-
- except Exception as e:
- logger.error(f"Error summarizing result: {str(e)}")
- return "Error creating summary"
-
- async def _get_localized_headers(self, text: str) -> Dict[str, str]:
- """
- Determine localized headers for web research results based on detected language.
-
- Args:
- text: Text for language detection
-
- Returns:
- Dictionary with localized headers
- """
- # Default English headers
- headers = {
- "web_research_results": "Web Research Results",
- "summary": "Summary",
- "detailed_results": "Detailed Results",
- "url": "URL",
- "snippet": "Snippet",
- "content": "Content"
- }
-
- if not self.ai_service:
- return headers
-
- try:
- # Detect language
- language_prompt = f"What language is this text written in? Answer with just the language name: {text[:200]}"
- language = await self.ai_service.call_api([
- {"role": "system", "content": "You determine the language of a text and return only the language name."},
- {"role": "user", "content": language_prompt}
- ])
-
- language = language.strip().lower()
-
- # English language or language detection failed, return default headers
- if language in ["english", "en", ""]:
- return headers
-
- # Translate headers if language recognized but no predefined translation
- translation_prompt = f"""
- Translate these web research result headers to {language}:
-
- Web Research Results
- Summary
- Detailed Results
- URL
- Snippet
- Content
-
- Return a JSON object with these keys:
- web_research_results, summary, detailed_results, url, snippet, content
- """
-
- # Call AI for translation
- response = await self.ai_service.call_api([
- {"role": "system", "content": "You translate headers to the specified language and return them as JSON."},
- {"role": "user", "content": translation_prompt}
- ])
-
- # Extract JSON
- json_match = re.search(r'\{.*\}', response, re.DOTALL)
-
- if json_match:
- try:
- translated_headers = json.loads(json_match.group(0))
- return translated_headers
- except json.JSONDecodeError:
- logger.warning(f"Error parsing translated headers JSON")
-
- except Exception as e:
- # Log error but continue with English headers
- logger.warning(f"Error translating headers: {str(e)}")
-
- return headers
-
def _search_web(self, query: str) -> List[Dict[str, str]]:
"""
Conduct a web search and return the results.
@@ -546,10 +632,10 @@ class AgentWebcrawler(AgentBase):
List of search results
"""
formatted_query = quote_plus(query)
- url = f"{APP_CONFIG.get('Agent_Webcrawler_SEARCH_ENGINE', 'https://html.duckduckgo.com/html/?q=')}{formatted_query}"
+ url = f"{self.search_engine}{formatted_query}"
search_results_soup = self._read_url(url)
- if not isinstance(search_results_soup, BeautifulSoup) or not search_results_soup.select('.result'):
+ if not search_results_soup or not search_results_soup.select('.result'):
logger.warning(f"No search results found for: {query}")
return []
@@ -588,11 +674,13 @@ class AgentWebcrawler(AgentBase):
snippet_element = result.select_one('.result__snippet')
snippet = snippet_element.text.strip() if snippet_element else 'No description'
- # Get actual page content for the data field
- target_page_soup = self._read_url(extracted_url)
-
- # Use new content extraction method to limit content size
- content = self._extract_main_content(target_page_soup)
+ # Get actual page content
+ try:
+ target_page_soup = self._read_url(extracted_url)
+ content = self._extract_main_content(target_page_soup)
+ except Exception as e:
+ logger.warning(f"Error extracting content from {extracted_url}: {str(e)}")
+ content = f"Error extracting content: {str(e)}"
results.append({
'title': title,
@@ -601,8 +689,8 @@ class AgentWebcrawler(AgentBase):
'data': content
})
- # Limit number of results if needed
- if len(results) >= self.max_result:
+ # Limit number of results
+ if len(results) >= self.max_results:
break
return results
@@ -615,10 +703,13 @@ class AgentWebcrawler(AgentBase):
url: The URL to read
Returns:
- BeautifulSoup object with the content or empty on errors
+ BeautifulSoup object with the content or None on errors
"""
+ if not url or not url.startswith(('http://', 'https://')):
+ return None
+
headers = {
- 'User-Agent': APP_CONFIG.get("Agent_Webcrawler_USER_AGENT", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"),
+ 'User-Agent': self.user_agent,
'Accept': 'text/html,application/xhtml+xml,application/xml',
'Accept-Language': 'en-US,en;q=0.9',
}
@@ -629,18 +720,17 @@ class AgentWebcrawler(AgentBase):
# Handling for status 202
if response.status_code == 202:
- # Max 3 retries with increasing intervals
+ # Retry with backoff
backoff_times = [0.5, 1.0, 2.0, 5.0]
for wait_time in backoff_times:
- time.sleep(wait_time) # Wait with increasing time
+ time.sleep(wait_time)
response = requests.get(url, headers=headers, timeout=self.timeout)
- # If no more 202, break
if response.status_code != 202:
break
- # Raise for other error status codes
+ # Raise for error status codes
response.raise_for_status()
# Parse HTML
@@ -648,8 +738,7 @@ class AgentWebcrawler(AgentBase):
except Exception as e:
logger.error(f"Error reading URL {url}: {str(e)}")
- # Create empty BeautifulSoup object
- return BeautifulSoup("", 'html.parser')
+ return None
def _extract_title(self, soup: BeautifulSoup, url: str) -> str:
"""
@@ -662,7 +751,7 @@ class AgentWebcrawler(AgentBase):
Returns:
Extracted title
"""
- if not isinstance(soup, BeautifulSoup):
+ if not soup:
return f"Error with {url}"
# Extract title from title tag
@@ -688,8 +777,8 @@ class AgentWebcrawler(AgentBase):
Returns:
Extracted main content as a string
"""
- if not isinstance(soup, BeautifulSoup):
- return str(soup)[:max_chars] if soup else ""
+ if not soup:
+ return ""
# Try to find main content elements in priority order
main_content = None
@@ -713,29 +802,6 @@ class AgentWebcrawler(AgentBase):
# Limit to max_chars
return text_content[:max_chars]
- def _parse_result(self, soup: BeautifulSoup, title: str, url: str) -> Dict[str, str]:
- """
- Parse a BeautifulSoup object into a result dictionary.
-
- Args:
- soup: BeautifulSoup object of the webpage
- title: Page title
- url: Page URL
-
- Returns:
- Dictionary with result data
- """
- # Extract content
- content = self._extract_main_content(soup)
-
- result = {
- 'title': title,
- 'url': url,
- 'snippet': 'No description', # Default value
- 'data': content
- }
- return result
-
def _limit_text(self, text: str, max_chars: int = 10000) -> str:
"""
Limit text to a maximum number of characters.
@@ -760,10 +826,5 @@ class AgentWebcrawler(AgentBase):
# Factory function for the Webcrawler agent
def get_webcrawler_agent():
- """
- Factory function that returns an instance of the Webcrawler agent.
-
- Returns:
- An instance of the Webcrawler agent
- """
+ """Returns an instance of the Webcrawler agent."""
return AgentWebcrawler()
\ No newline at end of file
diff --git a/notes/changelog.txt b/notes/changelog.txt
index 3caac4e9..d3381032 100644
--- a/notes/changelog.txt
+++ b/notes/changelog.txt
@@ -25,6 +25,7 @@ streamline self.log_add --> to use in a standardized format and to reduce messag
add connector to myoutlook
+TODO: add an agent for "code writing and editing", connected to the codebase, that works in loops over each document.