"""
Centralized utility functions for the Agentservice (continued).
"""

import os
import logging
import json
import uuid
import importlib
import mimetypes
from datetime import datetime
from typing import List, Dict, Any, Optional, Tuple, Union, Callable, Awaitable
from io import BytesIO

logger = logging.getLogger(__name__)

# Extensions whose content is decoded directly as text.
_TEXT_EXTENSIONS = (
    '.txt', '.md', '.json', '.xml', '.html', '.htm', '.css', '.js', '.py',
    '.csv', '.log', '.ini', '.cfg', '.conf',
)

# Spreadsheet extensions that are read through pandas.
_EXCEL_EXTENSIONS = ('.xlsx', '.xls')

# Image / video / audio extensions for which no text extraction is attempted.
_MEDIA_EXTENSIONS = (
    '.jpg', '.jpeg', '.png', '.gif', '.bmp', '.webp', '.svg',
    '.mp4', '.avi', '.mov', '.mkv', '.flv', '.wmv',
    '.mp3', '.wav', '.ogg', '.flac', '.aac',
)

# Content-type prefixes that are treated as plain text.
_TEXT_CONTENT_TYPE_PREFIXES = ('text/', 'application/json', 'application/xml')

# Fallback MIME types used when the ``mimetypes`` module has no answer.
_FALLBACK_MIME_TYPES = {
    'txt': 'text/plain',
    'md': 'text/markdown',
    'json': 'application/json',
    'csv': 'text/csv',
    'html': 'text/html',
    'htm': 'text/html',
    'pdf': 'application/pdf',
    'docx': 'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
    'xlsx': 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
    'pptx': 'application/vnd.openxmlformats-officedocument.presentationml.presentation',
    'jpg': 'image/jpeg',
    'jpeg': 'image/jpeg',
    'png': 'image/png',
    'gif': 'image/gif',
    'svg': 'image/svg+xml',
    'webp': 'image/webp',
    'mp4': 'video/mp4',
    'mp3': 'audio/mpeg',
}

# Category code -> human-readable name used as the log line prefix.
_DEFAULT_LOG_CATEGORIES = {
    "workflow": "Workflow Management",
    "planning": "Activity Planning",
    "execution": "Activity Execution",
    "agents": "Agent Selection & Execution",
    "files": "File Processing",
    "summary": "Results Summary",
    "error": "Error Handling",
    "code": "Code Execution",
}


def _extend_unique_by_id(target: List[Dict[str, Any]], items: List[Dict[str, Any]]) -> None:
    """Append each item to ``target`` unless an entry with an equal "id" is already there."""
    for item in items:
        item_id = item.get("id")
        if not any(existing.get("id") == item_id for existing in target):
            target.append(item)


def _file_info_from_source(source: Dict[str, Any]) -> Dict[str, Any]:
    """Build the file metadata dict exposed by the ``get_files`` methods."""
    content_type = source.get("content_type", "")
    return {
        "id": source.get("id", ""),
        "name": source.get("name", ""),
        "type": content_type,
        "content_type": content_type,
        "size": source.get("size", 0),
    }


def _module_available(module_name: str) -> bool:
    """Return True if the optional dependency ``module_name`` can be imported."""
    try:
        importlib.import_module(module_name)
    except Exception:
        # Best effort: a missing *or broken* optional dependency only means
        # "not available"; it must not crash the capability check.
        return False
    return True


def _pdf_library_available() -> bool:
    """Return True if PyPDF2 or PyMuPDF (``fitz``) can be imported."""
    return _module_available("PyPDF2") or _module_available("fitz")


class WorkflowUtils:
    """
    Utility class for workflow operations.
    Centralizes common workflow-related functions.
    """

    def __init__(self, workflow_id: str = None):
        """Initialize with optional workflow ID"""
        self.workflow_id = workflow_id

    def set_workflow_id(self, workflow_id: str):
        """Set or update the workflow ID"""
        self.workflow_id = workflow_id

    def get_documents(self, workflow: Dict[str, Any]) -> List[Dict[str, Any]]:
        """
        Get all documents from a workflow across all messages.

        Documents are de-duplicated by their "id" value; the first
        occurrence wins.

        Args:
            workflow: The workflow object

        Returns:
            List of document objects
        """
        documents: List[Dict[str, Any]] = []
        for message in workflow.get("messages") or []:
            _extend_unique_by_id(documents, message.get("documents") or [])
        return documents

    def get_files(self, workflow: Dict[str, Any]) -> List[Dict[str, Any]]:
        """
        Get all file references from a workflow.

        Only documents whose source has type "file" are included, and each
        file id is reported once.

        Args:
            workflow: The workflow object

        Returns:
            List of file metadata objects
        """
        files: List[Dict[str, Any]] = []
        for message in workflow.get("messages") or []:
            for doc in message.get("documents") or []:
                # A document may carry an explicit ``"source": None``.
                source = doc.get("source") or {}
                if source.get("type") != "file":
                    continue
                _extend_unique_by_id(files, [_file_info_from_source(source)])
        return files

    def extract_by_prompt(self, workflow: Dict[str, Any], prompt: str,
                          ai_service) -> Awaitable[Dict[str, Any]]:
        """
        Extract data from workflow documents based on an AI prompt.

        This is a regular method that returns a coroutine; the caller must
        ``await`` the result or run it with ``asyncio.run()``.

        Args:
            workflow: The workflow object
            prompt: The extraction prompt
            ai_service: The AI service to use for extraction; its
                ``call_api`` coroutine is awaited with a list of messages

        Returns:
            Coroutine resolving to a dict with the keys "prompt",
            "extracted_content" and "files_processed"
        """
        async def _extract():
            # Create extraction prompt
            files = self.get_files(workflow)
            file_descriptions = "\n".join(
                f"- {f.get('name', 'unnamed')} ({f.get('type', 'unknown')})" for f in files
            )

            extraction_prompt = f"""
            Extract relevant information from the following files based on this request:

            REQUEST: {prompt}

            FILES:
            {file_descriptions}

            Focus on the most relevant content and provide a structured output.
            """

            # Call AI
            response = await ai_service.call_api([{"role": "user", "content": extraction_prompt}])

            return {
                "prompt": prompt,
                "extracted_content": response,
                "files_processed": len(files),
            }

        # Return the coroutine
        return _extract()

    def merge_workflows(self, workflows: List[Dict[str, Any]]) -> Dict[str, Any]:
        """
        Merge multiple workflows into a single unified workflow.
        Useful for workflow templates or combining partial workflows.

        The first workflow is the base. Messages and logs of the remaining
        workflows are appended unless an entry with the same "id" is already
        present. The status becomes "failed" if any later workflow failed,
        and "last_activity" is the greatest value seen. None of the input
        workflows is modified.

        Args:
            workflows: List of workflow objects to merge

        Returns:
            Merged workflow
        """
        if not workflows:
            return {}

        # Start with the first workflow
        result = workflows[0].copy()

        # dict.copy() is shallow: give the result its own lists so appending
        # below does not leak into the caller's first workflow.
        result["messages"] = list(result.get("messages") or [])
        result["logs"] = list(result.get("logs") or [])

        # Merge additional workflows
        for workflow in workflows[1:]:
            _extend_unique_by_id(result["messages"], workflow.get("messages") or [])
            _extend_unique_by_id(result["logs"], workflow.get("logs") or [])

            # Update status if needed
            if workflow.get("status") == "failed":
                result["status"] = "failed"

            # Update last_activity if newer
            candidate = workflow.get("last_activity")
            if candidate and (not result.get("last_activity")
                              or candidate > result["last_activity"]):
                result["last_activity"] = candidate

        return result

    def get_message(self, workflow: Dict[str, Any], message_id: str) -> Optional[Dict[str, Any]]:
        """
        Find a message by ID in the workflow.

        Args:
            workflow: The workflow object
            message_id: The message ID to find

        Returns:
            Message object or None if not found
        """
        for message in workflow.get("messages") or []:
            if message.get("id") == message_id:
                return message
        return None

    def to_str(self, workflow: Dict[str, Any]) -> str:
        """
        Convert workflow to a formatted string representation.

        Args:
            workflow: The workflow object

        Returns:
            String representation of the workflow
        """
        lines = [
            f"Workflow: {workflow.get('id')}",
            f"Status: {workflow.get('status', 'unknown')}",
            f"Started: {workflow.get('started_at', 'unknown')}",
            f"Last Activity: {workflow.get('last_activity', 'unknown')}",
            f"Messages: {len(workflow.get('messages', []))}",
            f"Logs: {len(workflow.get('logs', []))}",
        ]
        # Every line, including the last one, is newline-terminated.
        return "".join(f"{line}\n" for line in lines)


class MessageUtils:
    """
    Utility class for message operations.
    Centralizes common message-related functions.
    """

    def create_message(self, workflow_id: str, role: str = "system") -> Dict[str, Any]:
        """
        Create a new message object.

        Args:
            workflow_id: ID of the workflow
            role: Role of the message ('system', 'user', 'assistant')

        Returns:
            New message object
        """
        return {
            "id": f"msg_{uuid.uuid4()}",
            "workflow_id": workflow_id,
            "parent_message_id": None,
            "started_at": datetime.now().isoformat(),
            "finished_at": None,
            "sequence_no": 0,
            "status": "pending",
            "role": role,
            "data_stats": {
                "processing_time": 0.0,
                "token_count": 0,
                "bytes_sent": 0,
                "bytes_received": 0,
            },
            "documents": [],
            "content": None,
            "agent_type": None,
        }

    def finalize_message(self, message: Dict[str, Any]) -> Dict[str, Any]:
        """
        Finalize a message by setting completion timestamp.

        The message is updated in place and also returned.

        Args:
            message: The message object

        Returns:
            Updated message object
        """
        message["finished_at"] = datetime.now().isoformat()
        message["status"] = "completed"
        return message

    def get_documents(self, message: Dict[str, Any]) -> List[Dict[str, Any]]:
        """
        Get all documents from a message.

        Args:
            message: The message object

        Returns:
            List of document objects
        """
        return message.get("documents", [])

    def get_files(self, message: Dict[str, Any]) -> List[Dict[str, Any]]:
        """
        Get all file references from a message.

        Unlike ``WorkflowUtils.get_files`` no de-duplication is applied.

        Args:
            message: The message object

        Returns:
            List of file metadata objects
        """
        files = []
        for doc in message.get("documents") or []:
            # A document may carry an explicit ``"source": None``.
            source = doc.get("source") or {}
            if source.get("type") == "file":
                files.append(_file_info_from_source(source))
        return files

    def extract_text_content(self, message: Dict[str, Any]) -> str:
        """
        Extract text content from a message including document content.

        Args:
            message: The message object

        Returns:
            String with all text content from the message; each text
            document content is appended after a blank line
        """
        # ``create_message`` initialises "content" to None, so a default in
        # ``.get()`` is not enough to guarantee a string here.
        parts = [message.get("content") or ""]

        for doc in message.get("documents") or []:
            for doc_content in doc.get("contents") or []:
                if doc_content.get("type") == "text":
                    parts.append("\n\n" + (doc_content.get("text") or ""))

        return "".join(parts)

    def to_str(self, message: Dict[str, Any]) -> str:
        """
        Convert message to a formatted string representation.

        Args:
            message: The message object

        Returns:
            String representation of the message
        """
        lines = [
            f"Message: {message.get('id')}",
            f"Role: {message.get('role', 'unknown')}",
        ]

        # Add agent info if available
        if message.get("agent_type"):
            lines.append(
                f"Agent: {message.get('agent_name', message.get('agent_type', 'unknown'))}"
            )

        # Add content summary (first 100 characters)
        content = message.get("content", "")
        if content:
            content_preview = content[:100] + "..." if len(content) > 100 else content
            lines.append(f"Content: {content_preview}")

        # Add document count
        lines.append(f"Documents: {len(message.get('documents', []))}")

        return "".join(f"{line}\n" for line in lines)


class FileUtils:
    """
    Utility class for file operations.
    Centralizes common file-related functions.
    """

    def is_text_extractable(self, file_name: str, content_type: str = None) -> bool:
        """
        Check if text can be extracted from a file.

        Delegates to the module-level ``is_text_extractable`` so both entry
        points always agree.

        Args:
            file_name: Name of the file
            content_type: MIME type (optional)

        Returns:
            True if text can be extracted, False otherwise
        """
        return is_text_extractable(file_name, content_type)

    def get_mime_type(self, file_name: str) -> str:
        """
        Get MIME type based on file name.

        Args:
            file_name: Name of the file

        Returns:
            MIME type string; 'application/octet-stream' if unknown
        """
        # guess_type() initialises the mimetypes database lazily on first
        # use, so no explicit (and repeated) mimetypes.init() is needed.
        mime_type, _ = mimetypes.guess_type(file_name)
        if mime_type:
            return mime_type

        # Fall back to the built-in mapping for common extensions
        ext = os.path.splitext(file_name)[1].lower().lstrip('.')
        return _FALLBACK_MIME_TYPES.get(ext, 'application/octet-stream')


class LoggingUtils:
    """
    Enhanced logging utilities for better workflow tracking.
    Provides structured and categorized logging for workflows.
    """

    def __init__(self, workflow_id: str = None, log_func: Callable = None):
        """
        Initialize logging utilities.

        Args:
            workflow_id: ID of the workflow for context
            log_func: Function to call for adding workflow logs; it is
                called as ``log_func(workflow_id, message, level, category,
                category_name)``
        """
        self.workflow_id = workflow_id
        self.log_func = log_func
        self.logger = logging.getLogger(__name__)

        # Define log categories (per-instance copy so it can be customised)
        self.categories = dict(_DEFAULT_LOG_CATEGORIES)

    def set_workflow_id(self, workflow_id: str):
        """Update the workflow ID"""
        self.workflow_id = workflow_id

    def set_log_func(self, log_func: Callable):
        """Update the log function"""
        self.log_func = log_func

    def _emit(self, level: str, message: str, category: str,
              details: str = None, forward: bool = True) -> None:
        """Write one entry to the standard logger and, optionally, to the workflow log."""
        category_name = self.categories.get(category, category)
        log_message = f"[{category_name}] {message}"
        if details:
            # Details only go to the standard logger; the workflow log
            # function is called with its established five arguments.
            log_message = f"{log_message} - {details}"

        # Log to standard logger
        getattr(self.logger, level)(log_message)

        # Log to workflow if function available
        if forward and self.log_func and self.workflow_id:
            self.log_func(self.workflow_id, message, level, category, category_name)

    def info(self, message: str, category: str = "workflow", details: str = None):
        """
        Log an informational message.

        Args:
            message: The log message
            category: Log category
            details: Optional detailed information
        """
        self._emit("info", message, category, details)

    def warning(self, message: str, category: str = "workflow", details: str = None):
        """
        Log a warning message.

        Args:
            message: The log message
            category: Log category
            details: Optional detailed information
        """
        self._emit("warning", message, category, details)

    def error(self, message: str, category: str = "error", details: str = None):
        """
        Log an error message.

        Args:
            message: The log message
            category: Log category
            details: Optional detailed information
        """
        self._emit("error", message, category, details)

    def debug(self, message: str, category: str = "workflow", details: str = None):
        """
        Log a debug message.

        Debug messages go to the standard logger only; they are not
        forwarded to the workflow log function.

        Args:
            message: The log message
            category: Log category
            details: Optional detailed information
        """
        self._emit("debug", message, category, details, forward=False)

    def get_category_name(self, category: str) -> str:
        """
        Get human-readable category name.

        Args:
            category: Category code

        Returns:
            Human-readable category name; the code itself if unknown
        """
        return self.categories.get(category, category)


def _decode_text(file_content: bytes) -> str:
    """Decode bytes as UTF-8, falling back to Latin-1 (which accepts every byte)."""
    try:
        return file_content.decode('utf-8')
    except UnicodeDecodeError:
        return file_content.decode('latin1')


def _extract_excel_text(file_content: bytes, file_name: str) -> Tuple[str, bool]:
    """Render the spreadsheet read by pandas as a text summary plus table."""
    try:
        import pandas as pd
    except ImportError:
        return f"[Excel file: {file_name} - pandas not installed]", False

    try:
        df = pd.read_excel(BytesIO(file_content))
        # Column labels are not necessarily strings (e.g. numeric headers).
        column_names = ', '.join(str(column) for column in df.columns)
        result = f"Excel file with {len(df)} rows and {len(df.columns)} columns.\n"
        result += f"Columns: {column_names}\n\n"
        result += df.to_string(index=False)
        return result, True
    except Exception as e:
        # Includes a missing Excel engine, which pandas reports as ImportError.
        return f"[Error extracting Excel content: {str(e)}]", False


def _extract_pdf_text(file_content: bytes, file_name: str) -> Tuple[str, bool]:
    """Extract page text with PyPDF2, or with PyMuPDF if PyPDF2 is unavailable."""
    try:
        try:
            from PyPDF2 import PdfReader
        except ImportError:
            PdfReader = None

        if PdfReader is not None:
            reader = PdfReader(BytesIO(file_content))
            # extract_text() may yield None for pages without a text layer.
            text = "".join(f"{page.extract_text() or ''}\n\n" for page in reader.pages)
            return text, True

        try:
            import fitz  # PyMuPDF
        except ImportError:
            return f"[PDF: {file_name} - No PDF library installed]", False

        doc = fitz.open(stream=file_content, filetype="pdf")
        try:
            text = "".join(f"{page.get_text()}\n\n" for page in doc)
        finally:
            doc.close()
        return text, True
    except Exception as e:
        return f"[Error reading PDF file {file_name}: {str(e)}]", False


def extract_text_from_file_content(file_content: bytes, file_name: str,
                                   content_type: str = None) -> Tuple[str, bool]:
    """
    Extract text from various file formats based on binary content.

    The format is chosen from the file extension (case-insensitive) and,
    where the extension is inconclusive, from the content type. Plain-text
    formats (including CSV) are decoded as-is, Excel files are rendered via
    pandas, PDFs via PyPDF2 or PyMuPDF, and anything else is decoded as
    UTF-8 with replacement characters.

    Args:
        file_content: Binary content of the file
        file_name: Name of the file for format detection
        content_type: Optional MIME type of the file

    Returns:
        Tuple with (extracted text, is_extracted flag). On failure the text
        is a bracketed explanation and the flag is False; no exception is
        raised.
    """
    # Check if file is likely text-extractable
    if not is_text_extractable(file_name, content_type):
        return f"[File: {file_name} - Text extraction not supported]", False

    name = (file_name or "").lower()
    mime = (content_type or "").lower()

    try:
        # Simple text files
        if name.endswith(_TEXT_EXTENSIONS) or mime.startswith(_TEXT_CONTENT_TYPE_PREFIXES):
            return _decode_text(file_content), True

        # Excel files
        if name.endswith(_EXCEL_EXTENSIONS):
            return _extract_excel_text(file_content, file_name)

        # PDF files (by extension or by declared content type)
        if name.endswith('.pdf') or mime == 'application/pdf':
            return _extract_pdf_text(file_content, file_name)

        # Default case - try basic text extraction
        return file_content.decode('utf-8', errors='replace'), True
    except Exception as e:
        logger.error("Error extracting text from %s: %s", file_name, e)
        return f"[Text extraction error: {str(e)}]", False


def is_text_extractable(file_name: str, content_type: str = None) -> bool:
    """
    Check if text can be extracted from a file.

    The file extension is checked first (case-insensitive); the content
    type is only consulted when the extension gives no clear answer.

    Args:
        file_name: Name of the file
        content_type: MIME type (optional)

    Returns:
        True if text can be extracted, False otherwise. Unknown formats
        default to True so that an extraction attempt is made.
    """
    name = (file_name or "").lower()

    # Text files
    if name.endswith(_TEXT_EXTENSIONS):
        return True

    # Excel files need pandas
    if name.endswith(_EXCEL_EXTENSIONS):
        return _module_available("pandas")

    # PDF files need PyPDF2 or PyMuPDF
    if name.endswith('.pdf'):
        return _pdf_library_available()

    # Images and other non-text files
    if name.endswith(_MEDIA_EXTENSIONS):
        return False

    # Check content type if file extension doesn't give a clear answer
    if content_type:
        mime = content_type.lower()
        if mime.startswith(_TEXT_CONTENT_TYPE_PREFIXES):
            return True
        if mime == 'application/pdf':
            return _pdf_library_available()
        if mime.startswith(('image/', 'video/', 'audio/')):
            return False

    # Default to allowing extraction attempt
    return True