# gateway/gwserver/modules/agentservice_utils.py
# 2025-04-14 20:05:33 +02:00
#
# 759 lines
# No EOL
# 26 KiB
# Python

"""
Centralized utility functions for the Agentservice (continued).
"""
import os
import logging
import json
import uuid
from datetime import datetime
from typing import List, Dict, Any, Optional, Tuple, Union, Callable
from io import BytesIO
logger = logging.getLogger(__name__)
class WorkflowUtils:
    """
    Utility class for workflow operations.
    Centralizes common workflow-related functions.

    Workflows are handled as plain dicts. The methods below read the keys
    "id", "status", "started_at", "last_activity", "messages" and "logs".
    """

    def __init__(self, workflow_id: str = None):
        """Initialize with optional workflow ID"""
        self.workflow_id = workflow_id

    def set_workflow_id(self, workflow_id: str):
        """Set or update the workflow ID"""
        self.workflow_id = workflow_id

    def get_documents(self, workflow: Dict[str, Any]) -> List[Dict[str, Any]]:
        """
        Get all documents from a workflow across all messages.

        Documents are de-duplicated by their "id" value; the first occurrence
        wins. Documents without an "id" all share the id ``None``, so only the
        first of them is returned.

        Args:
            workflow: The workflow object

        Returns:
            List of document objects, in message order
        """
        documents = []
        seen_ids = set()
        for message in workflow.get("messages") or []:
            for doc in message.get("documents") or []:
                doc_id = doc.get("id")
                if doc_id not in seen_ids:
                    seen_ids.add(doc_id)
                    documents.append(doc)
        return documents

    def get_files(self, workflow: Dict[str, Any]) -> List[Dict[str, Any]]:
        """
        Get all file references from a workflow.

        Only documents whose ``source["type"]`` equals "file" are considered.
        Entries are de-duplicated by the source "id" (missing ids count as "").

        Args:
            workflow: The workflow object

        Returns:
            List of file metadata objects with the keys
            "id", "name", "type", "content_type" and "size"
        """
        files = []
        seen_ids = set()
        for message in workflow.get("messages") or []:
            for doc in message.get("documents") or []:
                # Tolerate an explicit ``"source": None`` on a document.
                source = doc.get("source") or {}
                if source.get("type") != "file":
                    continue
                file_id = source.get("id", "")
                if file_id in seen_ids:
                    continue
                seen_ids.add(file_id)
                files.append({
                    "id": file_id,
                    "name": source.get("name", ""),
                    # "type" intentionally mirrors "content_type".
                    "type": source.get("content_type", ""),
                    "content_type": source.get("content_type", ""),
                    "size": source.get("size", 0),
                })
        return files

    async def extract_by_prompt(self, workflow: Dict[str, Any], prompt: str, ai_service) -> Dict[str, Any]:
        """
        Extract data from workflow documents based on an AI prompt.

        This is a coroutine: use it with ``await`` or ``asyncio.run()``.
        Only the names and types of the workflow's files are put into the
        prompt; file contents are not read here.

        Args:
            workflow: The workflow object
            prompt: The extraction prompt
            ai_service: The AI service to use for extraction; its
                ``call_api`` coroutine is awaited with a single user message

        Returns:
            Dict with the keys "prompt", "extracted_content" (the raw
            ``call_api`` result) and "files_processed" (number of files listed)
        """
        files = self.get_files(workflow)
        file_descriptions = "\n".join(
            f"- {f.get('name', 'unnamed')} ({f.get('type', 'unknown')})" for f in files
        )
        extraction_prompt = (
            "\n"
            "Extract relevant information from the following files based on this request:\n"
            f"REQUEST: {prompt}\n"
            "FILES:\n"
            f"{file_descriptions}\n"
            "Focus on the most relevant content and provide a structured output.\n"
        )
        response = await ai_service.call_api([{"role": "user", "content": extraction_prompt}])
        return {
            "prompt": prompt,
            "extracted_content": response,
            "files_processed": len(files),
        }

    def merge_workflows(self, workflows: List[Dict[str, Any]]) -> Dict[str, Any]:
        """
        Merge multiple workflows into a single unified workflow.
        Useful for workflow templates or combining partial workflows.

        The first workflow is the base. Messages and logs of the remaining
        workflows are appended unless an entry with the same "id" is already
        present. The merged status becomes "failed" if any later workflow
        failed, and "last_activity" becomes the greatest value seen.

        The input workflows are not modified: the result gets its own
        "messages" and "logs" lists (the entries themselves are shared).

        Args:
            workflows: List of workflow objects to merge

        Returns:
            Merged workflow, or an empty dict if no workflows were given
        """
        if not workflows:
            return {}

        base = workflows[0]
        result = base.copy()
        # dict.copy() is shallow: without fresh lists the appends below would
        # leak into the caller's first workflow.
        result["messages"] = list(base.get("messages") or [])
        result["logs"] = list(base.get("logs") or [])

        seen_message_ids = {m.get("id") for m in result["messages"]}
        seen_log_ids = {entry.get("id") for entry in result["logs"]}

        for workflow in workflows[1:]:
            for message in workflow.get("messages") or []:
                message_id = message.get("id")
                if message_id not in seen_message_ids:
                    seen_message_ids.add(message_id)
                    result["messages"].append(message)

            for entry in workflow.get("logs") or []:
                entry_id = entry.get("id")
                if entry_id not in seen_log_ids:
                    seen_log_ids.add(entry_id)
                    result["logs"].append(entry)

            # A single failed part marks the whole merge as failed.
            if workflow.get("status") == "failed":
                result["status"] = "failed"

            candidate = workflow.get("last_activity")
            if candidate and (not result.get("last_activity")
                              or candidate > result["last_activity"]):
                result["last_activity"] = candidate

        return result

    def get_message(self, workflow: Dict[str, Any], message_id: str) -> Optional[Dict[str, Any]]:
        """
        Find a message by ID in the workflow.

        Args:
            workflow: The workflow object
            message_id: The message ID to find

        Returns:
            Message object or None if not found
        """
        for message in workflow.get("messages") or []:
            if message.get("id") == message_id:
                return message
        return None

    def to_str(self, workflow: Dict[str, Any]) -> str:
        """
        Convert workflow to a formatted string representation.

        Args:
            workflow: The workflow object

        Returns:
            Multi-line summary (id, status, timestamps, message and log counts)
        """
        lines = [
            f"Workflow: {workflow.get('id')}",
            f"Status: {workflow.get('status', 'unknown')}",
            f"Started: {workflow.get('started_at', 'unknown')}",
            f"Last Activity: {workflow.get('last_activity', 'unknown')}",
            f"Messages: {len(workflow.get('messages') or [])}",
            f"Logs: {len(workflow.get('logs') or [])}",
        ]
        # Every line, including the last one, is newline-terminated.
        return "".join(line + "\n" for line in lines)
class MessageUtils:
    """
    Utility class for message operations.
    Centralizes common message-related functions.

    Messages are handled as plain dicts in the shape produced by
    ``create_message``.
    """

    def create_message(self, workflow_id: str, role: str = "system") -> Dict[str, Any]:
        """
        Create a new message object.

        The message starts in status "pending" with a generated
        ``msg_<uuid4>`` id, the current local time as "started_at",
        zeroed "data_stats", no documents and ``None`` content.

        Args:
            workflow_id: ID of the workflow
            role: Role of the message ('system', 'user', 'assistant')

        Returns:
            New message object
        """
        return {
            "id": f"msg_{uuid.uuid4()}",
            "workflow_id": workflow_id,
            "parent_message_id": None,
            "started_at": datetime.now().isoformat(),
            "finished_at": None,
            "sequence_no": 0,
            "status": "pending",
            "role": role,
            "data_stats": {
                "processing_time": 0.0,
                "token_count": 0,
                "bytes_sent": 0,
                "bytes_received": 0,
            },
            "documents": [],
            "content": None,
            "agent_type": None,
        }

    def finalize_message(self, message: Dict[str, Any]) -> Dict[str, Any]:
        """
        Finalize a message by setting completion timestamp.

        Mutates the given dict in place: sets "finished_at" to the current
        local time and "status" to "completed".

        Args:
            message: The message object

        Returns:
            The same (updated) message object
        """
        message["finished_at"] = datetime.now().isoformat()
        message["status"] = "completed"
        return message

    def get_documents(self, message: Dict[str, Any]) -> List[Dict[str, Any]]:
        """
        Get all documents from a message.

        Args:
            message: The message object

        Returns:
            The message's "documents" list (not a copy), or an empty list
            if the key is missing
        """
        return message.get("documents", [])

    def get_files(self, message: Dict[str, Any]) -> List[Dict[str, Any]]:
        """
        Get all file references from a message.

        Only documents whose ``source["type"]`` equals "file" are included.
        No de-duplication is performed here.

        Args:
            message: The message object

        Returns:
            List of file metadata objects with the keys
            "id", "name", "type", "content_type" and "size"
        """
        files = []
        for doc in message.get("documents", []):
            source = doc.get("source", {})
            if source.get("type") != "file":
                continue
            files.append({
                "id": source.get("id", ""),
                "name": source.get("name", ""),
                # "type" intentionally mirrors "content_type".
                "type": source.get("content_type", ""),
                "content_type": source.get("content_type", ""),
                "size": source.get("size", 0),
            })
        return files

    def extract_text_content(self, message: Dict[str, Any]) -> str:
        """
        Extract text content from a message including document content.

        The message's own content comes first, followed by the "text" of every
        document content entry whose "type" is "text", each separated by a
        blank line.

        Args:
            message: The message object

        Returns:
            String with all text content from the message (never None)
        """
        # create_message() stores ``"content": None``; treat that (and a
        # missing key) as empty text instead of failing on concatenation.
        parts = [message.get("content") or ""]
        for doc in message.get("documents", []):
            for doc_content in doc.get("contents", []):
                if doc_content.get("type") == "text":
                    parts.append(doc_content.get("text") or "")
        return "\n\n".join(parts)

    def to_str(self, message: Dict[str, Any]) -> str:
        """
        Convert message to a formatted string representation.

        Args:
            message: The message object

        Returns:
            Multi-line summary: id, role, agent (only if "agent_type" is set),
            a content preview of at most 100 characters, and document count
        """
        lines = [
            f"Message: {message.get('id')}",
            f"Role: {message.get('role', 'unknown')}",
        ]

        if message.get("agent_type"):
            # Prefer the display name, fall back to the agent type.
            agent = message.get('agent_name', message.get('agent_type', 'unknown'))
            lines.append(f"Agent: {agent}")

        content = message.get("content", "")
        if content:
            preview = content[:100] + "..." if len(content) > 100 else content
            lines.append(f"Content: {preview}")

        lines.append(f"Documents: {len(message.get('documents', []))}")
        # Every line, including the last one, is newline-terminated.
        return "".join(line + "\n" for line in lines)
class FileUtils:
    """
    Utility class for file operations.
    Centralizes common file-related functions.
    """

    # Extensions that are always treated as plain text.
    _TEXT_EXTENSIONS = ('.txt', '.md', '.json', '.xml', '.html', '.htm',
                        '.css', '.js', '.py', '.csv')
    # Spreadsheet extensions; extraction requires pandas.
    _SPREADSHEET_EXTENSIONS = ('.xlsx', '.xls')
    # Image, video and audio extensions that never yield text.
    _MEDIA_EXTENSIONS = ('.jpg', '.jpeg', '.png', '.gif', '.bmp', '.webp', '.svg',
                         '.mp4', '.avi', '.mov', '.mkv', '.flv', '.wmv',
                         '.mp3', '.wav', '.ogg', '.flac', '.aac')
    # Used by get_mime_type() when the mimetypes module has no answer.
    _FALLBACK_MIME_TYPES = {
        'txt': 'text/plain',
        'md': 'text/markdown',
        'json': 'application/json',
        'csv': 'text/csv',
        'html': 'text/html',
        'htm': 'text/html',
        'pdf': 'application/pdf',
        'docx': 'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
        'xlsx': 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
        'pptx': 'application/vnd.openxmlformats-officedocument.presentationml.presentation',
        'jpg': 'image/jpeg',
        'jpeg': 'image/jpeg',
        'png': 'image/png',
        'gif': 'image/gif',
        'svg': 'image/svg+xml',
        'webp': 'image/webp',
        'mp4': 'video/mp4',
        'mp3': 'audio/mpeg',
    }

    @staticmethod
    def _module_available(module_name: str) -> bool:
        """Return True if a top-level module can be located, without importing it."""
        import importlib.util
        try:
            return importlib.util.find_spec(module_name) is not None
        except (ImportError, ValueError):
            # find_spec raises ValueError for modules with a broken __spec__.
            return False

    def is_text_extractable(self, file_name: str, content_type: str = None) -> bool:
        """
        Check if text can be extracted from a file.

        The decision is made on the file extension first (case-insensitive)
        and on the MIME type only if the extension is not conclusive.
        Spreadsheets need pandas; PDFs need PyPDF2 or PyMuPDF (``fitz``).

        Args:
            file_name: Name of the file
            content_type: MIME type (optional)

        Returns:
            True if text can be extracted, False otherwise. Unknown file
            types default to True so that an extraction attempt is made.
        """
        # Compare in lower case so "REPORT.PDF" and "photo.JPG" are recognised.
        name = file_name.lower()

        if name.endswith(self._TEXT_EXTENSIONS):
            return True

        if name.endswith(self._SPREADSHEET_EXTENSIONS):
            return self._module_available("pandas")

        if name.endswith('.pdf'):
            return self._module_available("PyPDF2") or self._module_available("fitz")

        if name.endswith(self._MEDIA_EXTENSIONS):
            return False

        # Check content type if file extension doesn't give a clear answer
        if content_type:
            if content_type.startswith(('text/', 'application/json', 'application/xml')):
                return True
            if content_type == 'application/pdf':
                return True
            if content_type.startswith(('image/', 'video/', 'audio/')):
                return False

        # Default to allowing extraction attempt
        return True

    def get_mime_type(self, file_name: str) -> str:
        """
        Get MIME type based on file name.

        Args:
            file_name: Name of the file

        Returns:
            MIME type string; 'application/octet-stream' if the type cannot
            be determined from the extension
        """
        import mimetypes

        # guess_type() initialises the mimetypes database on first use, so no
        # explicit (and repeated) mimetypes.init() call is needed.
        mime_type, _ = mimetypes.guess_type(file_name)
        if mime_type:
            return mime_type

        ext = os.path.splitext(file_name)[1].lower().lstrip('.')
        return self._FALLBACK_MIME_TYPES.get(ext, 'application/octet-stream')
class LoggingUtils:
    """
    Enhanced logging utilities for better workflow tracking.
    Provides structured and categorized logging for workflows.

    Every message goes to the standard module logger, prefixed with the
    human-readable category name. info/warning/error messages are also
    forwarded to ``log_func`` when both a log function and a workflow ID
    are set; debug messages are only written to the standard logger.
    """

    def __init__(self, workflow_id: str = None, log_func: Callable = None):
        """
        Initialize logging utilities.

        Args:
            workflow_id: ID of the workflow for context
            log_func: Function to call for adding workflow logs. It is called
                as ``log_func(workflow_id, message, level, category,
                category_name)``.
        """
        self.workflow_id = workflow_id
        self.log_func = log_func
        self.logger = logging.getLogger(__name__)
        # Define log categories (category code -> human-readable name)
        self.categories = {
            "workflow": "Workflow Management",
            "planning": "Activity Planning",
            "execution": "Activity Execution",
            "agents": "Agent Selection & Execution",
            "files": "File Processing",
            "summary": "Results Summary",
            "error": "Error Handling",
            "code": "Code Execution",
        }

    def set_workflow_id(self, workflow_id: str):
        """Update the workflow ID"""
        self.workflow_id = workflow_id

    def set_log_func(self, log_func: Callable):
        """Update the log function"""
        self.log_func = log_func

    def _emit(self, level: str, message: str, category: str,
              details: Optional[str], forward: bool = True) -> None:
        """Write one entry to the standard logger and optionally to the workflow log."""
        category_name = self.categories.get(category, category)
        log_message = f"[{category_name}] {message}"
        if details:
            # Details were previously dropped; keep them in the standard log.
            log_message = f"{log_message} | {details}"

        # ``level`` is one of "info", "warning", "error", "debug" and maps
        # directly onto the logger method of the same name.
        getattr(self.logger, level)(log_message)

        if forward and self.log_func and self.workflow_id:
            # The workflow log receives the bare message; the log function's
            # positional contract is kept exactly as before.
            self.log_func(self.workflow_id, message, level, category, category_name)

    def info(self, message: str, category: str = "workflow", details: str = None):
        """
        Log an informational message.

        Args:
            message: The log message
            category: Log category
            details: Optional detailed information, appended to the
                standard-logger entry
        """
        self._emit("info", message, category, details)

    def warning(self, message: str, category: str = "workflow", details: str = None):
        """
        Log a warning message.

        Args:
            message: The log message
            category: Log category
            details: Optional detailed information, appended to the
                standard-logger entry
        """
        self._emit("warning", message, category, details)

    def error(self, message: str, category: str = "error", details: str = None):
        """
        Log an error message.

        Args:
            message: The log message
            category: Log category
            details: Optional detailed information, appended to the
                standard-logger entry
        """
        self._emit("error", message, category, details)

    def debug(self, message: str, category: str = "workflow", details: str = None):
        """
        Log a debug message (standard logger only, never forwarded to the
        workflow log).

        Args:
            message: The log message
            category: Log category
            details: Optional detailed information, appended to the
                standard-logger entry
        """
        self._emit("debug", message, category, details, forward=False)

    def get_category_name(self, category: str) -> str:
        """
        Get human-readable category name.

        Args:
            category: Category code

        Returns:
            Human-readable category name, or the code itself if it is unknown
        """
        return self.categories.get(category, category)
def _decode_plain_text(raw: bytes) -> str:
    """Decode bytes as UTF-8, falling back to latin1 for non-UTF-8 input."""
    try:
        return raw.decode('utf-8')
    except UnicodeDecodeError:
        # latin1 maps every byte value, so this fallback cannot fail.
        return raw.decode('latin1')


def _extract_excel_text(raw: bytes, file_name: str) -> Tuple[str, bool]:
    """Render the first sheet of an Excel file as text using pandas."""
    try:
        import pandas as pd
    except ImportError:
        return f"[Excel file: {file_name} - pandas not installed]", False

    try:
        df = pd.read_excel(BytesIO(raw))
        # Column labels may be numbers or timestamps; join() needs strings.
        columns = ', '.join(str(column) for column in df.columns)
        header = (
            f"Excel file with {len(df)} rows and {len(df.columns)} columns.\n"
            f"Columns: {columns}\n\n"
        )
        return header + df.to_string(index=False), True
    except Exception as e:
        # Includes a missing Excel engine (e.g. openpyxl), which pandas
        # reports as an ImportError from read_excel().
        return f"[Error extracting Excel content: {str(e)}]", False


def _extract_pdf_text(raw: bytes, file_name: str) -> Tuple[str, bool]:
    """Extract page text from a PDF with PyPDF2, or PyMuPDF if PyPDF2 is unavailable."""
    try:
        try:
            from PyPDF2 import PdfReader
        except ImportError:
            PdfReader = None

        if PdfReader is not None:
            reader = PdfReader(BytesIO(raw))
            # Guard against pages for which no text is returned.
            return "".join((page.extract_text() or "") + "\n\n" for page in reader.pages), True

        try:
            import fitz  # PyMuPDF
        except ImportError:
            return f"[PDF: {file_name} - No PDF library installed]", False

        doc = fitz.open(stream=raw, filetype="pdf")
        try:
            return "".join(page.get_text() + "\n\n" for page in doc), True
        finally:
            # Release the document even if a page fails to render.
            doc.close()
    except Exception as e:
        return f"[Error reading PDF file {file_name}: {str(e)}]", False


def extract_text_from_file_content(file_content: bytes, file_name: str, content_type: str = None) -> Tuple[str, bool]:
    """
    Extract text from various file formats based on binary content.

    The format is chosen from the file extension (case-insensitive) and,
    for plain text, also from the MIME type:

    - text-like files (including .csv) are decoded as UTF-8 with a latin1
      fallback and returned verbatim,
    - Excel files are rendered via pandas,
    - PDF files are read with PyPDF2 or, if that is unavailable, PyMuPDF,
    - anything else is decoded as UTF-8 with undecodable bytes replaced.

    This function does not raise: failures are reported through the
    returned message and flag.

    Args:
        file_content: Binary content of the file
        file_name: Name of the file for format detection
        content_type: Optional MIME type of the file

    Returns:
        Tuple with (extracted text, is_extracted flag). When the flag is
        False the text is a bracketed placeholder describing the problem.
    """
    # Extensions are matched in lower case so "REPORT.PDF" is handled as PDF.
    name = file_name.lower()

    # Check if file is likely text-extractable
    if not is_text_extractable(name, content_type):
        return f"[File: {file_name} - Text extraction not supported]", False

    plain_text_extensions = ('.txt', '.md', '.json', '.xml', '.html', '.htm', '.css',
                             '.js', '.py', '.csv', '.log', '.ini', '.cfg', '.conf')
    # startswith() also accepts types with parameters ("...; charset=utf-8").
    textual_content_type = bool(content_type) and content_type.startswith(
        ('text/', 'application/json', 'application/xml'))

    try:
        if name.endswith(plain_text_extensions) or textual_content_type:
            return _decode_plain_text(file_content), True

        if name.endswith(('.xlsx', '.xls')):
            return _extract_excel_text(file_content, file_name)

        if name.endswith('.pdf'):
            return _extract_pdf_text(file_content, file_name)

        # Default case - try basic text extraction
        return file_content.decode('utf-8', errors='replace'), True
    except Exception as e:
        logger.error(f"Error extracting text from {file_name}: {str(e)}")
        return f"[Text extraction error: {str(e)}]", False
def is_text_extractable(file_name: str, content_type: str = None) -> bool:
    """
    Check if text can be extracted from a file.

    The decision is made on the file extension first (case-insensitive) and
    on the MIME type only if the extension is not conclusive. Spreadsheets
    need pandas; PDFs need PyPDF2 or PyMuPDF (``fitz``).

    Args:
        file_name: Name of the file
        content_type: MIME type (optional)

    Returns:
        True if text can be extracted, False otherwise. Unknown file types
        default to True so that an extraction attempt is made.
    """
    import importlib.util

    def module_available(module_name: str) -> bool:
        # Locate the module without importing it (pandas is slow to import).
        try:
            return importlib.util.find_spec(module_name) is not None
        except (ImportError, ValueError):
            # find_spec raises ValueError for modules with a broken __spec__.
            return False

    # Compare in lower case so "REPORT.PDF" and "photo.JPG" are recognised.
    name = file_name.lower()

    # Text files
    if name.endswith(('.txt', '.md', '.json', '.xml', '.html', '.htm',
                      '.css', '.js', '.py', '.csv')):
        return True

    # Excel files
    if name.endswith(('.xlsx', '.xls')):
        return module_available("pandas")

    # PDF files
    if name.endswith('.pdf'):
        return module_available("PyPDF2") or module_available("fitz")

    # Images and other non-text files
    if name.endswith(('.jpg', '.jpeg', '.png', '.gif', '.bmp', '.webp', '.svg',
                      '.mp4', '.avi', '.mov', '.mkv', '.flv', '.wmv',
                      '.mp3', '.wav', '.ogg', '.flac', '.aac')):
        return False

    # Check content type if file extension doesn't give a clear answer
    if content_type:
        if content_type.startswith(('text/', 'application/json', 'application/xml')):
            return True
        if content_type == 'application/pdf':
            return True
        if content_type.startswith(('image/', 'video/', 'audio/')):
            return False

    # Default to allowing extraction attempt
    return True