# gateway/gwserver/modules/agentservice_utils.py

"""
Centralized utility functions for the Agentservice (continued).
"""

import os
import logging
import json
import uuid
from datetime import datetime
from typing import List, Dict, Any, Optional, Tuple, Union, Callable
from io import BytesIO

# Module-level logger named after this module; extract_text_from_file_content
# uses it to report unexpected extraction errors.
logger = logging.getLogger(__name__)

class WorkflowUtils:
    """
    Utility class for workflow operations.
    Centralizes common workflow-related functions.

    Workflows, messages and documents are handled as plain dictionaries;
    every method receives the workflow it operates on as an argument.
    """

    def __init__(self, workflow_id: Optional[str] = None):
        """Initialize with optional workflow ID"""
        self.workflow_id = workflow_id

    def set_workflow_id(self, workflow_id: str):
        """Set or update the workflow ID"""
        self.workflow_id = workflow_id

    def get_documents(self, workflow: Dict[str, Any]) -> List[Dict[str, Any]]:
        """
        Get all documents from a workflow across all messages.

        Documents are de-duplicated by their "id" value; the first
        occurrence wins and message order is preserved.

        Args:
            workflow: The workflow object

        Returns:
            List of document objects
        """
        documents = []
        # Track ids already collected so de-duplication is a set lookup
        # instead of a rescan of the result list (ids are assumed hashable).
        seen_ids = set()

        for message in workflow.get("messages") or []:
            for doc in message.get("documents") or []:
                doc_id = doc.get("id")
                if doc_id in seen_ids:
                    continue
                seen_ids.add(doc_id)
                documents.append(doc)

        return documents

    def get_files(self, workflow: Dict[str, Any]) -> List[Dict[str, Any]]:
        """
        Get all file references from a workflow.

        Only documents whose "source" has type "file" are included. Files
        are de-duplicated by source id; the first occurrence wins.

        Args:
            workflow: The workflow object

        Returns:
            List of file metadata objects with the keys "id", "name",
            "type", "content_type" and "size"
        """
        files = []
        seen_ids = set()

        for message in workflow.get("messages") or []:
            for doc in message.get("documents") or []:
                # A document may carry no source, or an explicit None
                source = doc.get("source") or {}

                # Only include file documents
                if source.get("type") != "file":
                    continue

                file_id = source.get("id", "")
                if file_id in seen_ids:
                    continue
                seen_ids.add(file_id)

                files.append({
                    "id": file_id,
                    "name": source.get("name", ""),
                    # "type" mirrors "content_type"
                    "type": source.get("content_type", ""),
                    "content_type": source.get("content_type", ""),
                    "size": source.get("size", 0)
                })

        return files

    async def extract_by_prompt(self, workflow: Dict[str, Any], prompt: str, ai_service) -> Dict[str, Any]:
        """
        Extract data from workflow documents based on an AI prompt.

        This is a coroutine function: calling it returns an awaitable, so
        use it with ``await`` or ``asyncio.run()``. Only the file names and
        content types from the workflow are placed in the prompt.

        Args:
            workflow: The workflow object
            prompt: The extraction prompt
            ai_service: The AI service to use for extraction; its
                ``call_api`` is awaited with a single user message

        Returns:
            Dict with the original "prompt", the AI response under
            "extracted_content" and the number of "files_processed"
        """
        files = self.get_files(workflow)
        file_descriptions = "\n".join(
            f"- {f.get('name', 'unnamed')} ({f.get('type', 'unknown')})" for f in files
        )

        # NOTE: the indentation inside this literal is part of the prompt
        # text sent to the AI service and is kept unchanged on purpose.
        extraction_prompt = f"""
            Extract relevant information from the following files based on this request:

            REQUEST: {prompt}

            FILES:
            {file_descriptions}

            Focus on the most relevant content and provide a structured output.
            """

        # Call AI
        response = await ai_service.call_api([{"role": "user", "content": extraction_prompt}])

        return {
            "prompt": prompt,
            "extracted_content": response,
            "files_processed": len(files)
        }

    def merge_workflows(self, workflows: List[Dict[str, Any]]) -> Dict[str, Any]:
        """
        Merge multiple workflows into a single unified workflow.
        Useful for workflow templates or combining partial workflows.

        The first workflow provides the base fields. Messages and logs of
        the following workflows are appended unless an entry with the same
        "id" is already present. The status becomes "failed" if any later
        workflow failed, and "last_activity" is set to the greatest value
        seen. The input workflows are not modified.

        Args:
            workflows: List of workflow objects to merge

        Returns:
            Merged workflow, or an empty dict if no workflows were given
        """
        if not workflows:
            return {}

        # Start with the first workflow. dict.copy() is shallow, so the
        # lists are copied explicitly; otherwise appending below would
        # also change the caller's first workflow.
        result = workflows[0].copy()
        result["messages"] = list(result.get("messages") or [])
        result["logs"] = list(result.get("logs") or [])

        message_ids = {m.get("id") for m in result["messages"]}
        log_ids = {entry.get("id") for entry in result["logs"]}

        # Merge additional workflows
        for workflow in workflows[1:]:
            for message in workflow.get("messages") or []:
                if message.get("id") not in message_ids:
                    message_ids.add(message.get("id"))
                    result["messages"].append(message)

            for log in workflow.get("logs") or []:
                if log.get("id") not in log_ids:
                    log_ids.add(log.get("id"))
                    result["logs"].append(log)

            # A single failed workflow marks the merged result as failed
            if workflow.get("status") == "failed":
                result["status"] = "failed"

            # Update last_activity if newer
            candidate = workflow.get("last_activity")
            if candidate and (not result.get("last_activity")
                              or candidate > result["last_activity"]):
                result["last_activity"] = candidate

        return result

    def get_message(self, workflow: Dict[str, Any], message_id: str) -> Optional[Dict[str, Any]]:
        """
        Find a message by ID in the workflow.

        Args:
            workflow: The workflow object
            message_id: The message ID to find

        Returns:
            Message object or None if not found
        """
        for message in workflow.get("messages") or []:
            if message.get("id") == message_id:
                return message
        return None

    def to_str(self, workflow: Dict[str, Any]) -> str:
        """
        Convert workflow to a formatted string representation.

        Args:
            workflow: The workflow object

        Returns:
            Multi-line summary with id, status, timestamps and the number
            of messages and logs
        """
        lines = [
            f"Workflow: {workflow.get('id')}",
            f"Status: {workflow.get('status', 'unknown')}",
            f"Started: {workflow.get('started_at', 'unknown')}",
            f"Last Activity: {workflow.get('last_activity', 'unknown')}",
            f"Messages: {len(workflow.get('messages') or [])}",
            f"Logs: {len(workflow.get('logs') or [])}",
        ]
        # Every line, including the last one, ends with a newline
        return "\n".join(lines) + "\n"


class MessageUtils:
    """
    Utility class for message operations.
    Centralizes common message-related functions.

    Messages are plain dictionaries in the shape produced by
    create_message().
    """

    def create_message(self, workflow_id: str, role: str = "system") -> Dict[str, Any]:
        """
        Create a new message object.

        The message gets a fresh "msg_<uuid4>" id, status "pending", the
        current local time as "started_at", and empty documents/content.

        Args:
            workflow_id: ID of the workflow
            role: Role of the message ('system', 'user', 'assistant')

        Returns:
            New message object
        """
        return {
            "id": f"msg_{uuid.uuid4()}",
            "workflow_id": workflow_id,
            "parent_message_id": None,
            "started_at": datetime.now().isoformat(),
            "finished_at": None,
            "sequence_no": 0,

            "status": "pending",
            "role": role,

            "data_stats": {
                "processing_time": 0.0,
                "token_count": 0,
                "bytes_sent": 0,
                "bytes_received": 0
            },

            "documents": [],
            "content": None,
            "agent_type": None
        }

    def finalize_message(self, message: Dict[str, Any]) -> Dict[str, Any]:
        """
        Finalize a message by setting completion timestamp.

        The message is updated in place: "finished_at" is set to the
        current local time and "status" to "completed".

        Args:
            message: The message object

        Returns:
            The same message object, updated
        """
        message["finished_at"] = datetime.now().isoformat()
        message["status"] = "completed"
        return message

    def get_documents(self, message: Dict[str, Any]) -> List[Dict[str, Any]]:
        """
        Get all documents from a message.

        Args:
            message: The message object

        Returns:
            List of document objects (empty list if the key is missing)
        """
        return message.get("documents", [])

    def get_files(self, message: Dict[str, Any]) -> List[Dict[str, Any]]:
        """
        Get all file references from a message.

        Only documents whose "source" has type "file" are included; no
        de-duplication is performed.

        Args:
            message: The message object

        Returns:
            List of file metadata objects with the keys "id", "name",
            "type", "content_type" and "size"
        """
        files = []

        for doc in message.get("documents") or []:
            # A document may carry no source, or an explicit None
            source = doc.get("source") or {}

            # Only include file documents
            if source.get("type") != "file":
                continue

            files.append({
                "id": source.get("id", ""),
                "name": source.get("name", ""),
                # "type" mirrors "content_type"
                "type": source.get("content_type", ""),
                "content_type": source.get("content_type", ""),
                "size": source.get("size", 0)
            })

        return files

    def extract_text_content(self, message: Dict[str, Any]) -> str:
        """
        Extract text content from a message including document content.

        The message's own content comes first, followed by every document
        content entry of type "text", each separated by a blank line.

        Args:
            message: The message object

        Returns:
            String with all text content from the message
        """
        # create_message() stores content as None, so a plain
        # .get("content", "") would hand back None here and break the
        # concatenation below; normalise missing/None to "".
        parts = [message.get("content") or ""]

        for doc in message.get("documents") or []:
            for doc_content in doc.get("contents") or []:
                if doc_content.get("type") == "text":
                    parts.append(doc_content.get("text") or "")

        return "\n\n".join(parts)

    def to_str(self, message: Dict[str, Any]) -> str:
        """
        Convert message to a formatted string representation.

        Args:
            message: The message object

        Returns:
            Multi-line summary with id, role, optional agent, a content
            preview of at most 100 characters and the document count
        """
        lines = [
            f"Message: {message.get('id')}",
            f"Role: {message.get('role', 'unknown')}",
        ]

        # Add agent info if available; prefer the agent name over its type
        if message.get("agent_type"):
            lines.append(f"Agent: {message.get('agent_name', message.get('agent_type', 'unknown'))}")

        # Add content summary, truncated with an ellipsis when long
        content = message.get("content") or ""
        if content:
            preview = content[:100] + "..." if len(content) > 100 else content
            lines.append(f"Content: {preview}")

        lines.append(f"Documents: {len(message.get('documents') or [])}")

        # Every line, including the last one, ends with a newline
        return "\n".join(lines) + "\n"


class FileUtils:
    """
    Utility class for file operations.
    Centralizes common file-related functions.
    """

    # File-name suffixes (lower case) that are treated as plain text
    _TEXT_EXTENSIONS = ('.txt', '.md', '.json', '.xml', '.html', '.htm', '.css', '.js', '.py', '.csv')

    # Spreadsheet suffixes; extractable only when pandas can be imported
    _EXCEL_EXTENSIONS = ('.xlsx', '.xls')

    # Image, video and audio suffixes from which no text is extracted
    _NON_TEXT_EXTENSIONS = ('.jpg', '.jpeg', '.png', '.gif', '.bmp', '.webp', '.svg',
                            '.mp4', '.avi', '.mov', '.mkv', '.flv', '.wmv',
                            '.mp3', '.wav', '.ogg', '.flac', '.aac')

    # Fallback extension -> MIME type table, consulted only when the
    # standard mimetypes module does not know the extension
    _FALLBACK_MIME_TYPES = {
        'txt': 'text/plain',
        'md': 'text/markdown',
        'json': 'application/json',
        'csv': 'text/csv',
        'html': 'text/html',
        'htm': 'text/html',
        'pdf': 'application/pdf',
        'docx': 'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
        'xlsx': 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
        'pptx': 'application/vnd.openxmlformats-officedocument.presentationml.presentation',
        'jpg': 'image/jpeg',
        'jpeg': 'image/jpeg',
        'png': 'image/png',
        'gif': 'image/gif',
        'svg': 'image/svg+xml',
        'webp': 'image/webp',
        'mp4': 'video/mp4',
        'mp3': 'audio/mpeg'
    }

    @staticmethod
    def _can_import(*module_names: str) -> bool:
        """Return True if at least one of the named modules can be imported."""
        import importlib

        for module_name in module_names:
            try:
                importlib.import_module(module_name)
            except Exception:
                # This is only a capability probe: a missing or broken
                # optional dependency means "not available", not a crash.
                continue
            return True
        return False

    def is_text_extractable(self, file_name: str, content_type: str = None) -> bool:
        """
        Check if text can be extracted from a file.

        The decision is taken from the file extension first (compared
        case-insensitively) and from the MIME type only when the extension
        is not recognised. Unknown files default to True so that an
        extraction attempt is still made.

        Args:
            file_name: Name of the file
            content_type: MIME type (optional)

        Returns:
            True if text can be extracted, False otherwise
        """
        # Compare suffixes case-insensitively so "PHOTO.JPG" and "photo.jpg"
        # are classified the same way
        name = file_name.lower()

        # Text files
        if name.endswith(self._TEXT_EXTENSIONS):
            return True

        # Excel files need pandas
        if name.endswith(self._EXCEL_EXTENSIONS):
            return self._can_import('pandas')

        # PDF files need PyPDF2 or PyMuPDF (imported as "fitz")
        if name.endswith('.pdf'):
            return self._can_import('PyPDF2', 'fitz')

        # Images and other non-text files
        if name.endswith(self._NON_TEXT_EXTENSIONS):
            return False

        # Check content type if file extension doesn't give a clear answer
        if content_type:
            if content_type.startswith(('text/', 'application/json', 'application/xml')):
                return True
            if content_type == 'application/pdf':
                return True
            if content_type.startswith(('image/', 'video/', 'audio/')):
                return False

        # Default to allowing extraction attempt
        return True

    def get_mime_type(self, file_name: str) -> str:
        """
        Get MIME type based on file name.

        The standard mimetypes table is consulted first; a small built-in
        table covers common extensions it may not know.

        Args:
            file_name: Name of the file

        Returns:
            MIME type string; 'application/octet-stream' if unknown
        """
        import mimetypes

        # guess_type() initialises the module on first use. mimetypes.init()
        # is deliberately NOT called here: with no arguments it rebuilds the
        # global table on every call, which re-reads the system files and
        # drops any types registered through mimetypes.add_type().
        mime_type, _ = mimetypes.guess_type(file_name)
        if mime_type:
            return mime_type

        ext = os.path.splitext(file_name)[1].lower().lstrip('.')
        return self._FALLBACK_MIME_TYPES.get(ext, 'application/octet-stream')


class LoggingUtils:
    """
    Enhanced logging utilities for better workflow tracking.

    Each message is written to the standard logger with a "[Category]"
    prefix. Info, warning and error messages are additionally handed to an
    optional workflow log callback when both a callback and a workflow ID
    are set; debug messages only go to the standard logger.
    """

    def __init__(self, workflow_id: str = None, log_func: Callable = None):
        """
        Initialize logging utilities.

        Args:
            workflow_id: ID of the workflow for context
            log_func: Callback invoked as
                log_func(workflow_id, message, level, category, category_name)
        """
        self.workflow_id = workflow_id
        self.log_func = log_func
        self.logger = logging.getLogger(__name__)

        # Category code -> human-readable label
        self.categories = dict(
            workflow="Workflow Management",
            planning="Activity Planning",
            execution="Activity Execution",
            agents="Agent Selection & Execution",
            files="File Processing",
            summary="Results Summary",
            error="Error Handling",
            code="Code Execution",
        )

    def set_workflow_id(self, workflow_id: str):
        """Replace the workflow ID used for workflow log entries"""
        self.workflow_id = workflow_id

    def set_log_func(self, log_func: Callable):
        """Replace the workflow log callback"""
        self.log_func = log_func

    def _labelled(self, category: str, message: str) -> Tuple[str, str]:
        """Return the category label and the message prefixed with it."""
        label = self.categories.get(category, category)
        return label, f"[{label}] {message}"

    def _forward(self, message: str, level: str, category: str, label: str) -> None:
        """Pass the entry to the workflow log callback, if one is usable."""
        if not (self.log_func and self.workflow_id):
            return
        self.log_func(self.workflow_id, message, level, category, label)

    def info(self, message: str, category: str = "workflow", details: str = None):
        """
        Log an informational message.

        Args:
            message: The log message
            category: Log category
            details: Accepted for API compatibility; not used here
        """
        label, line = self._labelled(category, message)
        self.logger.info(line)
        self._forward(message, "info", category, label)

    def warning(self, message: str, category: str = "workflow", details: str = None):
        """
        Log a warning message.

        Args:
            message: The log message
            category: Log category
            details: Accepted for API compatibility; not used here
        """
        label, line = self._labelled(category, message)
        self.logger.warning(line)
        self._forward(message, "warning", category, label)

    def error(self, message: str, category: str = "error", details: str = None):
        """
        Log an error message.

        Args:
            message: The log message
            category: Log category (defaults to "error")
            details: Accepted for API compatibility; not used here
        """
        label, line = self._labelled(category, message)
        self.logger.error(line)
        self._forward(message, "error", category, label)

    def debug(self, message: str, category: str = "workflow", details: str = None):
        """
        Log a debug message to the standard logger only.

        Args:
            message: The log message
            category: Log category
            details: Accepted for API compatibility; not used here
        """
        _, line = self._labelled(category, message)
        self.logger.debug(line)

    def get_category_name(self, category: str) -> str:
        """
        Get human-readable category name.

        Args:
            category: Category code

        Returns:
            The label for a known code, otherwise the code itself
        """
        if category in self.categories:
            return self.categories[category]
        return category


# File-name suffixes (lower case) that is_text_extractable() reports as text
_TEXT_EXTENSIONS = ('.txt', '.md', '.json', '.xml', '.html', '.htm', '.css', '.js', '.py', '.csv')

# Further suffixes that extract_text_from_file_content() decodes as plain text
_EXTRA_TEXT_EXTENSIONS = ('.log', '.ini', '.cfg', '.conf')

# Spreadsheet suffixes; handled through pandas
_EXCEL_EXTENSIONS = ('.xlsx', '.xls')

# Image, video and audio suffixes from which no text is extracted
_NON_TEXT_EXTENSIONS = ('.jpg', '.jpeg', '.png', '.gif', '.bmp', '.webp', '.svg',
                        '.mp4', '.avi', '.mov', '.mkv', '.flv', '.wmv',
                        '.mp3', '.wav', '.ogg', '.flac', '.aac')


def _module_available(*module_names: str) -> bool:
    """Return True if at least one of the named modules can be imported."""
    import importlib

    for module_name in module_names:
        try:
            importlib.import_module(module_name)
        except Exception:
            # Capability probe only: a missing or broken optional
            # dependency means "not available", not a crash.
            continue
        return True
    return False


def _decode_text(file_content: bytes) -> str:
    """Decode bytes as UTF-8, falling back to cp1252 and finally latin1."""
    # cp1252 is tried before latin1 because latin1 accepts every byte
    # sequence: with latin1 first, the cp1252 fallback could never run and
    # Windows "smart quotes" (0x91-0x94) came out as control characters.
    for encoding in ('utf-8', 'cp1252'):
        try:
            return file_content.decode(encoding)
        except UnicodeDecodeError:
            continue
    return file_content.decode('latin1')


def _extract_excel_text(file_content: bytes, file_name: str) -> Tuple[str, bool]:
    """Render the first sheet of an Excel file as text using pandas."""
    try:
        import pandas as pd
    except ImportError:
        return f"[Excel file: {file_name} - pandas not installed]", False

    try:
        df = pd.read_excel(BytesIO(file_content))
        # Column labels are not necessarily strings (e.g. numeric headers)
        columns = ', '.join(str(column) for column in df.columns)
        result = f"Excel file with {len(df)} rows and {len(df.columns)} columns.\n"
        result += f"Columns: {columns}\n\n"
        result += df.to_string(index=False)
        return result, True
    except Exception as e:
        return f"[Error extracting Excel content: {str(e)}]", False


def _extract_pdf_text(file_content: bytes, file_name: str) -> Tuple[str, bool]:
    """Extract page text from a PDF with PyPDF2, or PyMuPDF as fallback."""
    try:
        try:
            from PyPDF2 import PdfReader
            reader = PdfReader(BytesIO(file_content))
            # extract_text() may yield None for pages without a text layer
            pages = [(page.extract_text() or "") + "\n\n" for page in reader.pages]
            return "".join(pages), True
        except ImportError:
            try:
                import fitz  # PyMuPDF
            except ImportError:
                return f"[PDF: {file_name} - No PDF library installed]", False

            doc = fitz.open(stream=file_content, filetype="pdf")
            try:
                return "".join(page.get_text() + "\n\n" for page in doc), True
            finally:
                # Release the document even if a page fails to render
                doc.close()
    except Exception as e:
        return f"[Error reading PDF file {file_name}: {str(e)}]", False


def extract_text_from_file_content(file_content: bytes, file_name: str, content_type: str = None) -> Tuple[str, bool]:
    """
    Extract text from various file formats based on binary content.

    Files are dispatched by extension (compared case-insensitively):
    plain-text formats, including CSV, are decoded directly; Excel files
    are rendered through pandas; PDFs through PyPDF2 or PyMuPDF. Anything
    else is decoded as UTF-8 with undecodable bytes replaced. Failures are
    reported in the returned text rather than raised.

    Args:
        file_content: Binary content of the file
        file_name: Name of the file for format detection
        content_type: Optional MIME type of the file

    Returns:
        Tuple with (extracted text, is_extracted flag); when the flag is
        False the text is a bracketed explanation of what went wrong
    """
    # Check if file is likely text-extractable
    if not is_text_extractable(file_name, content_type):
        return f"[File: {file_name} - Text extraction not supported]", False

    name = file_name.lower()

    try:
        is_plain_text = name.endswith(_TEXT_EXTENSIONS + _EXTRA_TEXT_EXTENSIONS) or bool(
            content_type
            and (content_type.startswith('text/')
                 or content_type in ('application/json', 'application/xml'))
        )

        # Simple text files. CSV is handled here as raw text: '.csv' is in
        # the plain-text list, so a separate pandas-based CSV branch placed
        # after this check could never be reached.
        if is_plain_text:
            return _decode_text(file_content), True

        # Excel files
        if name.endswith(_EXCEL_EXTENSIONS):
            return _extract_excel_text(file_content, file_name)

        # PDF files
        if name.endswith('.pdf'):
            return _extract_pdf_text(file_content, file_name)

        # Default case - try basic text extraction
        return file_content.decode('utf-8', errors='replace'), True

    except Exception as e:
        logger.error("Error extracting text from %s: %s", file_name, e)
        return f"[Text extraction error: {str(e)}]", False


def is_text_extractable(file_name: str, content_type: str = None) -> bool:
    """
    Check if text can be extracted from a file.

    The decision is taken from the file extension first (compared
    case-insensitively) and from the MIME type only when the extension is
    not recognised. Unknown files default to True so that an extraction
    attempt is still made.

    Args:
        file_name: Name of the file
        content_type: MIME type (optional)

    Returns:
        True if text can be extracted, False otherwise
    """
    name = file_name.lower()

    # Text files
    if name.endswith(_TEXT_EXTENSIONS):
        return True

    # Excel files need pandas
    if name.endswith(_EXCEL_EXTENSIONS):
        return _module_available('pandas')

    # PDF files need PyPDF2 or PyMuPDF (imported as "fitz")
    if name.endswith('.pdf'):
        return _module_available('PyPDF2', 'fitz')

    # Images and other non-text files
    if name.endswith(_NON_TEXT_EXTENSIONS):
        return False

    # Check content type if file extension doesn't give a clear answer
    if content_type:
        if content_type.startswith(('text/', 'application/json', 'application/xml')):
            return True
        if content_type == 'application/pdf':
            return True
        if content_type.startswith(('image/', 'video/', 'audio/')):
            return False

    # Default to allowing extraction attempt
    return True