# gateway/gwserver/modules/agentservice_document_handler.py

"""
Enhanced document handling module for the Agentservice (continued).
"""

import os
import logging
import uuid
from datetime import datetime
from typing import List, Dict, Any, Optional, Tuple, Union

logger = logging.getLogger(__name__)

class DocumentHandler:
    """
    Centralized document handler for consistent document management across the system.

    Messages are plain dicts. Documents live in ``message["documents"]``; each
    document is a dict with an ``id``, a ``source`` dict (``type``, ``id``,
    ``name``, ``content_type``, ``size``, ...) and a ``contents`` list whose
    entries look like ``{"type": "text", "text": ..., "is_extracted": ...}``.
    """

    def __init__(self, workflow_id: Optional[str] = None, lucydom_interface=None, ai_service=None):
        """
        Initialize the document handler.

        Args:
            workflow_id: Optional ID of the workflow this handler works for
            lucydom_interface: Object providing get_file() and read_file_content()
            ai_service: Optional AI service; analyze_image() is used when present
        """
        self.workflow_id = workflow_id
        self.lucydom_interface = lucydom_interface
        self.ai_service = ai_service

        # Imported lazily, at construction time rather than at module import time
        from modules.agentservice_filemanager import get_file_manager
        self.file_manager = get_file_manager()

    def set_workflow_id(self, workflow_id: str):
        """Set or update the workflow ID."""
        self.workflow_id = workflow_id

    def set_lucydom_interface(self, lucydom_interface):
        """Set or update the LucyDOM interface."""
        self.lucydom_interface = lucydom_interface

    def set_ai_service(self, ai_service):
        """Set or update the AI service."""
        self.ai_service = ai_service

    # ------------------------------------------------------------------
    # Private helpers
    # ------------------------------------------------------------------

    @staticmethod
    def _is_image(file_type: Any, content_type: Any) -> bool:
        """Return True when the file type or MIME type marks the file as an image."""
        return file_type == "image" or bool(content_type and content_type.startswith("image/"))

    @staticmethod
    def _copy_message(message: Dict[str, Any]) -> Dict[str, Any]:
        """
        Copy a message so documents can be added or replaced without touching the input.

        dict.copy() alone is shallow: the "documents" list would still be shared
        with the caller's message, so the list is copied as well. The document
        dicts themselves are not copied here.
        """
        copied = message.copy()
        documents = copied.get("documents")
        if isinstance(documents, list):
            copied["documents"] = list(documents)
        return copied

    @staticmethod
    def _with_text_content(document: Dict[str, Any], new_content: Dict[str, Any]) -> Dict[str, Any]:
        """
        Return a copy of the document whose first text entry is replaced by new_content.

        When the document has no text entry yet, new_content is appended instead.
        The input document and its contents list are left unmodified.
        """
        updated = dict(document)
        contents = list(document.get("contents") or [])
        for index, entry in enumerate(contents):
            if entry.get("type") == "text":
                contents[index] = new_content
                break
        else:
            contents.append(new_content)
        updated["contents"] = contents
        return updated

    async def _build_file_content(self, file: Dict[str, Any], file_id: int,
                                  extraction_prompt: Optional[str]) -> Dict[str, Any]:
        """Build the single content entry stored for a file newly added to a message."""
        file_name = file.get("name", "unnamed_file")
        file_type = file.get("type", "unknown")
        content_type = file.get("content_type")

        # Content is only read when a prompt is given or the file is text-like
        is_text_like = (file_type in ("document", "text") or
                        bool(content_type and content_type.startswith("text/")))
        if not (extraction_prompt or is_text_like):
            return {
                "type": "text",
                "text": f"File: {file_name} (content not loaded)",
                "is_extracted": False
            }

        file_content = await self.lucydom_interface.read_file_content(file_id)
        if not file_content:
            return {
                "type": "text",
                "text": f"File content not available for {file_name}",
                "is_extracted": False
            }

        if self._is_image(file_type, content_type):
            can_analyze = (extraction_prompt and self.ai_service and
                           hasattr(self.ai_service, "analyze_image"))
            if not can_analyze:
                # Just a placeholder if no analysis is available
                return {
                    "type": "text",
                    "text": f"Image file: {file_name} (no analysis requested)",
                    "is_extracted": False
                }

            try:
                image_analysis = await self.ai_service.analyze_image(
                    image_data=file_content,
                    prompt=extraction_prompt,
                    mime_type=content_type
                )
            except Exception as e:
                # Best effort: a failed analysis must not prevent attaching the file
                logger.error(f"Error analyzing image {file_name}: {str(e)}")
                return {
                    "type": "text",
                    "text": f"Image file: {file_name} (Analysis failed: {str(e)})",
                    "is_extracted": False
                }

            logger.info(f"Added image analysis for {file_name} to message")
            return {
                "type": "text",
                "text": f"Image Analysis:\n{image_analysis}",
                "is_extracted": True,
                "extraction_context": extraction_prompt
            }

        # For other file types, extract text
        from modules.agentservice_utils import extract_text_from_file_content

        content, is_extracted = extract_text_from_file_content(
            file_content, file_name, content_type
        )
        logger.info(f"Added text content for {file_name} to message (extracted: {is_extracted})")
        return {
            "type": "text",
            "text": content,
            "is_extracted": is_extracted,
            "extraction_context": extraction_prompt
        }

    async def _extract_content_for_document(self, document: Dict[str, Any],
                                            extraction_prompt: str) -> Optional[Dict[str, Any]]:
        """
        Re-read the file behind a document and build a fresh content entry.

        Returns None when nothing can be extracted: no LucyDOM interface, a source
        ID that is not a numeric file ID, a missing file, empty file content, or an
        image that cannot be (or failed to be) analyzed.
        """
        source = document.get("source") or {}
        raw_file_id = source.get("id")
        if not raw_file_id or not self.lucydom_interface:
            return None

        # Documents produced by create_text_document() use their "doc_<uuid>" ID
        # as source ID, so the conversion to a file ID can legitimately fail.
        try:
            file_id = int(raw_file_id)
        except (TypeError, ValueError):
            logger.info(f"Document source {raw_file_id} is not a file, skipping extraction")
            return None

        file = self.lucydom_interface.get_file(file_id)
        if not file:
            return None

        file_content = await self.lucydom_interface.read_file_content(file_id)
        if not file_content:
            return None

        file_name = file.get("name", "unnamed_file")
        file_type = file.get("type", "unknown")
        content_type = file.get("content_type")

        if self._is_image(file_type, content_type):
            if not (self.ai_service and hasattr(self.ai_service, "analyze_image")):
                return None

            try:
                image_analysis = await self.ai_service.analyze_image(
                    image_data=file_content,
                    prompt=extraction_prompt,
                    mime_type=content_type
                )
            except Exception as e:
                # Keep the existing content when the analysis fails
                logger.error(f"Error updating image analysis for {file_name}: {str(e)}")
                return None

            logger.info(f"Updated image analysis for {file_name} with new context: {extraction_prompt}")
            return {
                "type": "text",
                "text": f"Image Analysis:\n{image_analysis}",
                "is_extracted": True,
                "extraction_context": extraction_prompt
            }

        # For other file types, extract text with new context
        from modules.agentservice_utils import extract_text_from_file_content

        content, is_extracted = extract_text_from_file_content(
            file_content, file_name, content_type
        )
        logger.info(f"Updated text extraction for {file_name} with new context: {extraction_prompt}")
        return {
            "type": "text",
            "text": content,
            "is_extracted": is_extracted,
            "extraction_context": extraction_prompt
        }

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------

    async def add_file_to_message(self, message: Dict[str, Any], file_id: int, extraction_prompt: str = None) -> Dict[str, Any]:
        """
        Add a file to a message with optional contextual extraction.

        The message is modified in place and also returned. Errors are logged
        and the message is returned as it is at that point (best effort).

        Args:
            message: The message to add the file to
            file_id: ID of the file to add
            extraction_prompt: Optional prompt for contextual extraction (e.g., for images)

        Returns:
            Updated message with the file added
        """
        if not self.lucydom_interface:
            logger.error("LucyDOM interface not available")
            return message

        try:
            # Get file metadata
            file = self.lucydom_interface.get_file(file_id)
            if not file:
                logger.warning(f"File with ID {file_id} not found")
                return message

            file_name = file.get("name", "unnamed_file")

            # Initialize documents array if needed
            documents = message.setdefault("documents", [])

            # A file is attached at most once per message
            source_id = str(file_id)
            if any((doc.get("source") or {}).get("id") == source_id for doc in documents):
                logger.info(f"File {file_name} already exists in message, skipping")
                return message

            document = {
                "id": f"doc_{uuid.uuid4()}",
                "source": {
                    "type": "file",
                    "id": source_id,
                    "name": file_name,
                    "content_type": file.get("content_type"),
                    "size": file.get("size"),
                    "upload_date": file.get("upload_date", datetime.now().isoformat())
                },
                "contents": [await self._build_file_content(file, file_id, extraction_prompt)]
            }

            documents.append(document)

            logger.info(f"File {file_name} successfully added to message")
            return message

        except Exception as e:
            logger.error(f"Error adding file {file_id} to message: {str(e)}")
            return message

    async def add_files_to_message(self, message: Dict[str, Any], file_ids: List[int], extraction_prompt: str = None) -> Dict[str, Any]:
        """
        Add multiple files to a message.

        Works on a copy of the message (including its documents list), so the
        message passed in is not modified.

        Args:
            message: The message to add files to
            file_ids: List of file IDs to add
            extraction_prompt: Optional prompt for contextual extraction

        Returns:
            Updated message with files added
        """
        updated_message = self._copy_message(message)

        for file_id in file_ids:
            updated_message = await self.add_file_to_message(updated_message, file_id, extraction_prompt)

        return updated_message

    async def extract_document_content(self, doc_id: str, message: Dict[str, Any], extraction_prompt: str) -> Dict[str, Any]:
        """
        Extract or update document content with contextual extraction.

        The first text entry of the matching document is replaced (or a new one
        is appended). The message passed in is not modified. Documents whose
        source is not a readable file are left unchanged.

        Args:
            doc_id: ID of the document to extract
            message: Message containing the document
            extraction_prompt: Contextual prompt for extraction

        Returns:
            Updated message with extracted content
        """
        if not message or "documents" not in message:
            return message

        updated_message = self._copy_message(message)

        for index, document in enumerate(updated_message.get("documents") or []):
            if document.get("id") != doc_id:
                continue

            new_content = await self._extract_content_for_document(document, extraction_prompt)
            if new_content is not None:
                updated_message["documents"][index] = self._with_text_content(document, new_content)

            # Found and processed the document, stop searching
            break

        return updated_message

    async def extract_files_from_workflow(self, workflow: Dict[str, Any], extraction_prompt: str, file_filter: str = None) -> Dict[str, Any]:
        """
        Extract all relevant files from a workflow with context-aware extraction.

        Args:
            workflow: The workflow object
            extraction_prompt: Contextual prompt for extraction
            file_filter: Optional filter for file types (e.g., "csv", "image");
                matched case-insensitively as a substring of the file name or
                content type

        Returns:
            Dictionary with extracted content
        """
        needle = file_filter.lower() if file_filter else None

        # Collect every distinct file referenced by the workflow's messages
        files = []
        seen_ids = set()

        for message in workflow.get("messages", []):
            for doc in message.get("documents", []):
                source = doc.get("source", {})

                # Only include file documents
                if source.get("type") != "file":
                    continue

                file_info = {
                    "id": source.get("id", ""),
                    "name": source.get("name", ""),
                    "type": source.get("type", ""),
                    "content_type": source.get("content_type", ""),
                    "size": source.get("size", 0)
                }

                if needle is not None:
                    # name/content_type may be stored as None, so normalize first
                    file_name = (file_info["name"] or "").lower()
                    content_type = (file_info["content_type"] or "").lower()
                    if needle not in file_name and needle not in content_type:
                        continue

                if file_info["id"] in seen_ids:
                    continue
                seen_ids.add(file_info["id"])
                files.append(file_info)

        # If no files found, return empty result
        if not files:
            return {
                "prompt": extraction_prompt,
                "files_processed": 0,
                "extracted_content": []
            }

        # Only needed when there is something to extract
        from modules.agentservice_dataextraction import data_extraction

        extracted_data = await data_extraction(
            prompt=extraction_prompt,
            files=files,
            messages=workflow.get("messages", []),
            ai_service=self.ai_service,
            lucydom_interface=self.lucydom_interface,
            workflow_id=self.workflow_id,
            add_log_func=None  # We don't have access to add_log_func here
        )

        return extracted_data

    def get_file_content_from_message(self, message: Dict[str, Any], file_id: int = None, doc_id: str = None) -> str:
        """
        Get file content from a message.

        Args:
            message: The message containing the document
            file_id: Optional file ID to search for
            doc_id: Optional document ID to search for

        Returns:
            Text content of the file if available, otherwise an empty string
        """
        if not message or "documents" not in message:
            return ""

        for document in message.get("documents", []):
            source = document.get("source", {})
            source_file_id = source.get("id")

            matches_doc = bool(doc_id) and document.get("id") == doc_id
            # Compare against None explicitly so that file ID 0 can be found
            matches_file = (file_id is not None and source_file_id is not None and
                            str(file_id) == str(source_file_id))
            if not (matches_doc or matches_file):
                continue

            # Return the first text entry of the matching document
            for content in document.get("contents", []):
                if content.get("type") == "text":
                    return content.get("text", "")

        return ""

    def create_text_document(self, message: Dict[str, Any], content: str, title: str = "Generated Text") -> Dict[str, Any]:
        """
        Create a new text document in a message.

        Works on a copy of the message (including its documents list), so the
        message passed in is not modified.

        Args:
            message: The message to add the document to
            content: Text content
            title: Document title

        Returns:
            Updated message with the new document
        """
        updated_message = self._copy_message(message)
        if "documents" not in updated_message:
            updated_message["documents"] = []

        doc_id = f"doc_{uuid.uuid4()}"

        # Generated documents have no backing file: the document ID doubles as source ID
        document = {
            "id": doc_id,
            "source": {
                "type": "generated",
                "id": doc_id,
                "name": title,
                "content_type": "text/plain",
                "size": len(content)
            },
            "contents": [
                {
                    "type": "text",
                    "text": content,
                    "is_extracted": True
                }
            ]
        }

        updated_message["documents"].append(document)

        logger.info(f"Created text document '{title}' in message")
        return updated_message

    def merge_document_contents(self, message: Dict[str, Any]) -> str:
        """
        Merge all document contents from a message into a single text.

        Only the first text entry of each document is used; documents without
        text are skipped. Each section is preceded by a "--- name ---" header.

        Args:
            message: The message containing documents

        Returns:
            Combined text content from all documents
        """
        if not message or "documents" not in message:
            return ""

        sections = []

        for document in message.get("documents", []):
            source = document.get("source", {})
            doc_name = source.get("name", "Unnamed Document")

            # First text entry of the document, or "" when there is none
            doc_text = next(
                (content.get("text", "") for content in document.get("contents", [])
                 if content.get("type") == "text"),
                ""
            )

            if doc_text:
                sections.append(f"\n\n--- {doc_name} ---\n\n{doc_text}")

        return "".join(sections).strip()

# Factory function
def get_document_handler(workflow_id: str = None, lucydom_interface = None, ai_service = None) -> DocumentHandler:
    """
    Build a new DocumentHandler.

    Args:
        workflow_id: Optional workflow ID passed to the handler
        lucydom_interface: Optional LucyDOM interface passed to the handler
        ai_service: Optional AI service passed to the handler

    Returns:
        A freshly constructed DocumentHandler instance
    """
    handler = DocumentHandler(
        workflow_id=workflow_id,
        lucydom_interface=lucydom_interface,
        ai_service=ai_service,
    )
    return handler