# gateway/modules/workflow/documentManager.py

"""
Document Manager Module for handling document operations and content extraction.
"""

import logging
from typing import Dict, Any, List, Optional
from datetime import datetime
from modules.interfaces.serviceChatModel import ChatDocument, ChatContent
from modules.workflow.documentProcessor import getDocumentContents
import uuid
import json
import base64

logger = logging.getLogger(__name__)

class DocumentManager:
    """Manager for document operations and content extraction."""

    _instance = None

    @classmethod
    def getInstance(cls):
        """
        Return the shared document manager, creating it on first use.

        Returns:
            The instance stored in the class attribute ``_instance``
        """
        existing = cls._instance
        if existing is not None:
            return existing

        # Construct while _instance is still None so the guard in __init__
        # lets this one construction through.
        cls._instance = cls()
        return cls._instance

    def __init__(self):
        """
        Create the manager with no service attached.

        Raises:
            RuntimeError: If ``DocumentManager._instance`` is already set
        """
        singletonExists = DocumentManager._instance is not None
        if singletonExists:
            raise RuntimeError("Singleton instance already exists - use getInstance()")

        # The service container is supplied later through initialize()
        self.service = None

    def initialize(self, service=None):
        """Initialize or update the manager with service references."""
        if service:
            # Validate required interfaces
            required_interfaces = ['base', 'msft', 'google']
            missing_interfaces = []
            for interface in required_interfaces:
                if not hasattr(service, interface):
                    missing_interfaces.append(interface)

            if missing_interfaces:
                logger.warning(f"Service container missing required interfaces: {', '.join(missing_interfaces)}")
                return False

        self.service = service
        return True

    async def extractContent(self, fileId: str) -> Optional[ChatDocument]:
        """
        Extract content from a file.

        Args:
            fileId: ID of the file to extract content from

        Returns:
            ChatDocument object if successful, None otherwise
        """
        try:
            # Get file content
            fileContent = await self.getFileContent(fileId)
            if not fileContent:
                return None

            # Get file metadata
            fileMetadata = await self.getFileMetadata(fileId)
            if not fileMetadata:
                return None

            # Create ChatDocument
            return ChatDocument(
                id=str(uuid.uuid4()),
                fileId=fileId,
                filename=fileMetadata.get("name", "Unknown"),
                fileSize=fileMetadata.get("size", 0),
                content=fileContent.decode('utf-8', errors='ignore'),
                mimeType=fileMetadata.get("mimeType", "text/plain")
            )
        except Exception as e:
            logger.error(f"Error extracting content from file {fileId}: {str(e)}")
            return None

    async def getFileContent(self, fileId: str) -> Optional[bytes]:
        """Gets the content of a file."""
        try:
            return self.service.functions.getFileData(fileId)
        except Exception as e:
            logger.error(f"Error getting file content for {fileId}: {str(e)}")
            return None

    async def getFileMetadata(self, fileId: str) -> Optional[Dict[str, Any]]:
        """Gets the metadata of a file."""
        try:
            return self.service.functions.getFile(fileId)
        except Exception as e:
            logger.error(f"Error getting file metadata for {fileId}: {str(e)}")
            return None

    async def saveFile(self, filename: str, content: bytes, mimeType: str) -> Optional[int]:
        """
        Save a new file.

        Args:
            filename: Name of the file
            content: File content as bytes
            mimeType: MIME type of the file

        Returns:
            File ID if successful, None otherwise
        """
        try:
            return await self.service.base.saveFile(filename, content, mimeType)
        except Exception as e:
            logger.error(f"Error saving file {filename}: {str(e)}")
            return None

    async def deleteFile(self, fileId: str) -> bool:
        """Deletes a file."""
        try:
            return self.service.functions.deleteFile(fileId)
        except Exception as e:
            logger.error(f"Error deleting file {fileId}: {str(e)}")
            return False

    async def convertFileRefToId(self, ref: str) -> Optional[int]:
        """
        Convert agent file reference to file ID.

        Args:
            ref: File reference in format 'filename;id' or just 'id'

        Returns:
            File ID if successful, None otherwise
        """
        try:
            # Extract file ID from reference format
            if isinstance(ref, str) and ';' in ref:
                return int(ref.split(';')[1])
            return int(ref)
        except Exception as e:
            logger.error(f"Error converting file reference to ID: {str(e)}")
            return None

    async def convertFileIdToRef(self, fileId: str) -> Optional[str]:
        """
        Convert file ID to agent file reference.

        Args:
            fileId: File ID to convert

        Returns:
            File reference in format 'filename;id' if successful, None otherwise
        """
        try:
            file = await self.getFileMetadata(fileId)
            if not file:
                return None
            return f"{file['name']};{fileId}"
        except Exception as e:
            logger.error(f"Error converting file ID to reference: {str(e)}")
            return None

    async def convertDataFormat(self, data: Any, format: str) -> Any:
        """
        Convert data between different formats.

        Args:
            data: Data to convert
            format: Target format ('json', 'base64', etc.)

        Returns:
            Converted data
        """
        try:
            if format == 'json':
                if isinstance(data, str):
                    return json.loads(data)
                return json.dumps(data)
            elif format == 'base64':
                if isinstance(data, str):
                    return base64.b64encode(data.encode('utf-8')).decode('utf-8')
                return base64.b64encode(data).decode('utf-8')
            return data
        except Exception as e:
            logger.error(f"Error converting data format: {str(e)}")
            return data

    async def createAgentInputFileList(self, files: List[str]) -> List[Dict[str, Any]]:
        """
        Create a list of input files for agent processing.

        Args:
            files: List of file references

        Returns:
            List of file objects with content
        """
        try:
            inputFiles = []
            for file in files:
                fileId = await self.convertFileRefToId(file)
                if fileId:
                    fileData = await self.getFileMetadata(fileId)
                    if fileData:
                        content = await self.getFileContent(fileId)
                        inputFiles.append({
                            'id': fileId,
                            'name': fileData['name'],
                            'mimeType': fileData['mimeType'],
                            'content': content
                        })
            return inputFiles
        except Exception as e:
            logger.error(f"Error creating agent input file list: {str(e)}")
            return []

    async def saveAgentOutputFiles(self, files: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """
        Save output files from agent processing.

        Args:
            files: List of file objects with content

        Returns:
            List of saved file metadata
        """
        try:
            savedFiles = []
            for file in files:
                # Create file metadata
                fileMeta = await self.saveFile(
                    filename=file['name'],
                    content=file['content'],
                    mimeType=file.get('mimeType', 'application/octet-stream')
                )

                if fileMeta:
                    savedFiles.append({
                        'id': fileMeta,
                        'name': file['name'],
                        'mimeType': file.get('mimeType', 'application/octet-stream')
                    })
            return savedFiles
        except Exception as e:
            logger.error(f"Error saving agent output files: {str(e)}")
            return []

    async def contentWithPrompt(self, document: Dict[str, Any], prompt: str) -> Optional[Dict[str, Any]]:
        """
        Extract content from a document using AI with a specific prompt.
        Handles large files by processing in chunks and merging results.

        Args:
            document: Document object with file information
            prompt: Specific prompt for content extraction

        Returns:
            Dictionary with extracted content and metadata
        """
        try:
            # First get the document content
            chat_doc = await self.extractContent(document.get('id'))
            if not chat_doc:
                return None

            # Prepare the content for AI processing
            content = chat_doc.content
            mime_type = chat_doc.mimeType

            # For large files, process in chunks
            if len(content) > 100000:  # Arbitrary threshold, adjust as needed
                chunks = self._splitContentIntoChunks(content, mime_type)
                extracted_chunks = []

                for chunk in chunks:
                    # Process each chunk with AI
                    chunk_result = await self._processContentChunk(chunk, prompt)
                    if chunk_result:
                        extracted_chunks.append(chunk_result)

                # Merge results
                return {
                    "content": self._mergeChunkResults(extracted_chunks),
                    "metadata": {
                        "original_size": len(content),
                        "chunks_processed": len(chunks),
                        "mime_type": mime_type
                    }
                }
            else:
                # Process single chunk
                result = await self._processContentChunk(content, prompt)
                return {
                    "content": result,
                    "metadata": {
                        "original_size": len(content),
                        "chunks_processed": 1,
                        "mime_type": mime_type
                    }
                }

        except Exception as e:
            logger.error(f"Error in contentWithPrompt: {str(e)}")
            return None

    def _splitContentIntoChunks(self, content: str, mime_type: str, chunkSize: int = 10000) -> List[str]:
        """
        Split content into manageable chunks based on mime type.

        Args:
            content: Content to split
            mime_type: MIME type of the content
            chunkSize: Number of characters per chunk for content that is
                neither text nor JSON; must be positive

        Returns:
            List of content chunks: non-empty, stripped blank-line-separated
            paragraphs for 'text/*'; one serialized item per element for a
            JSON array (other JSON is returned whole); fixed-size slices
            otherwise. On any error the content is returned as one chunk.
        """
        try:
            if mime_type.startswith('text/'):
                paragraphs = (part.strip() for part in content.split('\n\n'))
                return [part for part in paragraphs if part]

            if mime_type == 'application/json':
                data = json.loads(content)
                if isinstance(data, list):
                    return [json.dumps(item) for item in data]
                return [content]

            # Reject a non-positive size explicitly instead of silently
            # producing no chunks (negative) or failing inside range() (zero)
            if chunkSize <= 0:
                raise ValueError(f"chunkSize must be positive, got {chunkSize}")
            return [content[i:i + chunkSize] for i in range(0, len(content), chunkSize)]
        except Exception as e:
            logger.error(f"Error splitting content: {str(e)}")
            return [content]

    async def _processContentChunk(self, chunk: str, prompt: str) -> Optional[str]:
        """
        Process a single content chunk with AI.

        Args:
            chunk: Content chunk to process
            prompt: Extraction prompt

        Returns:
            Processed content
        """
        try:
            # Create AI prompt
            ai_prompt = f"""
            Extract relevant information from this content based on the following prompt:

            PROMPT: {prompt}

            CONTENT:
            {chunk}

            Return ONLY the extracted information in a clear, concise format.
            """

            # Get AI response
            response = await self.service.base.callAi([
                {"role": "system", "content": "You are an expert at extracting relevant information from documents."},
                {"role": "user", "content": ai_prompt}
            ])

            return response.strip()

        except Exception as e:
            logger.error(f"Error processing content chunk: {str(e)}")
            return None

    def _mergeChunkResults(self, chunks: List[str]) -> str:
        """
        Join processed chunks into one string separated by blank lines.

        Empty and whitespace-only chunks are dropped; the remaining chunks
        are joined as-is, in their original order.

        Args:
            chunks: List of processed chunks

        Returns:
            Merged content, or an empty string if merging fails
        """
        try:
            kept = []
            for piece in chunks:
                # Skip falsy entries and strings that contain only whitespace
                if piece and piece.strip():
                    kept.append(piece)

            separator = "\n\n"
            return separator.join(kept)

        except Exception as e:
            logger.error(f"Error merging chunk results: {str(e)}")
            return ""

# Module-level accessor for the shared document manager
def getDocumentManager():
    """Return the singleton obtained from DocumentManager.getInstance()."""
    manager = DocumentManager.getInstance()
    return manager