# gateway/modules/workflow/managerDocument.py

from typing import Dict, Any, Optional, List
import logging
import json
import os
from datetime import datetime, UTC
from pathlib import Path
import mimetypes
import hashlib
import shutil
import uuid
import base64

from modules.workflow.processorDocument import DocumentProcessor
from modules.shared.configuration import APP_CONFIG
from modules.interfaces.serviceChatModel import ChatDocument, ChatContent

logger = logging.getLogger(__name__)

class DocumentManager:
    """Document manager with enhanced operations and file handling.

    Used as a singleton: ``getInstance()`` creates the instance on first use
    and returns the same object afterwards, while calling the constructor
    directly raises ``RuntimeError`` once an instance has been stored.
    """

    # Singleton slot: ``None`` until ``getInstance()`` stores the instance.
    _instance = None

    @classmethod
    def getInstance(cls):
        """Return a singleton instance of the document manager."""
        if cls._instance is None:
            cls._instance = cls()
        return cls._instance

    def __init__(self):
        """Initialize document manager"""
        if DocumentManager._instance is not None:
            raise RuntimeError("Singleton instance already exists - use getInstance()")

        self.processor = DocumentProcessor()
        self.document_cache = {}
        self.temp_dir = Path(APP_CONFIG.get('temp_dir', 'temp'))
        self.output_dir = Path(APP_CONFIG.get('output_dir', 'output'))
        self.service = None

    async def initialize(self, context: Dict[str, Any], service=None) -> None:
        """Initialize document manager with context and service"""
        # Initialize processor
        self.processor.initialize(context)

        # Initialize service container
        if service:
            # Validate required interfaces
            required_interfaces = ['base', 'msft', 'google']
            missing_interfaces = []
            for interface in required_interfaces:
                if not hasattr(service, interface):
                    missing_interfaces.append(interface)

            if missing_interfaces:
                logger.warning(f"Service container missing required interfaces: {', '.join(missing_interfaces)}")
                return False

            self.service = service

        # Create directories if they don't exist
        self.temp_dir.mkdir(parents=True, exist_ok=True)
        self.output_dir.mkdir(parents=True, exist_ok=True)

        # Clear temporary directory
        self._clear_temp_directory()

    def _clear_temp_directory(self) -> None:
        """Clear temporary directory"""
        try:
            if self.temp_dir.exists():
                shutil.rmtree(self.temp_dir)
            self.temp_dir.mkdir(parents=True)
        except Exception as e:
            logger.error(f"Error clearing temp directory: {str(e)}")

    async def process_document(self, document: Dict[str, Any], context: Dict[str, Any]) -> Dict[str, Any]:
        """Process a document with context"""
        try:
            # Generate document ID if not present
            if 'id' not in document:
                document['id'] = self._generate_document_id(document)

            # Process document content
            processed = await self.processor.process_with_context(document, context)

            # Add metadata
            processed['metadata'] = {
                'processedAt': datetime.now(UTC).isoformat(),
                'processor': 'DocumentManager',
                'version': '1.0'
            }

            # Cache document
            self.document_cache[document['id']] = processed

            return processed

        except Exception as e:
            logger.error(f"Error processing document: {str(e)}")
            return {
                'id': document.get('id', ''),
                'error': str(e),
                'status': 'error'
            }

    async def extract_content(self, file_id: str) -> Optional[ChatDocument]:
        """Extract content from a file"""
        try:
            # Get file content
            file_content = await self.get_file_content(file_id)
            if not file_content:
                return None

            # Get file metadata
            file_metadata = await self.get_file_metadata(file_id)
            if not file_metadata:
                return None

            # Create ChatDocument
            return ChatDocument(
                id=str(uuid.uuid4()),
                fileId=file_id,
                filename=file_metadata.get("name", "Unknown"),
                fileSize=file_metadata.get("size", 0),
                content=file_content.decode('utf-8', errors='ignore'),
                mimeType=file_metadata.get("mimeType", "text/plain")
            )
        except Exception as e:
            logger.error(f"Error extracting content from file {file_id}: {str(e)}")
            return None

    async def get_file_content(self, file_id: str) -> Optional[bytes]:
        """Get file content"""
        try:
            if not self.service or not self.service.functions:
                logger.error("Service or functions not initialized")
                return None
            return self.service.functions.getFileData(file_id)
        except Exception as e:
            logger.error(f"Error getting file content for {file_id}: {str(e)}")
            return None

    async def get_file_metadata(self, file_id: str) -> Optional[Dict[str, Any]]:
        """Get file metadata"""
        try:
            if not self.service or not self.service.functions:
                logger.error("Service or functions not initialized")
                return None
            return self.service.functions.getFile(file_id)
        except Exception as e:
            logger.error(f"Error getting file metadata for {file_id}: {str(e)}")
            return None

    async def save_file(self, filename: str, content: bytes, mime_type: str) -> Optional[int]:
        """Save a new file"""
        try:
            if not self.service or not self.service.base:
                logger.error("Service or base interface not initialized")
                return None
            return await self.service.base.saveFile(filename, content, mime_type)
        except Exception as e:
            logger.error(f"Error saving file {filename}: {str(e)}")
            return None

    async def delete_file(self, file_id: str) -> bool:
        """Delete a file"""
        try:
            if not self.service or not self.service.functions:
                logger.error("Service or functions not initialized")
                return False
            return self.service.functions.deleteFile(file_id)
        except Exception as e:
            logger.error(f"Error deleting file {file_id}: {str(e)}")
            return False

    def convert_file_ref_to_id(self, ref: str) -> Optional[int]:
        """Convert file reference to ID"""
        try:
            if isinstance(ref, str) and ';' in ref:
                return int(ref.split(';')[1])
            return int(ref)
        except Exception as e:
            logger.error(f"Error converting file reference to ID: {str(e)}")
            return None

    def convert_file_id_to_ref(self, file_id: str) -> Optional[str]:
        """Convert file ID to reference"""
        try:
            if not self.service or not self.service.functions:
                logger.error("Service or functions not initialized")
                return None

            file = self.service.functions.getFile(file_id)
            if not file:
                return None
            return f"{file.filename};{file_id}"
        except Exception as e:
            logger.error(f"Error converting file ID to reference: {str(e)}")
            return None

    async def convert_data_format(self, data: Any, format: str) -> Any:
        """Convert data between formats"""
        try:
            if format == 'json':
                if isinstance(data, str):
                    return json.loads(data)
                return json.dumps(data)
            elif format == 'base64':
                if isinstance(data, str):
                    return base64.b64encode(data.encode('utf-8')).decode('utf-8')
                return base64.b64encode(data).decode('utf-8')
            return data
        except Exception as e:
            logger.error(f"Error converting data format: {str(e)}")
            return data

    async def create_agent_input_file_list(self, files: List[str]) -> List[Dict[str, Any]]:
        """Create list of input files for agent processing"""
        try:
            input_files = []
            for file in files:
                file_id = await self.convert_file_ref_to_id(file)
                if file_id:
                    file_data = await self.get_file_metadata(file_id)
                    if file_data:
                        content = await self.get_file_content(file_id)
                        input_files.append({
                            'id': file_id,
                            'name': file_data['name'],
                            'mimeType': file_data['mimeType'],
                            'content': content
                        })
            return input_files
        except Exception as e:
            logger.error(f"Error creating agent input file list: {str(e)}")
            return []

    async def save_agent_output_files(self, files: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """Save output files from agent processing"""
        try:
            saved_files = []
            for file in files:
                file_meta = await self.save_file(
                    filename=file['name'],
                    content=file['content'],
                    mimeType=file.get('mimeType', 'application/octet-stream')
                )

                if file_meta:
                    saved_files.append({
                        'id': file_meta,
                        'name': file['name'],
                        'mimeType': file.get('mimeType', 'application/octet-stream')
                    })
            return saved_files
        except Exception as e:
            logger.error(f"Error saving agent output files: {str(e)}")
            return []

    async def content_with_prompt(self, document: Dict[str, Any], prompt: str) -> Optional[Dict[str, Any]]:
        """Extract content using AI with specific prompt"""
        try:
            # Get document content
            chat_doc = await self.extract_content(document.get('id'))
            if not chat_doc:
                return None

            # Prepare content
            content = chat_doc.content
            mime_type = chat_doc.mimeType

            # Process large files in chunks
            if len(content) > 100000:
                chunks = self._split_content_into_chunks(content, mime_type)
                extracted_chunks = []

                for chunk in chunks:
                    chunk_result = await self._process_content_chunk(chunk, prompt)
                    if chunk_result:
                        extracted_chunks.append(chunk_result)

                return {
                    "content": self._merge_chunk_results(extracted_chunks),
                    "metadata": {
                        "original_size": len(content),
                        "chunks_processed": len(chunks),
                        "mime_type": mime_type
                    }
                }
            else:
                result = await self._process_content_chunk(content, prompt)
                return {
                    "content": result,
                    "metadata": {
                        "original_size": len(content),
                        "chunks_processed": 1,
                        "mime_type": mime_type
                    }
                }

        except Exception as e:
            logger.error(f"Error in content_with_prompt: {str(e)}")
            return None

    def _split_content_into_chunks(self, content: str, mime_type: str) -> List[str]:
        """Split content into manageable chunks"""
        try:
            if mime_type.startswith('text/'):
                return [chunk.strip() for chunk in content.split('\n\n') if chunk.strip()]
            elif mime_type == 'application/json':
                data = json.loads(content)
                if isinstance(data, list):
                    return [json.dumps(item) for item in data]
                return [content]
            else:
                return [content[i:i+10000] for i in range(0, len(content), 10000)]
        except Exception as e:
            logger.error(f"Error splitting content: {str(e)}")
            return [content]

    async def _process_content_chunk(self, chunk: str, prompt: str) -> Optional[str]:
        """Process content chunk with AI"""
        try:
            if not self.service or not self.service.base:
                logger.error("Service or base interface not initialized")
                return None

            ai_prompt = f"""
            Extract relevant information from this content based on the following prompt:

            PROMPT: {prompt}

            CONTENT:
            {chunk}

            Return ONLY the extracted information in a clear, concise format.
            """

            response = await self.service.base.callAi([
                {"role": "system", "content": "You are an expert at extracting relevant information from documents."},
                {"role": "user", "content": ai_prompt}
            ])

            return response.strip()

        except Exception as e:
            logger.error(f"Error processing content chunk: {str(e)}")
            return None

    def _merge_chunk_results(self, chunks: List[str]) -> str:
        """Merge processed content chunks"""
        try:
            chunks = [chunk for chunk in chunks if chunk and chunk.strip()]
            return "\n\n".join(chunks)
        except Exception as e:
            logger.error(f"Error merging chunk results: {str(e)}")
            return ""

    async def save_document(self, document: Dict[str, Any], format: str = 'json') -> str:
        """Save document to output directory"""
        try:
            filename = f"{document['id']}.{format}"
            filepath = self.output_dir / filename

            if format == 'json':
                with open(filepath, 'w', encoding='utf-8') as f:
                    json.dump(document, f, indent=2)
            else:
                content = document.get('content', '')
                if isinstance(content, str):
                    with open(filepath, 'w', encoding='utf-8') as f:
                        f.write(content)
                else:
                    with open(filepath, 'wb') as f:
                        f.write(content)

            return str(filepath)

        except Exception as e:
            logger.error(f"Error saving document: {str(e)}")
            raise

    async def load_document(self, filepath: str) -> Dict[str, Any]:
        """Load document from file"""
        try:
            path = Path(filepath)
            if not path.exists():
                raise FileNotFoundError(f"Document not found: {filepath}")

            format = path.suffix[1:].lower()

            if format == 'json':
                with open(path, 'r', encoding='utf-8') as f:
                    document = json.load(f)
            else:
                mime_type = mimetypes.guess_type(filepath)[0]
                if mime_type and mime_type.startswith('text/'):
                    with open(path, 'r', encoding='utf-8') as f:
                        content = f.read()
                else:
                    with open(path, 'rb') as f:
                        content = f.read()

                document = {
                    'id': path.stem,
                    'content': content,
                    'format': format,
                    'mime_type': mime_type
                }

            document['metadata'] = {
                'loadedAt': datetime.now(UTC).isoformat(),
                'filepath': str(path),
                'size': path.stat().st_size
            }

            return document

        except Exception as e:
            logger.error(f"Error loading document: {str(e)}")
            raise

    async def convert_document(self, document: Dict[str, Any], target_format: str) -> Dict[str, Any]:
        """Convert document to target format"""
        try:
            current_format = document.get('format', 'json')

            if current_format == 'json' and target_format == 'text':
                content = json.dumps(document, indent=2)
                return {
                    'id': document['id'],
                    'content': content,
                    'format': 'text',
                    'mime_type': 'text/plain'
                }
            elif current_format == 'text' and target_format == 'json':
                try:
                    content = json.loads(document['content'])
                    return {
                        'id': document['id'],
                        'content': content,
                        'format': 'json',
                        'mime_type': 'application/json'
                    }
                except json.JSONDecodeError:
                    return {
                        'id': document['id'],
                        'content': document['content'],
                        'format': 'json',
                        'mime_type': 'application/json'
                    }
            else:
                raise ValueError(f"Unsupported conversion: {current_format} to {target_format}")

        except Exception as e:
            logger.error(f"Error converting document: {str(e)}")
            raise

    def _generate_document_id(self, document: Dict[str, Any]) -> str:
        """Generate unique document ID"""
        if 'content' in document:
            content = str(document['content'])
            return hashlib.md5(content.encode()).hexdigest()
        return f"doc_{int(datetime.now(UTC).timestamp())}"

    async def cleanup(self) -> None:
        """Clean up temporary files and cache"""
        try:
            self._clear_temp_directory()
            self.document_cache.clear()
        except Exception as e:
            logger.error(f"Error during cleanup: {str(e)}")

# Singleton factory for the document manager
def getDocumentManager():
    """Return the shared ``DocumentManager`` via ``DocumentManager.getInstance()``."""
    manager = DocumentManager.getInstance()
    return manager