"""
Document Manager Module for handling document operations and content extraction.
"""

import base64
import json
import logging
import uuid
from datetime import datetime
from typing import Any, Dict, List, Optional

from modules.interfaces.serviceChatModel import ChatDocument, ChatContent
from modules.workflow.documentProcessor import getDocumentContents

logger = logging.getLogger(__name__)


class DocumentManager:
    """Manager for document operations and content extraction.

    The class is a process-wide singleton: obtain it through
    ``getInstance()`` (or the module-level ``getDocumentManager()``) and
    attach a service container with ``initialize()`` before calling any of
    the file or AI helpers. All helpers are best-effort: failures are logged
    and reported through a ``None`` / ``False`` / empty-list return value
    rather than raised.
    """

    _instance = None

    # Attributes the service container must expose to be accepted by initialize().
    _REQUIRED_INTERFACES = ('base', 'msft', 'google')

    # Documents longer than this many characters are processed chunk by chunk.
    # Arbitrary threshold, adjust as needed.
    _LARGE_CONTENT_THRESHOLD = 100000

    # Slice length used for content that is neither text/* nor JSON.
    _DEFAULT_CHUNK_SIZE = 10000

    @classmethod
    def getInstance(cls):
        """Return the singleton instance of the document manager, creating it on first use."""
        if cls._instance is None:
            cls._instance = cls()
        return cls._instance

    def __init__(self):
        """Initialize the document manager.

        Raises:
            RuntimeError: If the singleton instance already exists.
        """
        if DocumentManager._instance is not None:
            raise RuntimeError("Singleton instance already exists - use getInstance()")
        self.service = None

    def initialize(self, service=None):
        """Initialize or update the manager with service references.

        Args:
            service: Service container. It is accepted only if it exposes the
                ``base``, ``msft`` and ``google`` attributes.

        Returns:
            False if a service was passed but lacks a required interface
            (the current service is left untouched), True otherwise. Passing
            no service is a no-op that returns True.
        """
        if service:
            # Validate required interfaces
            missing_interfaces = [
                interface
                for interface in self._REQUIRED_INTERFACES
                if not hasattr(service, interface)
            ]
            if missing_interfaces:
                logger.warning(f"Service container missing required interfaces: {', '.join(missing_interfaces)}")
                return False

            self.service = service
        return True

    async def extractContent(self, fileId: str) -> Optional[ChatDocument]:
        """
        Extract content from a file.

        Args:
            fileId: ID of the file to extract content from

        Returns:
            ChatDocument object if successful, None otherwise (including when
            the file has no content or no metadata)
        """
        try:
            # Get file content
            fileContent = await self.getFileContent(fileId)
            if not fileContent:
                return None

            # Get file metadata
            fileMetadata = await self.getFileMetadata(fileId)
            if not fileMetadata:
                return None

            # The backend is expected to return bytes; tolerate content that
            # is already text instead of failing on a missing .decode().
            if isinstance(fileContent, str):
                text = fileContent
            else:
                text = fileContent.decode('utf-8', errors='ignore')

            # Create ChatDocument
            return ChatDocument(
                id=str(uuid.uuid4()),
                fileId=fileId,
                filename=fileMetadata.get("name", "Unknown"),
                fileSize=fileMetadata.get("size", 0),
                content=text,
                mimeType=fileMetadata.get("mimeType", "text/plain")
            )
        except Exception as e:
            logger.error(f"Error extracting content from file {fileId}: {str(e)}")
            return None

    async def getFileContent(self, fileId: str) -> Optional[bytes]:
        """Gets the content of a file, or None if the lookup raises."""
        try:
            return self.service.functions.getFileData(fileId)
        except Exception as e:
            logger.error(f"Error getting file content for {fileId}: {str(e)}")
            return None

    async def getFileMetadata(self, fileId: str) -> Optional[Dict[str, Any]]:
        """Gets the metadata of a file, or None if the lookup raises."""
        try:
            return self.service.functions.getFile(fileId)
        except Exception as e:
            logger.error(f"Error getting file metadata for {fileId}: {str(e)}")
            return None

    async def saveFile(self, filename: str, content: bytes, mimeType: str) -> Optional[int]:
        """
        Save a new file.

        Args:
            filename: Name of the file
            content: File content as bytes
            mimeType: MIME type of the file

        Returns:
            File ID if successful, None otherwise
        """
        try:
            return await self.service.base.saveFile(filename, content, mimeType)
        except Exception as e:
            logger.error(f"Error saving file {filename}: {str(e)}")
            return None

    async def deleteFile(self, fileId: str) -> bool:
        """Deletes a file. Returns False if the delete call raises."""
        try:
            return self.service.functions.deleteFile(fileId)
        except Exception as e:
            logger.error(f"Error deleting file {fileId}: {str(e)}")
            return False

    async def convertFileRefToId(self, ref: str) -> Optional[int]:
        """
        Convert agent file reference to file ID.

        Args:
            ref: File reference in format 'filename;id' or just 'id'

        Returns:
            File ID if successful, None otherwise
        """
        try:
            # Extract file ID from reference format. The ID is the last
            # segment, so split from the right: the filename itself may
            # contain ';'.
            if isinstance(ref, str) and ';' in ref:
                return int(ref.rsplit(';', 1)[1])
            return int(ref)
        except Exception as e:
            logger.error(f"Error converting file reference to ID: {str(e)}")
            return None

    async def convertFileIdToRef(self, fileId: str) -> Optional[str]:
        """
        Convert file ID to agent file reference.

        Args:
            fileId: File ID to convert

        Returns:
            File reference in format 'filename;id' if successful, None otherwise
        """
        try:
            file = await self.getFileMetadata(fileId)
            if not file:
                return None
            return f"{file['name']};{fileId}"
        except Exception as e:
            logger.error(f"Error converting file ID to reference: {str(e)}")
            return None

    async def convertDataFormat(self, data: Any, format: str) -> Any:
        """
        Convert data between different formats.

        Args:
            data: Data to convert
            format: Target format. 'json' parses a string and serializes
                anything else; 'base64' encodes a string (as UTF-8) or bytes
                to a base64 string. Any other value returns data unchanged.

        Returns:
            Converted data, or the original data if the conversion fails
        """
        try:
            if format == 'json':
                if isinstance(data, str):
                    return json.loads(data)
                return json.dumps(data)
            elif format == 'base64':
                if isinstance(data, str):
                    return base64.b64encode(data.encode('utf-8')).decode('utf-8')
                return base64.b64encode(data).decode('utf-8')
            return data
        except Exception as e:
            logger.error(f"Error converting data format: {str(e)}")
            return data

    async def createAgentInputFileList(self, files: List[str]) -> List[Dict[str, Any]]:
        """
        Create a list of input files for agent processing.

        References that cannot be converted to an ID, or whose metadata
        cannot be loaded, are skipped.

        Args:
            files: List of file references

        Returns:
            List of file objects with content (keys: id, name, mimeType,
            content); an empty list if an unexpected error occurs
        """
        try:
            inputFiles = []
            for file in files:
                fileId = await self.convertFileRefToId(file)
                # None marks a failed conversion; 0 is a valid integer ID.
                if fileId is None:
                    continue

                fileData = await self.getFileMetadata(fileId)
                if not fileData:
                    continue

                content = await self.getFileContent(fileId)
                inputFiles.append({
                    'id': fileId,
                    'name': fileData['name'],
                    'mimeType': fileData['mimeType'],
                    'content': content
                })
            return inputFiles
        except Exception as e:
            logger.error(f"Error creating agent input file list: {str(e)}")
            return []

    async def saveAgentOutputFiles(self, files: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """
        Save output files from agent processing.

        Args:
            files: List of file objects with content. Each needs 'name' and
                'content'; 'mimeType' defaults to 'application/octet-stream'.

        Returns:
            List of saved file metadata (keys: id, name, mimeType) for the
            files that were saved; an empty list if an unexpected error occurs
        """
        try:
            savedFiles = []
            for file in files:
                mimeType = file.get('mimeType', 'application/octet-stream')

                # saveFile returns the new file ID, or None on failure
                fileMeta = await self.saveFile(
                    filename=file['name'],
                    content=file['content'],
                    mimeType=mimeType
                )
                # Compare against None so a file ID of 0 is not dropped.
                if fileMeta is not None:
                    savedFiles.append({
                        'id': fileMeta,
                        'name': file['name'],
                        'mimeType': mimeType
                    })
            return savedFiles
        except Exception as e:
            logger.error(f"Error saving agent output files: {str(e)}")
            return []

    async def contentWithPrompt(self, document: Dict[str, Any], prompt: str) -> Optional[Dict[str, Any]]:
        """
        Extract content from a document using AI with a specific prompt.
        Handles large files by processing in chunks and merging results.

        Args:
            document: Document object with file information; its 'id' entry
                is used as the file ID
            prompt: Specific prompt for content extraction

        Returns:
            Dictionary with extracted content and metadata (original_size,
            chunks_processed, mime_type), or None if the document cannot be
            loaded or an error occurs
        """
        try:
            # First get the document content
            chat_doc = await self.extractContent(document.get('id'))
            if not chat_doc:
                return None

            # Prepare the content for AI processing
            content = chat_doc.content
            mime_type = chat_doc.mimeType

            # For large files, process in chunks
            if len(content) > self._LARGE_CONTENT_THRESHOLD:
                chunks = self._splitContentIntoChunks(content, mime_type)
                extracted_chunks = []

                for chunk in chunks:
                    # Process each chunk with AI; failed chunks are skipped
                    chunk_result = await self._processContentChunk(chunk, prompt)
                    if chunk_result:
                        extracted_chunks.append(chunk_result)

                # Merge results
                return {
                    "content": self._mergeChunkResults(extracted_chunks),
                    "metadata": {
                        "original_size": len(content),
                        # Number of chunks attempted, including any that failed
                        "chunks_processed": len(chunks),
                        "mime_type": mime_type
                    }
                }

            # Process single chunk
            result = await self._processContentChunk(content, prompt)
            return {
                "content": result,
                "metadata": {
                    "original_size": len(content),
                    "chunks_processed": 1,
                    "mime_type": mime_type
                }
            }
        except Exception as e:
            logger.error(f"Error in contentWithPrompt: {str(e)}")
            return None

    def _splitContentIntoChunks(self, content: str, mime_type: str) -> List[str]:
        """
        Split content into manageable chunks based on mime type.

        text/* content is split on blank lines, a JSON list is split into one
        chunk per element (other JSON stays whole), and anything else is cut
        into fixed-size slices.

        Args:
            content: Content to split
            mime_type: MIME type of the content

        Returns:
            List of content chunks; the whole content as a single chunk if
            splitting fails
        """
        try:
            if mime_type.startswith('text/'):
                # Split text content by paragraphs or sections
                return [chunk.strip() for chunk in content.split('\n\n') if chunk.strip()]
            elif mime_type == 'application/json':
                # Split JSON content by objects
                data = json.loads(content)
                if isinstance(data, list):
                    return [json.dumps(item) for item in data]
                return [content]
            else:
                # Default chunking
                size = self._DEFAULT_CHUNK_SIZE
                return [content[i:i + size] for i in range(0, len(content), size)]
        except Exception as e:
            logger.error(f"Error splitting content: {str(e)}")
            return [content]

    async def _processContentChunk(self, chunk: str, prompt: str) -> Optional[str]:
        """
        Process a single content chunk with AI.

        Args:
            chunk: Content chunk to process
            prompt: Extraction prompt

        Returns:
            Processed content with surrounding whitespace stripped, or None
            if the AI call fails
        """
        try:
            # Create AI prompt
            ai_prompt = f"""
            Extract relevant information from this content based on the following prompt:

            PROMPT: {prompt}

            CONTENT:
            {chunk}

            Return ONLY the extracted information in a clear, concise format.
            """

            # Get AI response
            response = await self.service.base.callAi([
                {"role": "system", "content": "You are an expert at extracting relevant information from documents."},
                {"role": "user", "content": ai_prompt}
            ])

            return response.strip()
        except Exception as e:
            logger.error(f"Error processing content chunk: {str(e)}")
            return None

    def _mergeChunkResults(self, chunks: List[str]) -> str:
        """
        Merge processed content chunks into a single result.

        Args:
            chunks: List of processed chunks

        Returns:
            Merged content: non-empty chunks joined by blank lines, with
            exact duplicates kept only at their first occurrence. Empty
            string if merging fails.
        """
        try:
            # Remove duplicates and empty chunks. dict.fromkeys keeps the
            # first occurrence of each chunk and preserves the input order.
            unique_chunks = list(dict.fromkeys(
                chunk for chunk in chunks if chunk and chunk.strip()
            ))

            # Merge chunks with appropriate spacing
            return "\n\n".join(unique_chunks)
        except Exception as e:
            logger.error(f"Error merging chunk results: {str(e)}")
            return ""


# Singleton factory for the document manager
def getDocumentManager():
    """Return the shared DocumentManager instance."""
    return DocumentManager.getInstance()