# Source: gateway/modules/workflow/documentManager.py
# (file-viewer metadata: 174 lines, no EOL, 5.7 KiB, Python)

"""
Document Manager Module for handling document operations and content extraction.
"""
import logging
from typing import Dict, Any, List, Optional
from datetime import datetime
from modules.interfaces.serviceChatModel import ChatDocument, ChatContent
from modules.workflow.documentProcessor import getDocumentContents
# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)
class DocumentManager:
    """Manager for document operations and content extraction.

    One shared instance is kept in ``_instance``; obtain it through
    ``getInstance()``. All file access is delegated to ``self.service.base``,
    so ``initialize()`` has to store a service container before any of the
    file methods can succeed.
    """

    # The shared instance, created lazily by getInstance().
    _instance = None

    @classmethod
    def getInstance(cls):
        """Return a singleton instance of the document manager."""
        if cls._instance is None:
            cls._instance = cls()
        return cls._instance

    def __init__(self):
        """Initialize the document manager.

        Raises:
            RuntimeError: If the singleton instance has already been created.
        """
        if DocumentManager._instance is not None:
            raise RuntimeError("Singleton instance already exists - use getInstance()")
        # Service container; stays None until initialize() accepts one.
        self.service = None

    def initialize(self, service=None):
        """Initialize or update the manager with service references.

        Args:
            service: Service container that must expose the ``base``,
                ``msft`` and ``google`` attributes. A falsy value leaves the
                currently stored service untouched.

        Returns:
            False if ``service`` lacks a required attribute (the stored
            service is then left unchanged), True otherwise.
        """
        if service:
            # Validate required interfaces
            required_interfaces = ['base', 'msft', 'google']
            missing_interfaces = [
                interface
                for interface in required_interfaces
                if not hasattr(service, interface)
            ]
            if missing_interfaces:
                logger.warning(f"Service container missing required interfaces: {', '.join(missing_interfaces)}")
                return False
            self.service = service
        return True

    def _baseService(self):
        """Return ``self.service.base``, failing loudly if no service is set.

        Raises:
            RuntimeError: If no service has been stored yet.
        """
        if self.service is None:
            # Without this check callers only see an opaque
            # "'NoneType' object has no attribute 'base'" in the logs.
            raise RuntimeError("DocumentManager is not initialized - call initialize() first")
        return self.service.base

    async def extractContent(self, fileId: int) -> Optional["ChatDocument"]:
        """
        Extract content from a file.

        Args:
            fileId: ID of the file to process

        Returns:
            ChatDocument object with extracted content or None if processing failed
        """
        try:
            # Get file metadata and content from service
            base = self._baseService()
            fileMetadata = await base.getFileMetadata(fileId)
            fileContent = await base.getFileContent(fileId)
            # Any falsy result counts as a failed retrieval; note that this
            # includes empty metadata and empty (zero-length) content.
            if not fileMetadata or not fileContent:
                logger.error(f"Could not retrieve file data for fileId {fileId}")
                return None
            # Extract content using documentProcessor
            contents = getDocumentContents(fileMetadata, fileContent)
            # Create ChatDocument; missing metadata keys fall back to defaults
            return ChatDocument(
                id=str(fileId),  # Using fileId as document id
                fileId=fileId,
                filename=fileMetadata.get("name", "unknown"),
                fileSize=fileMetadata.get("size", 0),
                mimeType=fileMetadata.get("mimeType", "application/octet-stream"),
                contents=contents,
            )
        except Exception as e:
            logger.error(f"Error extracting content from file {fileId}: {e}", exc_info=True)
            return None

    async def processFileIds(self, fileIds: List[int]) -> List["ChatDocument"]:
        """
        Process multiple files and extract their contents.

        Files are processed one after another; files whose extraction fails
        are skipped, so the result may be shorter than ``fileIds``.

        Args:
            fileIds: List of file IDs to process

        Returns:
            List of ChatDocument objects
        """
        documents = []
        for fileId in fileIds:
            try:
                document = await self.extractContent(fileId)
            except Exception as e:
                logger.error(f"Error processing file {fileId}: {e}")
                continue
            else:
                if document:
                    documents.append(document)
        return documents

    async def getFileContent(self, fileId: int) -> Optional[bytes]:
        """
        Get raw file content.

        Args:
            fileId: ID of the file

        Returns:
            File content as bytes or None if not found
        """
        try:
            return await self._baseService().getFileContent(fileId)
        except Exception as e:
            logger.error(f"Error getting file content for {fileId}: {e}")
            return None

    async def getFileMetadata(self, fileId: int) -> Optional[Dict[str, Any]]:
        """
        Get file metadata.

        Args:
            fileId: ID of the file

        Returns:
            File metadata dictionary or None if not found
        """
        try:
            return await self._baseService().getFileMetadata(fileId)
        except Exception as e:
            logger.error(f"Error getting file metadata for {fileId}: {e}")
            return None

    async def saveFile(self, filename: str, content: bytes, mimeType: str) -> Optional[int]:
        """
        Save a new file.

        Args:
            filename: Name of the file
            content: File content as bytes
            mimeType: MIME type of the file

        Returns:
            File ID if successful, None otherwise
        """
        try:
            return await self._baseService().saveFile(filename, content, mimeType)
        except Exception as e:
            logger.error(f"Error saving file {filename}: {e}")
            return None

    async def deleteFile(self, fileId: int) -> bool:
        """
        Delete a file.

        Args:
            fileId: ID of the file to delete

        Returns:
            True if successful, False otherwise
        """
        try:
            # Coerce so the declared bool contract holds whatever the
            # service returns.
            return bool(await self._baseService().deleteFile(fileId))
        except Exception as e:
            logger.error(f"Error deleting file {fileId}: {e}")
            return False
def getDocumentManager():
    """Singleton factory: return the shared DocumentManager instance."""
    manager = DocumentManager.getInstance()
    return manager