174 lines
No EOL
5.7 KiB
Python
174 lines
No EOL
5.7 KiB
Python
"""
|
|
Document Manager Module for handling document operations and content extraction.
|
|
"""
|
|
|
|
import logging
|
|
from typing import Dict, Any, List, Optional
|
|
from datetime import datetime
|
|
from modules.interfaces.serviceChatModel import ChatDocument, ChatContent
|
|
from modules.workflow.documentProcessor import getDocumentContents
|
|
|
|
# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)
|
|
|
|
class DocumentManager:
    """Singleton manager for document operations and content extraction.

    File operations are delegated to the ``base`` interface of the service
    container attached through ``initialize()``. Obtain the manager with
    ``getInstance()``; calling the constructor a second time raises.
    """

    # The single shared instance, created lazily by getInstance().
    _instance = None

    @classmethod
    def getInstance(cls):
        """Return the singleton instance, creating it on first use."""
        if cls._instance is None:
            cls._instance = cls()
        return cls._instance

    def __init__(self):
        """Create the manager with no service attached.

        Raises:
            RuntimeError: If the singleton instance already exists.
        """
        if DocumentManager._instance is not None:
            raise RuntimeError("Singleton instance already exists - use getInstance()")

        self.service = None

    def initialize(self, service=None):
        """Attach or replace the service container used for file operations.

        Args:
            service: Object that must expose ``base``, ``msft`` and ``google``
                attributes. A falsy value leaves the current service untouched.

        Returns:
            False if ``service`` was given but lacks a required attribute (the
            previously attached service is kept); True otherwise.
        """
        if service:
            # Validate required interfaces
            required_interfaces = ['base', 'msft', 'google']
            missing_interfaces = [
                interface
                for interface in required_interfaces
                if not hasattr(service, interface)
            ]

            if missing_interfaces:
                logger.warning(f"Service container missing required interfaces: {', '.join(missing_interfaces)}")
                return False

            self.service = service
        return True

    def _getBaseService(self):
        """Return the ``base`` interface of the attached service container.

        Raises:
            RuntimeError: If no service has been attached via initialize().
        """
        service = getattr(self, "service", None)
        if service is None:
            # Fail with an explicit message instead of an opaque
            # "'NoneType' object has no attribute 'base'" AttributeError.
            raise RuntimeError("Document manager is not initialized - call initialize() first")
        return service.base

    async def extractContent(self, fileId: int) -> Optional["ChatDocument"]:
        """
        Extract content from a file.

        Args:
            fileId: ID of the file to process

        Returns:
            ChatDocument object with extracted content, or None if the file
            metadata or content is missing/empty or any error occurs
        """
        try:
            # Get file metadata and content from service
            base = self._getBaseService()
            fileMetadata = await base.getFileMetadata(fileId)
            fileContent = await base.getFileContent(fileId)

            # Falsy metadata or content (including empty content) is treated
            # as a retrieval failure.
            if not fileMetadata or not fileContent:
                logger.error(f"Could not retrieve file data for fileId {fileId}")
                return None

            # Extract content using documentProcessor
            contents = getDocumentContents(fileMetadata, fileContent)

            # Create ChatDocument
            return ChatDocument(
                id=str(fileId),  # Using fileId as document id
                fileId=fileId,
                filename=fileMetadata.get("name", "unknown"),
                fileSize=fileMetadata.get("size", 0),
                mimeType=fileMetadata.get("mimeType", "application/octet-stream"),
                contents=contents,
            )

        except Exception as e:
            logger.error(f"Error extracting content from file {fileId}: {str(e)}", exc_info=True)
            return None

    async def processFileIds(self, fileIds: List[int]) -> List["ChatDocument"]:
        """
        Process multiple files and extract their contents.

        Files are processed one after another in the given order; files whose
        extraction yields no document are skipped.

        Args:
            fileIds: List of file IDs to process

        Returns:
            List of ChatDocument objects for the files that could be processed
        """
        documents = []
        for fileId in fileIds:
            try:
                document = await self.extractContent(fileId)
                if document:
                    documents.append(document)
            except Exception as e:
                # Best effort: log the failure and move on to the next file.
                logger.error(f"Error processing file {fileId}: {str(e)}", exc_info=True)
        return documents

    async def getFileContent(self, fileId: int) -> Optional[bytes]:
        """
        Get raw file content.

        Args:
            fileId: ID of the file

        Returns:
            Whatever the base service returns for the file (expected: bytes),
            or None if the call raised
        """
        try:
            return await self._getBaseService().getFileContent(fileId)
        except Exception as e:
            logger.error(f"Error getting file content for {fileId}: {str(e)}", exc_info=True)
            return None

    async def getFileMetadata(self, fileId: int) -> Optional[Dict[str, Any]]:
        """
        Get file metadata.

        Args:
            fileId: ID of the file

        Returns:
            Whatever the base service returns for the file (expected: a
            metadata dictionary), or None if the call raised
        """
        try:
            return await self._getBaseService().getFileMetadata(fileId)
        except Exception as e:
            logger.error(f"Error getting file metadata for {fileId}: {str(e)}", exc_info=True)
            return None

    async def saveFile(self, filename: str, content: bytes, mimeType: str) -> Optional[int]:
        """
        Save a new file.

        Args:
            filename: Name of the file
            content: File content as bytes
            mimeType: MIME type of the file

        Returns:
            The value returned by the base service (expected: the new file
            ID), or None if the call raised
        """
        try:
            return await self._getBaseService().saveFile(filename, content, mimeType)
        except Exception as e:
            logger.error(f"Error saving file {filename}: {str(e)}", exc_info=True)
            return None

    async def deleteFile(self, fileId: int) -> bool:
        """
        Delete a file.

        Args:
            fileId: ID of the file to delete

        Returns:
            The value returned by the base service (expected: True on
            success), or False if the call raised
        """
        try:
            return await self._getBaseService().deleteFile(fileId)
        except Exception as e:
            logger.error(f"Error deleting file {fileId}: {str(e)}", exc_info=True)
            return False
|
|
|
|
# Singleton factory for the document manager
def getDocumentManager() -> "DocumentManager":
    """Return the shared DocumentManager obtained from DocumentManager.getInstance()."""
    return DocumentManager.getInstance()