""" Document Manager Module for handling document operations and content extraction. """ import logging from modules.interfaces.interfaceChatModel import ( ChatDocument, ExtractedContent ) from modules.workflow.processorDocument import DocumentProcessor logger = logging.getLogger(__name__) class DocumentManager: """Manager for document operations and content extraction""" def __init__(self, serviceContainer): self.service = serviceContainer # Create processor without any dependencies self._processor = DocumentProcessor() async def extractContentFromDocument(self, prompt: str, document: ChatDocument) -> ExtractedContent: """Extract content from ChatDocument using prompt""" try: # Extract file data from ChatDocument if document.data: fileData = document.data.encode('utf-8') if isinstance(document.data, str) else document.data else: # Try to get file data from service container if document has fileId if hasattr(document, 'fileId') and document.fileId: fileData = self.service.getFileData(document.fileId) else: logger.error(f"No file data available in document: {document}") raise ValueError("No file data available in document") # Get filename and mime type from document filename = document.filename if hasattr(document, 'filename') else "document" mimeType = document.mimeType if hasattr(document, 'mimeType') else "application/octet-stream" # Process with processor extractedContent = await self._processor.processFileData( fileData=fileData, filename=filename, mimeType=mimeType, base64Encoded=False, prompt=prompt ) # Update objectId to match document ID extractedContent.objectId = document.id extractedContent.objectType = "ChatDocument" return extractedContent except Exception as e: logger.error(f"Error extracting from document: {str(e)}") raise async def extractContentFromFileData(self, prompt: str, fileData: bytes, filename: str, mimeType: str, base64Encoded: bool = False, documentId: str = None) -> ExtractedContent: """Extract content from file data directly using prompt""" try: return await self._processor.processFileData( fileData=fileData, filename=filename, mimeType=mimeType, base64Encoded=base64Encoded, prompt=prompt, documentId=documentId ) except Exception as e: logger.error(f"Error extracting from file data: {str(e)}") raise