73 lines
2.9 KiB
Python
73 lines
2.9 KiB
Python
"""
|
|
Document Manager Module for handling document operations and content extraction.
|
|
"""
|
|
|
|
import logging
from typing import Optional

from modules.interfaces.interfaceChatModel import (
    ChatDocument,
    ExtractedContent
)
from modules.workflow.processorDocument import DocumentProcessor
|
|
|
|
# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)
|
|
|
|
class DocumentManager:
    """Manager for document operations and content extraction.

    Keeps a reference to a service container (used in this class only to
    look up file data by ``fileId``) and owns a ``DocumentProcessor`` to
    which every extraction request is delegated.
    """

    # Fallbacks used when a document carries no usable filename / MIME type.
    _DEFAULT_FILENAME = "document"
    _DEFAULT_MIME_TYPE = "application/octet-stream"

    def __init__(self, serviceContainer):
        """Store the service container and create the document processor.

        Args:
            serviceContainer: Object providing ``getFileData(fileId)``; it is
                consulted when a document has no inline data.
        """
        self.service = serviceContainer
        # Create processor without any dependencies
        self._processor = DocumentProcessor()

    def _resolveFileData(self, document: ChatDocument):
        """Return the file payload for ``document``.

        Inline ``document.data`` takes precedence: a ``str`` is UTF-8
        encoded, any other truthy value is passed through unchanged. When
        there is no inline data, the payload is requested from the service
        container using ``document.fileId``.

        Raises:
            ValueError: If the document has neither inline data nor a
                ``fileId``, or the service container returns ``None``.
        """
        # getattr keeps the lookup of ``data`` as tolerant as that of ``fileId``.
        inlineData = getattr(document, 'data', None)
        if inlineData:
            if isinstance(inlineData, str):
                return inlineData.encode('utf-8')
            return inlineData

        fileId = getattr(document, 'fileId', None)
        if fileId:
            fileData = self.service.getFileData(fileId)
            # Fail here rather than handing ``None`` to the processor.
            if fileData is not None:
                return fileData

        logger.error("No file data available in document: %s", document)
        raise ValueError("No file data available in document")

    async def extractContentFromDocument(self, prompt: str, document: ChatDocument) -> ExtractedContent:
        """Extract content from a ChatDocument using a prompt.

        Args:
            prompt: Prompt forwarded to the processor.
            document: Document whose ``data`` (or ``fileId``) supplies the
                payload; ``filename`` and ``mimeType`` are used when set.

        Returns:
            The processor's result, with ``objectId`` set to ``document.id``
            and ``objectType`` set to ``"ChatDocument"``.

        Raises:
            ValueError: If no file data can be obtained for the document.
            Exception: Any error raised by the processor is logged and
                re-raised unchanged.
        """
        try:
            fileData = self._resolveFileData(document)

            # ``or`` also covers attributes that exist but are None/empty,
            # so the processor always receives a usable name and MIME type.
            filename = getattr(document, 'filename', None) or self._DEFAULT_FILENAME
            mimeType = getattr(document, 'mimeType', None) or self._DEFAULT_MIME_TYPE

            extractedContent = await self._processor.processFileData(
                fileData=fileData,
                filename=filename,
                mimeType=mimeType,
                base64Encoded=False,
                prompt=prompt,
            )

            # Update objectId to match document ID
            extractedContent.objectId = document.id
            extractedContent.objectType = "ChatDocument"
            return extractedContent
        except Exception as e:
            logger.error("Error extracting from document: %s", e)
            raise

    async def extractContentFromFileData(self, prompt: str, fileData: bytes, filename: str, mimeType: str, base64Encoded: bool = False, documentId: Optional[str] = None) -> ExtractedContent:
        """Extract content from file data directly using a prompt.

        All arguments are forwarded unchanged, as keyword arguments, to the
        processor's ``processFileData``.

        Args:
            prompt: Prompt forwarded to the processor.
            fileData: File payload.
            filename: Name reported to the processor.
            mimeType: MIME type reported to the processor.
            base64Encoded: Forwarded as-is to the processor.
            documentId: Optional identifier forwarded to the processor.

        Returns:
            The processor's result, unmodified.

        Raises:
            Exception: Any error raised by the processor is logged and
                re-raised unchanged.
        """
        try:
            return await self._processor.processFileData(
                fileData=fileData,
                filename=filename,
                mimeType=mimeType,
                base64Encoded=base64Encoded,
                prompt=prompt,
                documentId=documentId,
            )
        except Exception as e:
            logger.error("Error extracting from file data: %s", e)
            raise