gateway/modules/workflow/managerDocument.py
2025-07-10 16:13:05 +02:00

73 lines
2.9 KiB
Python

"""
Document Manager Module for handling document operations and content extraction.
"""
import logging
from modules.interfaces.interfaceChatModel import (
ChatDocument,
ExtractedContent
)
from modules.workflow.processorDocument import DocumentProcessor
logger = logging.getLogger(__name__)
class DocumentManager:
    """Manager for document operations and content extraction.

    Resolves the raw content, filename and MIME type of a document and hands
    them to a ``DocumentProcessor`` which performs the actual extraction.
    """

    def __init__(self, serviceContainer):
        """Keep the service container and build the processor on top of it.

        Args:
            serviceContainer: Project service container. This class calls
                ``getFileData(fileId)`` on it when a document carries no
                inline data; it is also passed to ``DocumentProcessor``.
        """
        self.service = serviceContainer
        # Create processor with service container for AI calls
        self._processor = DocumentProcessor(serviceContainer)

    def _resolveFileData(self, document):
        """Return the raw content belonging to *document*.

        Non-empty inline ``document.data`` takes precedence (text is encoded
        as UTF-8, anything else is passed through unchanged). Otherwise the
        content is fetched from the service container using
        ``document.fileId``.

        Raises:
            ValueError: If the document has neither inline data nor a fileId.
        """
        # getattr() instead of direct access: a document without a ``data``
        # attribute must fall through to the fileId lookup, not raise
        # AttributeError.
        inlineData = getattr(document, "data", None)
        if inlineData:
            if isinstance(inlineData, str):
                return inlineData.encode("utf-8")
            return inlineData

        fileId = getattr(document, "fileId", None)
        if fileId:
            return self.service.getFileData(fileId)

        logger.error("No file data available in document: %s", document)
        raise ValueError("No file data available in document")

    async def extractContentFromDocument(self, prompt: str, document: "ChatDocument") -> "ExtractedContent":
        """Extract content from a ChatDocument using *prompt*.

        Args:
            prompt: Prompt forwarded to the processor.
            document: Document to extract from. ``data`` or ``fileId`` supplies
                the content; ``filename`` and ``mimeType`` are optional and
                fall back to ``"document"`` / ``"application/octet-stream"``
                when missing, ``None`` or empty.

        Returns:
            The processor's result with ``objectId`` set to ``document.id``
            and ``objectType`` set to ``"ChatDocument"``.

        Raises:
            ValueError: If no file data can be resolved for the document.
            Exception: Any error raised while resolving or processing the
                data is logged and re-raised unchanged.
        """
        try:
            fileData = self._resolveFileData(document)

            # ``or`` also covers attributes that exist but are None/empty,
            # which a plain hasattr() check would let through.
            filename = getattr(document, "filename", None) or "document"
            mimeType = getattr(document, "mimeType", None) or "application/octet-stream"

            # Data resolved above is raw content, hence base64Encoded=False.
            extractedContent = await self._processor.processFileData(
                fileData=fileData,
                filename=filename,
                mimeType=mimeType,
                base64Encoded=False,
                prompt=prompt,
            )

            # Update objectId to match document ID
            extractedContent.objectId = document.id
            extractedContent.objectType = "ChatDocument"
            return extractedContent
        except Exception as e:
            logger.error("Error extracting from document: %s", e)
            raise

    async def extractContentFromFileData(self, prompt: str, fileData: bytes, filename: str, mimeType: str, base64Encoded: bool = False, documentId: str = None) -> "ExtractedContent":
        """Extract content from file data directly using *prompt*.

        All arguments are forwarded unchanged to the processor's
        ``processFileData``.

        Args:
            prompt: Prompt forwarded to the processor.
            fileData: Content to process.
            filename: Name reported to the processor.
            mimeType: MIME type reported to the processor.
            base64Encoded: Forwarded as-is; presumably tells the processor
                that *fileData* is base64 text (confirm against the
                processor).
            documentId: Optional identifier forwarded to the processor; may
                be ``None``.

        Returns:
            Whatever ``processFileData`` returns.

        Raises:
            Exception: Any processor error is logged and re-raised unchanged.
        """
        try:
            return await self._processor.processFileData(
                fileData=fileData,
                filename=filename,
                mimeType=mimeType,
                base64Encoded=base64Encoded,
                prompt=prompt,
                documentId=documentId,
            )
        except Exception as e:
            logger.error("Error extracting from file data: %s", e)
            raise