"""
Document Manager Module for handling document operations and content extraction.
"""

import base64
import logging
from typing import List, Optional, Dict, Any, Union
from pathlib import Path
import uuid

from modules.interfaces.serviceChatModel import (
    ChatDocument,
    TaskDocument,
    ExtractedContent,
    ContentItem,
    ContentMetadata
)
from modules.workflow.serviceContainer import ServiceContainer
from modules.workflow.processorDocument import DocumentProcessor

logger = logging.getLogger(__name__)


class DocumentManager:
    """Manager for document operations and content extraction."""

    def __init__(self, serviceContainer: ServiceContainer):
        """
        Store the service container and create the document processor.

        Args:
            serviceContainer: Container whose ``functions.getFileData`` is used
                to load file content when converting chat documents.
        """
        self.service = serviceContainer
        self._processor = DocumentProcessor()

    async def extractFromChatDocument(self, prompt: str, document: ChatDocument) -> ExtractedContent:
        """
        Extract content from a ChatDocument with AI processing.

        The chat document is first converted to a TaskDocument, which is then
        handed to the processor together with the prompt.

        Args:
            prompt: Prompt for AI content extraction
            document: The ChatDocument to process

        Returns:
            ExtractedContent whose ``objectId``/``objectType`` reference the
            original ChatDocument.

        Raises:
            ValueError: If no file content could be loaded for the document
            Exception: Any error raised by the conversion or the processor
        """
        # Convert ChatDocument to TaskDocument
        taskDoc = await self._convertToTaskDocument(document)

        # Process document using processor
        extractedContent = await self._processor.processDocument(taskDoc, prompt)

        # The conversion assigns a fresh id to the TaskDocument, so point the
        # result back at the original ChatDocument.
        extractedContent.objectId = document.id
        extractedContent.objectType = "ChatDocument"

        return extractedContent

    async def extractFromTaskDocument(self, prompt: str, document: TaskDocument) -> ExtractedContent:
        """
        Extract content directly from a task document.

        Args:
            prompt: The prompt to use for content extraction
            document: The task document to extract content from

        Returns:
            ExtractedContent as returned by the processor

        Raises:
            Exception: Any error raised by the processor; it is logged with
                its traceback and re-raised unchanged.
        """
        try:
            return await self._processor.processDocument(document, prompt)
        except Exception as e:
            # logger.exception keeps the traceback that logger.error would drop.
            logger.exception("Error extracting from task document: %s", e)
            raise

    async def _convertToTaskDocument(self, chatDoc: ChatDocument) -> TaskDocument:
        """
        Convert a ChatDocument to a TaskDocument.

        Loads the file content via the service container, base64-encodes it and
        wraps it in a new TaskDocument with a freshly generated UUID as id.

        Args:
            chatDoc: The chat document to convert

        Returns:
            TaskDocument carrying the chat document's filename, size, MIME type
            and the base64-encoded file content

        Raises:
            ValueError: If the file data lookup returns nothing (an empty
                result is treated the same as a missing one)
            Exception: Any other error from loading or encoding the content;
                it is logged with its traceback and re-raised unchanged.
        """
        try:
            # Get file content
            fileContent = await self.service.functions.getFileData(chatDoc.fileId)
            if not fileContent:
                raise ValueError(f"Could not get content for file {chatDoc.fileId}")

            # Convert to base64
            base64Data = base64.b64encode(fileContent).decode('utf-8')

            return TaskDocument(
                id=str(uuid.uuid4()),
                filename=chatDoc.filename,
                fileSize=chatDoc.fileSize,
                mimeType=chatDoc.mimeType,
                data=base64Data
            )
        except Exception as e:
            # logger.exception keeps the traceback that logger.error would drop.
            logger.exception("Error converting chat document to task document: %s", e)
            raise