106 lines
3.5 KiB
Python
106 lines
3.5 KiB
Python
"""
Document Manager Module for handling document operations and content extraction.
"""
|
|
|
|
import base64
|
|
import logging
|
|
from typing import List, Optional, Dict, Any, Union
|
|
from pathlib import Path
|
|
import uuid
|
|
|
|
from modules.interfaces.serviceChatModel import (
|
|
ChatDocument,
|
|
TaskDocument,
|
|
ExtractedContent,
|
|
ContentItem,
|
|
ContentMetadata
|
|
)
|
|
from modules.workflow.serviceContainer import ServiceContainer
|
|
from modules.workflow.processorDocument import DocumentProcessor
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
class DocumentManager:
    """Manager for document operations and content extraction.

    Owns a ``DocumentProcessor`` and uses the injected service container to
    load file content when a ``ChatDocument`` must be converted into a
    ``TaskDocument`` before processing.
    """

    def __init__(self, serviceContainer: ServiceContainer):
        """
        Create a manager bound to a service container.

        Args:
            serviceContainer: Container whose ``functions.getFileData`` is
                awaited to load file content for chat documents.
        """
        self.service = serviceContainer
        self._processor = DocumentProcessor()

    async def extractFromChatDocument(self, prompt: str, document: ChatDocument) -> ExtractedContent:
        """
        Extract content from a ChatDocument via the document processor.

        The chat document is first converted to a TaskDocument (loading and
        base64-encoding its file content), then handed to the processor.

        Args:
            prompt: Prompt forwarded to the document processor.
            document: The ChatDocument to process.

        Returns:
            The processor's ExtractedContent, with ``objectId`` and
            ``objectType`` rewritten to reference the original ChatDocument.

        Raises:
            ValueError: If no file content could be obtained for the document.
            Exception: Any other error from loading or processing is logged
                and re-raised unchanged.
        """
        # Conversion failures are logged inside the helper, so it stays
        # outside the try block to avoid logging the same error twice.
        taskDoc = await self._convertToTaskDocument(document)

        try:
            extractedContent = await self._processor.processDocument(taskDoc, prompt)
        except Exception as e:
            # Mirror extractFromTaskDocument: log with traceback, then propagate.
            logger.exception("Error extracting from chat document: %s", e)
            raise

        # The TaskDocument handed to the processor carries a freshly generated
        # id, so point the result back at the original ChatDocument.
        extractedContent.objectId = document.id
        extractedContent.objectType = "ChatDocument"

        return extractedContent

    async def extractFromTaskDocument(self, prompt: str, document: TaskDocument) -> ExtractedContent:
        """
        Extract content directly from a task document.

        Args:
            prompt: The prompt to use for content extraction.
            document: The task document to extract content from.

        Returns:
            ExtractedContent exactly as returned by the document processor.

        Raises:
            Exception: Any error raised by the processor is logged and
                re-raised unchanged.
        """
        try:
            return await self._processor.processDocument(document, prompt)
        except Exception as e:
            # logger.exception keeps the traceback; %s defers formatting.
            logger.exception("Error extracting from task document: %s", e)
            raise

    async def _convertToTaskDocument(self, chatDoc: ChatDocument) -> TaskDocument:
        """
        Convert a ChatDocument to a TaskDocument.

        Loads the file content referenced by ``chatDoc.fileId`` through the
        service container and embeds it base64-encoded in a new TaskDocument
        that copies the chat document's filename, size and MIME type.

        Args:
            chatDoc: The chat document to convert.

        Returns:
            TaskDocument with a newly generated UUID as its id.

        Raises:
            ValueError: If the service returns no (or empty) content.
            Exception: Any other error (e.g. from the service call or the
                encoding step) is logged and re-raised unchanged.
        """
        try:
            fileContent = await self.service.functions.getFileData(chatDoc.fileId)
            # NOTE(review): a falsy result covers both None and zero-byte
            # content, so empty files are rejected as well — confirm intended.
            if not fileContent:
                raise ValueError(f"Could not get content for file {chatDoc.fileId}")

            # b64encode returns bytes; decode so the payload is a plain str.
            base64Data = base64.b64encode(fileContent).decode('utf-8')

            return TaskDocument(
                id=str(uuid.uuid4()),
                filename=chatDoc.filename,
                fileSize=chatDoc.fileSize,
                mimeType=chatDoc.mimeType,
                data=base64Data,
            )
        except Exception as e:
            logger.exception("Error converting chat document to task document: %s", e)
            raise
|