gateway/modules/workflow/documentService.py
2025-06-10 18:19:33 +02:00

106 lines
3.5 KiB
Python

"""
Document Manager Module for handling document operations and content extraction.
"""
import base64
import logging
from typing import List, Optional, Dict, Any, Union
from pathlib import Path
import uuid
from modules.interfaces.serviceChatModel import (
ChatDocument,
TaskDocument,
ExtractedContent,
ContentItem,
ContentMetadata
)
from modules.workflow.serviceContainer import ServiceContainer
from modules.workflow.processorDocument import DocumentProcessor
# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)
class DocumentManager:
    """
    Coordinates document conversion and content extraction.

    Keeps a reference to the service container (used to load file data) and
    owns a DocumentProcessor instance to which every extraction is delegated.
    """

    def __init__(self, serviceContainer: ServiceContainer):
        """
        Store the service container and create the document processor.

        Args:
            serviceContainer: Container whose ``functions.getFileData`` is
                called when a ChatDocument has to be converted
        """
        self.service = serviceContainer
        self._processor = DocumentProcessor()

    async def extractFromChatDocument(self, prompt: str, document: ChatDocument) -> ExtractedContent:
        """
        Extract content from a ChatDocument.

        The chat document is first converted into a TaskDocument, which is
        then handed to the processor together with the prompt.

        Args:
            prompt: Prompt forwarded to the processor
            document: The ChatDocument to process

        Returns:
            The processor's ExtractedContent, with objectId / objectType
            rewritten to refer to the given ChatDocument
        """
        convertedDoc = await self._convertToTaskDocument(document)
        result = await self._processor.processDocument(convertedDoc, prompt)

        # The converted TaskDocument carries a freshly generated id, so the
        # result is pointed back at the ChatDocument the caller supplied.
        result.objectId = document.id
        result.objectType = "ChatDocument"
        return result

    async def extractFromTaskDocument(self, prompt: str, document: TaskDocument) -> ExtractedContent:
        """
        Extract content directly from a TaskDocument.

        Args:
            prompt: Prompt forwarded to the processor
            document: The TaskDocument to process

        Returns:
            The ExtractedContent produced by the processor

        Raises:
            Exception: Any error raised by the processor is logged and
                re-raised unchanged
        """
        try:
            result = await self._processor.processDocument(document, prompt)
        except Exception as exc:
            logger.error(f"Error extracting from task document: {str(exc)}")
            raise
        else:
            return result

    async def _convertToTaskDocument(self, chatDoc: ChatDocument) -> TaskDocument:
        """
        Build a TaskDocument from a ChatDocument.

        The file content is loaded through the service container using
        ``chatDoc.fileId`` and stored base64-encoded in the new document;
        filename, fileSize and mimeType are copied from the chat document.

        Args:
            chatDoc: The ChatDocument to convert

        Returns:
            A new TaskDocument with a freshly generated UUID as its id

        Raises:
            ValueError: If the service returns no (or empty) content for the file
            Exception: Any other error is logged and re-raised unchanged
        """
        try:
            rawBytes = await self.service.functions.getFileData(chatDoc.fileId)
            if not rawBytes:
                # Raised inside the try on purpose so that it is logged below.
                raise ValueError(f"Could not get content for file {chatDoc.fileId}")

            encodedPayload = base64.b64encode(rawBytes).decode("utf-8")
            newId = str(uuid.uuid4())
            return TaskDocument(
                id=newId,
                filename=chatDoc.filename,
                fileSize=chatDoc.fileSize,
                mimeType=chatDoc.mimeType,
                data=encodedPayload,
            )
        except Exception as exc:
            logger.error(f"Error converting chat document to task document: {str(exc)}")
            raise