gateway/modules/workflow/documentManager.py
ValueOn AG f3860723af wip
2025-06-08 03:12:43 +02:00

396 lines
No EOL
14 KiB
Python

"""
Document Manager Module for handling document operations and content extraction.
"""
import logging
from typing import Dict, Any, List, Optional
from datetime import datetime
from modules.interfaces.serviceChatModel import ChatDocument, ChatContent
from modules.workflow.documentProcessor import getDocumentContents
import uuid
import json
import base64
# Module-level logger, named after this module via __name__; used by
# DocumentManager to report warnings and errors.
logger = logging.getLogger(__name__)
class DocumentManager:
    """Manager for document operations and content extraction.

    The manager is a singleton: obtain it through ``getInstance()`` and
    attach a service container with ``initialize()``. Every method that
    reads or writes files, or calls the AI backend, goes through
    ``self.service``, which is ``None`` until ``initialize()`` succeeds.
    """

    _instance = None

    # Content longer than this many characters is processed chunk by chunk.
    _CHUNKING_THRESHOLD = 100000
    # Upper bound (in characters) for a chunk produced by size-based splitting.
    _MAX_CHUNK_SIZE = 10000

    @classmethod
    def getInstance(cls):
        """Return a singleton instance of the document manager."""
        if cls._instance is None:
            cls._instance = cls()
        return cls._instance

    def __init__(self):
        """Initialize the document manager.

        Raises:
            RuntimeError: If the singleton instance already exists.
        """
        if DocumentManager._instance is not None:
            raise RuntimeError("Singleton instance already exists - use getInstance()")
        self.service = None

    def initialize(self, service=None):
        """Initialize or update the manager with service references.

        Args:
            service: Service container that must expose the ``base``,
                ``msft`` and ``google`` attributes. When omitted (or falsy)
                the current service reference is left untouched.

        Returns:
            False if ``service`` lacks a required interface, True otherwise.
        """
        if service:
            # Validate required interfaces
            required_interfaces = ['base', 'msft', 'google']
            missing_interfaces = [
                interface
                for interface in required_interfaces
                if not hasattr(service, interface)
            ]
            if missing_interfaces:
                logger.warning(f"Service container missing required interfaces: {', '.join(missing_interfaces)}")
                return False
            self.service = service
        return True

    async def extractContent(self, fileId: str) -> Optional["ChatDocument"]:
        """Extract content from a file.

        Args:
            fileId: ID of the file to extract content from

        Returns:
            ChatDocument object if successful, None otherwise (including
            when the file content or its metadata is missing or empty).
        """
        try:
            # Get file content
            fileContent = await self.getFileContent(fileId)
            if not fileContent:
                return None

            # Get file metadata
            fileMetadata = await self.getFileMetadata(fileId)
            if not fileMetadata:
                return None

            # Create ChatDocument; undecodable bytes are dropped, not fatal
            return ChatDocument(
                id=str(uuid.uuid4()),
                fileId=fileId,
                filename=fileMetadata.get("name", "Unknown"),
                fileSize=fileMetadata.get("size", 0),
                content=fileContent.decode('utf-8', errors='ignore'),
                mimeType=fileMetadata.get("mimeType", "text/plain"),
            )
        except Exception as e:
            logger.error(f"Error extracting content from file {fileId}: {e}")
            return None

    async def getFileContent(self, fileId: str) -> Optional[bytes]:
        """Get the content of a file.

        Args:
            fileId: ID of the file to read

        Returns:
            Whatever ``service.functions.getFileData`` returns, or None if
            the call raises.
        """
        try:
            return self.service.functions.getFileData(fileId)
        except Exception as e:
            logger.error(f"Error getting file content for {fileId}: {e}")
            return None

    async def getFileMetadata(self, fileId: str) -> Optional[Dict[str, Any]]:
        """Get the metadata of a file.

        Args:
            fileId: ID of the file to look up

        Returns:
            Whatever ``service.functions.getFile`` returns, or None if the
            call raises.
        """
        try:
            return self.service.functions.getFile(fileId)
        except Exception as e:
            logger.error(f"Error getting file metadata for {fileId}: {e}")
            return None

    async def saveFile(self, filename: str, content: bytes, mimeType: str) -> Optional[int]:
        """Save a new file.

        Args:
            filename: Name of the file
            content: File content as bytes
            mimeType: MIME type of the file

        Returns:
            File ID if successful, None otherwise
        """
        try:
            # NOTE(review): this call is awaited and goes through service.base,
            # while the other file operations call service.functions without
            # awaiting - confirm this difference is intended.
            return await self.service.base.saveFile(filename, content, mimeType)
        except Exception as e:
            logger.error(f"Error saving file {filename}: {e}")
            return None

    async def deleteFile(self, fileId: str) -> bool:
        """Delete a file.

        Args:
            fileId: ID of the file to delete

        Returns:
            The result of ``service.functions.deleteFile``, or False if the
            call raises.
        """
        try:
            return self.service.functions.deleteFile(fileId)
        except Exception as e:
            logger.error(f"Error deleting file {fileId}: {e}")
            return False

    async def convertFileRefToId(self, ref: str) -> Optional[int]:
        """Convert agent file reference to file ID.

        Args:
            ref: File reference in format 'filename;id' or just 'id'

        Returns:
            File ID if successful, None otherwise
        """
        try:
            if isinstance(ref, str) and ';' in ref:
                # The ID is the last field; splitting from the right keeps
                # filenames that themselves contain ';' from breaking parsing.
                return int(ref.rsplit(';', 1)[1])
            return int(ref)
        except Exception as e:
            logger.error(f"Error converting file reference to ID: {e}")
            return None

    async def convertFileIdToRef(self, fileId: str) -> Optional[str]:
        """Convert file ID to agent file reference.

        Args:
            fileId: File ID to convert

        Returns:
            File reference in format 'filename;id' if successful, None otherwise
        """
        try:
            file = await self.getFileMetadata(fileId)
            if not file:
                return None
            return f"{file['name']};{fileId}"
        except Exception as e:
            logger.error(f"Error converting file ID to reference: {e}")
            return None

    async def convertDataFormat(self, data: Any, format: str) -> Any:
        """Convert data between different formats.

        Args:
            data: Data to convert
            format: Target format. 'json' parses a string and serializes
                anything else; 'base64' encodes a string (as UTF-8) or a
                bytes-like object. Any other value returns ``data`` as is.

        Returns:
            Converted data, or the unchanged input if the conversion fails
        """
        try:
            if format == 'json':
                if isinstance(data, str):
                    return json.loads(data)
                return json.dumps(data)
            if format == 'base64':
                raw = data.encode('utf-8') if isinstance(data, str) else data
                return base64.b64encode(raw).decode('utf-8')
            return data
        except Exception as e:
            logger.error(f"Error converting data format: {e}")
            return data

    async def createAgentInputFileList(self, files: List[str]) -> List[Dict[str, Any]]:
        """Create a list of input files for agent processing.

        References that cannot be resolved, or whose metadata is missing or
        incomplete, are skipped; the remaining files are still returned.

        Args:
            files: List of file references

        Returns:
            List of file objects with content
        """
        try:
            inputFiles = []
            for file in files:
                entry = await self._buildAgentInputFile(file)
                if entry is not None:
                    inputFiles.append(entry)
            return inputFiles
        except Exception as e:
            logger.error(f"Error creating agent input file list: {e}")
            return []

    async def _buildAgentInputFile(self, file: str) -> Optional[Dict[str, Any]]:
        """Resolve one file reference into an input-file dict, or None if unusable."""
        try:
            fileId = await self.convertFileRefToId(file)
            # Compare against None explicitly so that a parsed ID of 0 is kept.
            if fileId is None:
                return None
            fileData = await self.getFileMetadata(fileId)
            if not fileData:
                return None
            content = await self.getFileContent(fileId)
            return {
                'id': fileId,
                'name': fileData['name'],
                'mimeType': fileData['mimeType'],
                'content': content,
            }
        except Exception as e:
            logger.error(f"Error creating agent input file list: {e}")
            return None

    async def saveAgentOutputFiles(self, files: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """Save output files from agent processing.

        A file that is malformed or fails to save is skipped, so the
        metadata of files that were saved is never discarded because of a
        later failure.

        Args:
            files: List of file objects with 'name', 'content' and an
                optional 'mimeType' (default 'application/octet-stream')

        Returns:
            List of saved file metadata ('id', 'name', 'mimeType')
        """
        try:
            savedFiles = []
            for file in files:
                saved = await self._saveAgentOutputFile(file)
                if saved is not None:
                    savedFiles.append(saved)
            return savedFiles
        except Exception as e:
            logger.error(f"Error saving agent output files: {e}")
            return []

    async def _saveAgentOutputFile(self, file: Dict[str, Any]) -> Optional[Dict[str, Any]]:
        """Save one agent output file and return its metadata, or None on failure."""
        try:
            mimeType = file.get('mimeType', 'application/octet-stream')
            fileId = await self.saveFile(
                filename=file['name'],
                content=file['content'],
                mimeType=mimeType,
            )
            # Truthiness check kept on purpose: the service's failure value
            # is not visible here, so any falsy result is treated as failure.
            if not fileId:
                return None
            return {
                'id': fileId,
                'name': file['name'],
                'mimeType': mimeType,
            }
        except Exception as e:
            logger.error(f"Error saving agent output files: {e}")
            return None

    async def contentWithPrompt(self, document: Dict[str, Any], prompt: str) -> Optional[Dict[str, Any]]:
        """Extract content from a document using AI with a specific prompt.

        Handles large files by processing in chunks and merging results.

        Args:
            document: Document object with file information; its 'id' entry
                is passed to ``extractContent``
            prompt: Specific prompt for content extraction

        Returns:
            Dictionary with extracted content and metadata, or None if the
            document could not be read or processing failed
        """
        try:
            # First get the document content
            # NOTE(review): extractContent expects a file ID - confirm that
            # document['id'] holds the file ID for the callers of this method.
            chat_doc = await self.extractContent(document.get('id'))
            if not chat_doc:
                return None

            # Prepare the content for AI processing
            content = chat_doc.content
            mime_type = chat_doc.mimeType

            if len(content) > self._CHUNKING_THRESHOLD:
                # For large files, process in chunks
                chunks = self._splitContentIntoChunks(content, mime_type)
                extracted_chunks = []
                for chunk in chunks:
                    # Process each chunk with AI; failed chunks are skipped
                    chunk_result = await self._processContentChunk(chunk, prompt)
                    if chunk_result:
                        extracted_chunks.append(chunk_result)
                extracted = self._mergeChunkResults(extracted_chunks)
                chunks_processed = len(chunks)
            else:
                # Process single chunk
                extracted = await self._processContentChunk(content, prompt)
                chunks_processed = 1

            return {
                "content": extracted,
                "metadata": {
                    "original_size": len(content),
                    "chunks_processed": chunks_processed,
                    "mime_type": mime_type,
                },
            }
        except Exception as e:
            logger.error(f"Error in contentWithPrompt: {e}")
            return None

    def _splitBySize(self, text: str) -> List[str]:
        """Cut text into consecutive pieces of at most _MAX_CHUNK_SIZE characters."""
        size = self._MAX_CHUNK_SIZE
        return [text[start:start + size] for start in range(0, len(text), size)]

    def _splitContentIntoChunks(self, content: str, mime_type: str) -> List[str]:
        """Split content into manageable chunks based on mime type.

        * ``text/*``: split on blank lines; a paragraph longer than the
          maximum chunk size is further cut by size so that text without
          blank lines is still chunked.
        * ``application/json``: one chunk per element when the document is a
          list, otherwise the whole content as a single chunk.
        * anything else: fixed-size slices.

        Args:
            content: Content to split
            mime_type: MIME type of the content

        Returns:
            List of content chunks; ``[content]`` if splitting fails
        """
        try:
            if mime_type.startswith('text/'):
                chunks = []
                for paragraph in content.split('\n\n'):
                    paragraph = paragraph.strip()
                    if paragraph:
                        chunks.extend(self._splitBySize(paragraph))
                return chunks
            if mime_type == 'application/json':
                data = json.loads(content)
                if isinstance(data, list):
                    return [json.dumps(item) for item in data]
                return [content]
            # Default chunking
            return self._splitBySize(content)
        except Exception as e:
            logger.error(f"Error splitting content: {e}")
            return [content]

    async def _processContentChunk(self, chunk: str, prompt: str) -> Optional[str]:
        """Process a single content chunk with AI.

        Args:
            chunk: Content chunk to process
            prompt: Extraction prompt

        Returns:
            The AI response with surrounding whitespace removed, or None if
            the call fails
        """
        try:
            # Create AI prompt
            ai_prompt = (
                "\n"
                "Extract relevant information from this content based on the following prompt:\n"
                f"PROMPT: {prompt}\n"
                "CONTENT:\n"
                f"{chunk}\n"
                "Return ONLY the extracted information in a clear, concise format.\n"
            )

            # Get AI response
            response = await self.service.base.callAi([
                {"role": "system", "content": "You are an expert at extracting relevant information from documents."},
                {"role": "user", "content": ai_prompt},
            ])
            return response.strip()
        except Exception as e:
            logger.error(f"Error processing content chunk: {e}")
            return None

    def _mergeChunkResults(self, chunks: List[str]) -> str:
        """Merge processed content chunks into a single result.

        Empty or whitespace-only chunks are dropped and exact duplicates are
        removed, keeping the first occurrence and the original order.

        Args:
            chunks: List of processed chunks

        Returns:
            Remaining chunks joined by blank lines; "" if merging fails
        """
        try:
            # dict.fromkeys de-duplicates while preserving insertion order
            unique_chunks = dict.fromkeys(
                chunk for chunk in chunks if chunk and chunk.strip()
            )
            return "\n\n".join(unique_chunks)
        except Exception as e:
            logger.error(f"Error merging chunk results: {e}")
            return ""
# Module-level accessor for the shared document manager
def getDocumentManager():
    """Return the DocumentManager singleton obtained from getInstance()."""
    manager = DocumentManager.getInstance()
    return manager