396 lines
No EOL
14 KiB
Python
396 lines
No EOL
14 KiB
Python
"""
|
|
Document Manager Module for handling document operations and content extraction.
|
|
"""
|
|
|
|
import logging
|
|
from typing import Dict, Any, List, Optional
|
|
from datetime import datetime
|
|
from modules.interfaces.serviceChatModel import ChatDocument, ChatContent
|
|
from modules.workflow.documentProcessor import getDocumentContents
|
|
import uuid
|
|
import json
|
|
import base64
|
|
|
|
# Module-level logger following the standard per-module logging convention.
logger = logging.getLogger(__name__)
|
|
|
|
class DocumentManager:
    """Singleton manager for document operations and content extraction.

    Wraps the service container's file APIs (get/save/delete) and adds
    higher-level helpers for agent file references, data-format conversion
    and AI-assisted content extraction. Obtain the shared instance via
    ``getInstance()``.
    """

    # Class-level slot holding the single shared instance.
    _instance = None

    @classmethod
    def getInstance(cls):
        """Return a singleton instance of the document manager, creating it lazily."""
        if cls._instance is None:
            cls._instance = cls()
        return cls._instance

    def __init__(self):
        """Initialize the document manager.

        Raises:
            RuntimeError: If the singleton already exists; use getInstance().
        """
        if DocumentManager._instance is not None:
            raise RuntimeError("Singleton instance already exists - use getInstance()")

        # Service container providing file/AI operations; set via initialize().
        self.service = None

    def initialize(self, service=None):
        """Initialize or update the manager with service references.

        Args:
            service: Service container expected to expose the 'base', 'msft'
                and 'google' interfaces. When None, the current service is kept.

        Returns:
            bool: False when a supplied container is missing a required
            interface (the container is rejected); True otherwise.
        """
        if service:
            # Reject containers that do not expose every required interface.
            required_interfaces = ['base', 'msft', 'google']
            missing_interfaces = [
                interface for interface in required_interfaces
                if not hasattr(service, interface)
            ]

            if missing_interfaces:
                logger.warning(f"Service container missing required interfaces: {', '.join(missing_interfaces)}")
                return False

            self.service = service
        return True

    async def extractContent(self, fileId: str) -> Optional[ChatDocument]:
        """
        Extract content from a file.

        Args:
            fileId: ID of the file to extract content from

        Returns:
            ChatDocument object if successful, None otherwise
        """
        try:
            # Get file content
            fileContent = await self.getFileContent(fileId)
            if not fileContent:
                return None

            # Get file metadata
            fileMetadata = await self.getFileMetadata(fileId)
            if not fileMetadata:
                return None

            # Create ChatDocument; decode is lossy — undecodable bytes are
            # dropped rather than raising.
            return ChatDocument(
                id=str(uuid.uuid4()),
                fileId=fileId,
                filename=fileMetadata.get("name", "Unknown"),
                fileSize=fileMetadata.get("size", 0),
                content=fileContent.decode('utf-8', errors='ignore'),
                mimeType=fileMetadata.get("mimeType", "text/plain")
            )
        except Exception as e:
            logger.error(f"Error extracting content from file {fileId}: {str(e)}")
            return None

    async def getFileContent(self, fileId: str) -> Optional[bytes]:
        """Gets the content of a file; None on failure.

        NOTE(review): the underlying call is not awaited — presumably
        service.functions.getFileData is synchronous; confirm.
        """
        try:
            return self.service.functions.getFileData(fileId)
        except Exception as e:
            logger.error(f"Error getting file content for {fileId}: {str(e)}")
            return None

    async def getFileMetadata(self, fileId: str) -> Optional[Dict[str, Any]]:
        """Gets the metadata of a file; None on failure.

        NOTE(review): the underlying call is not awaited — presumably
        service.functions.getFile is synchronous; confirm.
        """
        try:
            return self.service.functions.getFile(fileId)
        except Exception as e:
            logger.error(f"Error getting file metadata for {fileId}: {str(e)}")
            return None

    async def saveFile(self, filename: str, content: bytes, mimeType: str) -> Optional[int]:
        """
        Save a new file.

        Args:
            filename: Name of the file
            content: File content as bytes
            mimeType: MIME type of the file

        Returns:
            File ID if successful, None otherwise
        """
        try:
            return await self.service.base.saveFile(filename, content, mimeType)
        except Exception as e:
            # Fix: log the actual filename instead of the "(unknown)" placeholder.
            logger.error(f"Error saving file {filename}: {str(e)}")
            return None

    async def deleteFile(self, fileId: str) -> bool:
        """Deletes a file; returns False on failure.

        NOTE(review): the underlying call is not awaited — presumably
        service.functions.deleteFile is synchronous; confirm.
        """
        try:
            return self.service.functions.deleteFile(fileId)
        except Exception as e:
            logger.error(f"Error deleting file {fileId}: {str(e)}")
            return False

    async def convertFileRefToId(self, ref: str) -> Optional[int]:
        """
        Convert agent file reference to file ID.

        Args:
            ref: File reference in format 'filename;id' or just 'id'

        Returns:
            File ID if successful, None otherwise
        """
        try:
            # 'filename;id' references carry the numeric ID after the first ';'.
            if isinstance(ref, str) and ';' in ref:
                return int(ref.split(';')[1])
            return int(ref)
        except Exception as e:
            logger.error(f"Error converting file reference to ID: {str(e)}")
            return None

    async def convertFileIdToRef(self, fileId: str) -> Optional[str]:
        """
        Convert file ID to agent file reference.

        Args:
            fileId: File ID to convert

        Returns:
            File reference in format 'filename;id' if successful, None otherwise
        """
        try:
            file = await self.getFileMetadata(fileId)
            if not file:
                return None
            return f"{file['name']};{fileId}"
        except Exception as e:
            logger.error(f"Error converting file ID to reference: {str(e)}")
            return None

    async def convertDataFormat(self, data: Any, format: str) -> Any:
        """
        Convert data between different formats.

        Args:
            data: Data to convert
            format: Target format ('json', 'base64', etc.)

        Returns:
            Converted data; the input is returned unchanged for unknown
            formats or on conversion error.
        """
        try:
            if format == 'json':
                # str -> parsed object; anything else -> JSON string.
                if isinstance(data, str):
                    return json.loads(data)
                return json.dumps(data)
            elif format == 'base64':
                # str is encoded to UTF-8 first; bytes-like passed straight through.
                if isinstance(data, str):
                    return base64.b64encode(data.encode('utf-8')).decode('utf-8')
                return base64.b64encode(data).decode('utf-8')
            return data
        except Exception as e:
            logger.error(f"Error converting data format: {str(e)}")
            return data

    async def createAgentInputFileList(self, files: List[str]) -> List[Dict[str, Any]]:
        """
        Create a list of input files for agent processing.

        Args:
            files: List of file references ('filename;id' or 'id')

        Returns:
            List of file objects with 'id', 'name', 'mimeType' and 'content'
            keys; unresolvable references are skipped. Empty list on error.
        """
        try:
            inputFiles = []
            for file in files:
                fileId = await self.convertFileRefToId(file)
                # Fix: 0 may be a valid file ID; only skip unresolved (None) refs.
                if fileId is not None:
                    fileData = await self.getFileMetadata(fileId)
                    if fileData:
                        content = await self.getFileContent(fileId)
                        inputFiles.append({
                            'id': fileId,
                            'name': fileData['name'],
                            'mimeType': fileData['mimeType'],
                            'content': content
                        })
            return inputFiles
        except Exception as e:
            logger.error(f"Error creating agent input file list: {str(e)}")
            return []

    async def saveAgentOutputFiles(self, files: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """
        Save output files from agent processing.

        Args:
            files: List of file objects with content

        Returns:
            List of saved file metadata; files that failed to save are skipped.
        """
        try:
            savedFiles = []
            for file in files:
                # saveFile returns the new file's ID (not full metadata);
                # renamed from the misleading 'fileMeta'.
                savedFileId = await self.saveFile(
                    filename=file['name'],
                    content=file['content'],
                    mimeType=file.get('mimeType', 'application/octet-stream')
                )

                # Fix: 0 may be a valid file ID; only skip failed (None) saves.
                if savedFileId is not None:
                    savedFiles.append({
                        'id': savedFileId,
                        'name': file['name'],
                        'mimeType': file.get('mimeType', 'application/octet-stream')
                    })
            return savedFiles
        except Exception as e:
            logger.error(f"Error saving agent output files: {str(e)}")
            return []

    async def contentWithPrompt(self, document: Dict[str, Any], prompt: str) -> Optional[Dict[str, Any]]:
        """
        Extract content from a document using AI with a specific prompt.
        Handles large files by processing in chunks and merging results.

        Args:
            document: Document object with file information (must carry 'id')
            prompt: Specific prompt for content extraction

        Returns:
            Dictionary with extracted content and metadata, None on failure
        """
        try:
            # First get the document content
            chat_doc = await self.extractContent(document.get('id'))
            if not chat_doc:
                return None

            # Prepare the content for AI processing
            content = chat_doc.content
            mime_type = chat_doc.mimeType

            # For large files, process in chunks
            if len(content) > 100000:  # Arbitrary threshold, adjust as needed
                chunks = self._splitContentIntoChunks(content, mime_type)
                extracted_chunks = []

                for chunk in chunks:
                    # Process each chunk with AI; failed chunks are dropped.
                    chunk_result = await self._processContentChunk(chunk, prompt)
                    if chunk_result:
                        extracted_chunks.append(chunk_result)

                # Merge results
                return {
                    "content": self._mergeChunkResults(extracted_chunks),
                    "metadata": {
                        "original_size": len(content),
                        "chunks_processed": len(chunks),
                        "mime_type": mime_type
                    }
                }
            else:
                # Process single chunk
                result = await self._processContentChunk(content, prompt)
                return {
                    "content": result,
                    "metadata": {
                        "original_size": len(content),
                        "chunks_processed": 1,
                        "mime_type": mime_type
                    }
                }

        except Exception as e:
            logger.error(f"Error in contentWithPrompt: {str(e)}")
            return None

    def _splitContentIntoChunks(self, content: str, mime_type: str) -> List[str]:
        """
        Split content into manageable chunks based on mime type.

        Args:
            content: Content to split
            mime_type: MIME type of the content

        Returns:
            List of content chunks; [content] as fallback on error
        """
        try:
            if mime_type.startswith('text/'):
                # Split text content by paragraphs (blank-line separated).
                return [chunk.strip() for chunk in content.split('\n\n') if chunk.strip()]
            elif mime_type == 'application/json':
                # Split a JSON array into one chunk per element.
                data = json.loads(content)
                if isinstance(data, list):
                    return [json.dumps(item) for item in data]
                return [content]
            else:
                # Default: fixed-size 10k-character windows.
                return [content[i:i+10000] for i in range(0, len(content), 10000)]
        except Exception as e:
            logger.error(f"Error splitting content: {str(e)}")
            return [content]

    async def _processContentChunk(self, chunk: str, prompt: str) -> Optional[str]:
        """
        Process a single content chunk with AI.

        Args:
            chunk: Content chunk to process
            prompt: Extraction prompt

        Returns:
            Processed content, None on failure
        """
        try:
            # Create AI prompt
            ai_prompt = f"""
            Extract relevant information from this content based on the following prompt:

            PROMPT: {prompt}

            CONTENT:
            {chunk}

            Return ONLY the extracted information in a clear, concise format.
            """

            # Get AI response
            response = await self.service.base.callAi([
                {"role": "system", "content": "You are an expert at extracting relevant information from documents."},
                {"role": "user", "content": ai_prompt}
            ])

            return response.strip()

        except Exception as e:
            logger.error(f"Error processing content chunk: {str(e)}")
            return None

    def _mergeChunkResults(self, chunks: List[str]) -> str:
        """
        Merge processed content chunks into a single result.

        Args:
            chunks: List of processed chunks

        Returns:
            Merged content ('' on error)
        """
        try:
            # Remove duplicates and empty chunks
            chunks = [chunk for chunk in chunks if chunk and chunk.strip()]

            # Merge chunks with appropriate spacing
            return "\n\n".join(chunks)

        except Exception as e:
            logger.error(f"Error merging chunk results: {str(e)}")
            return ""
# Singleton factory for the document manager
def getDocumentManager():
    """Return the shared DocumentManager singleton (module-level accessor)."""
    return DocumentManager.getInstance()