gateway/modules/workflow/managerDocument.py
2025-06-10 01:25:32 +02:00

478 lines
No EOL
18 KiB
Python

from typing import Dict, Any, Optional, List
import logging
import json
import os
from datetime import datetime, UTC
from pathlib import Path
import mimetypes
import hashlib
import shutil
import uuid
import base64
from modules.workflow.processorDocument import DocumentProcessor
from modules.shared.configuration import APP_CONFIG
from modules.interfaces.serviceChatModel import ChatDocument, ChatContent
# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)
class DocumentManager:
"""Document manager with enhanced operations and file handling"""
_instance = None
@classmethod
def getInstance(cls):
"""Return a singleton instance of the document manager."""
if cls._instance is None:
cls._instance = cls()
return cls._instance
def __init__(self):
"""Initialize document manager"""
if DocumentManager._instance is not None:
raise RuntimeError("Singleton instance already exists - use getInstance()")
self.processor = DocumentProcessor()
self.document_cache = {}
self.temp_dir = Path(APP_CONFIG.get('temp_dir', 'temp'))
self.output_dir = Path(APP_CONFIG.get('output_dir', 'output'))
self.service = None
async def initialize(self, context: Dict[str, Any], service=None) -> None:
"""Initialize document manager with context and service"""
# Initialize processor
self.processor.initialize(context)
# Initialize service container
if service:
# Validate required interfaces
required_interfaces = ['base', 'msft', 'google']
missing_interfaces = []
for interface in required_interfaces:
if not hasattr(service, interface):
missing_interfaces.append(interface)
if missing_interfaces:
logger.warning(f"Service container missing required interfaces: {', '.join(missing_interfaces)}")
return False
self.service = service
# Create directories if they don't exist
self.temp_dir.mkdir(parents=True, exist_ok=True)
self.output_dir.mkdir(parents=True, exist_ok=True)
# Clear temporary directory
self._clear_temp_directory()
def _clear_temp_directory(self) -> None:
"""Clear temporary directory"""
try:
if self.temp_dir.exists():
shutil.rmtree(self.temp_dir)
self.temp_dir.mkdir(parents=True)
except Exception as e:
logger.error(f"Error clearing temp directory: {str(e)}")
async def process_document(self, document: Dict[str, Any], context: Dict[str, Any]) -> Dict[str, Any]:
"""Process a document with context"""
try:
# Generate document ID if not present
if 'id' not in document:
document['id'] = self._generate_document_id(document)
# Process document content
processed = await self.processor.process_with_context(document, context)
# Add metadata
processed['metadata'] = {
'processedAt': datetime.now(UTC).isoformat(),
'processor': 'DocumentManager',
'version': '1.0'
}
# Cache document
self.document_cache[document['id']] = processed
return processed
except Exception as e:
logger.error(f"Error processing document: {str(e)}")
return {
'id': document.get('id', ''),
'error': str(e),
'status': 'error'
}
async def extract_content(self, file_id: str) -> Optional[ChatDocument]:
"""Extract content from a file"""
try:
# Get file content
file_content = await self.get_file_content(file_id)
if not file_content:
return None
# Get file metadata
file_metadata = await self.get_file_metadata(file_id)
if not file_metadata:
return None
# Create ChatDocument
return ChatDocument(
id=str(uuid.uuid4()),
fileId=file_id,
filename=file_metadata.get("name", "Unknown"),
fileSize=file_metadata.get("size", 0),
content=file_content.decode('utf-8', errors='ignore'),
mimeType=file_metadata.get("mimeType", "text/plain")
)
except Exception as e:
logger.error(f"Error extracting content from file {file_id}: {str(e)}")
return None
async def get_file_content(self, file_id: str) -> Optional[bytes]:
"""Get file content"""
try:
if not self.service or not self.service.functions:
logger.error("Service or functions not initialized")
return None
return self.service.functions.getFileData(file_id)
except Exception as e:
logger.error(f"Error getting file content for {file_id}: {str(e)}")
return None
async def get_file_metadata(self, file_id: str) -> Optional[Dict[str, Any]]:
"""Get file metadata"""
try:
if not self.service or not self.service.functions:
logger.error("Service or functions not initialized")
return None
return self.service.functions.getFile(file_id)
except Exception as e:
logger.error(f"Error getting file metadata for {file_id}: {str(e)}")
return None
async def save_file(self, filename: str, content: bytes, mime_type: str) -> Optional[int]:
"""Save a new file"""
try:
if not self.service or not self.service.base:
logger.error("Service or base interface not initialized")
return None
return await self.service.base.saveFile(filename, content, mime_type)
except Exception as e:
logger.error(f"Error saving file {filename}: {str(e)}")
return None
async def delete_file(self, file_id: str) -> bool:
"""Delete a file"""
try:
if not self.service or not self.service.functions:
logger.error("Service or functions not initialized")
return False
return self.service.functions.deleteFile(file_id)
except Exception as e:
logger.error(f"Error deleting file {file_id}: {str(e)}")
return False
def convert_file_ref_to_id(self, ref: str) -> Optional[int]:
"""Convert file reference to ID"""
try:
if isinstance(ref, str) and ';' in ref:
return int(ref.split(';')[1])
return int(ref)
except Exception as e:
logger.error(f"Error converting file reference to ID: {str(e)}")
return None
def convert_file_id_to_ref(self, file_id: str) -> Optional[str]:
"""Convert file ID to reference"""
try:
if not self.service or not self.service.functions:
logger.error("Service or functions not initialized")
return None
file = self.service.functions.getFile(file_id)
if not file:
return None
return f"{file.filename};{file_id}"
except Exception as e:
logger.error(f"Error converting file ID to reference: {str(e)}")
return None
async def convert_data_format(self, data: Any, format: str) -> Any:
"""Convert data between formats"""
try:
if format == 'json':
if isinstance(data, str):
return json.loads(data)
return json.dumps(data)
elif format == 'base64':
if isinstance(data, str):
return base64.b64encode(data.encode('utf-8')).decode('utf-8')
return base64.b64encode(data).decode('utf-8')
return data
except Exception as e:
logger.error(f"Error converting data format: {str(e)}")
return data
async def create_agent_input_file_list(self, files: List[str]) -> List[Dict[str, Any]]:
"""Create list of input files for agent processing"""
try:
input_files = []
for file in files:
file_id = await self.convert_file_ref_to_id(file)
if file_id:
file_data = await self.get_file_metadata(file_id)
if file_data:
content = await self.get_file_content(file_id)
input_files.append({
'id': file_id,
'name': file_data['name'],
'mimeType': file_data['mimeType'],
'content': content
})
return input_files
except Exception as e:
logger.error(f"Error creating agent input file list: {str(e)}")
return []
async def save_agent_output_files(self, files: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
"""Save output files from agent processing"""
try:
saved_files = []
for file in files:
file_meta = await self.save_file(
filename=file['name'],
content=file['content'],
mimeType=file.get('mimeType', 'application/octet-stream')
)
if file_meta:
saved_files.append({
'id': file_meta,
'name': file['name'],
'mimeType': file.get('mimeType', 'application/octet-stream')
})
return saved_files
except Exception as e:
logger.error(f"Error saving agent output files: {str(e)}")
return []
async def content_with_prompt(self, document: Dict[str, Any], prompt: str) -> Optional[Dict[str, Any]]:
"""Extract content using AI with specific prompt"""
try:
# Get document content
chat_doc = await self.extract_content(document.get('id'))
if not chat_doc:
return None
# Prepare content
content = chat_doc.content
mime_type = chat_doc.mimeType
# Process large files in chunks
if len(content) > 100000:
chunks = self._split_content_into_chunks(content, mime_type)
extracted_chunks = []
for chunk in chunks:
chunk_result = await self._process_content_chunk(chunk, prompt)
if chunk_result:
extracted_chunks.append(chunk_result)
return {
"content": self._merge_chunk_results(extracted_chunks),
"metadata": {
"original_size": len(content),
"chunks_processed": len(chunks),
"mime_type": mime_type
}
}
else:
result = await self._process_content_chunk(content, prompt)
return {
"content": result,
"metadata": {
"original_size": len(content),
"chunks_processed": 1,
"mime_type": mime_type
}
}
except Exception as e:
logger.error(f"Error in content_with_prompt: {str(e)}")
return None
def _split_content_into_chunks(self, content: str, mime_type: str) -> List[str]:
"""Split content into manageable chunks"""
try:
if mime_type.startswith('text/'):
return [chunk.strip() for chunk in content.split('\n\n') if chunk.strip()]
elif mime_type == 'application/json':
data = json.loads(content)
if isinstance(data, list):
return [json.dumps(item) for item in data]
return [content]
else:
return [content[i:i+10000] for i in range(0, len(content), 10000)]
except Exception as e:
logger.error(f"Error splitting content: {str(e)}")
return [content]
async def _process_content_chunk(self, chunk: str, prompt: str) -> Optional[str]:
"""Process content chunk with AI"""
try:
if not self.service or not self.service.base:
logger.error("Service or base interface not initialized")
return None
ai_prompt = f"""
Extract relevant information from this content based on the following prompt:
PROMPT: {prompt}
CONTENT:
{chunk}
Return ONLY the extracted information in a clear, concise format.
"""
response = await self.service.base.callAi([
{"role": "system", "content": "You are an expert at extracting relevant information from documents."},
{"role": "user", "content": ai_prompt}
])
return response.strip()
except Exception as e:
logger.error(f"Error processing content chunk: {str(e)}")
return None
def _merge_chunk_results(self, chunks: List[str]) -> str:
"""Merge processed content chunks"""
try:
chunks = [chunk for chunk in chunks if chunk and chunk.strip()]
return "\n\n".join(chunks)
except Exception as e:
logger.error(f"Error merging chunk results: {str(e)}")
return ""
async def save_document(self, document: Dict[str, Any], format: str = 'json') -> str:
"""Save document to output directory"""
try:
filename = f"{document['id']}.{format}"
filepath = self.output_dir / filename
if format == 'json':
with open(filepath, 'w', encoding='utf-8') as f:
json.dump(document, f, indent=2)
else:
content = document.get('content', '')
if isinstance(content, str):
with open(filepath, 'w', encoding='utf-8') as f:
f.write(content)
else:
with open(filepath, 'wb') as f:
f.write(content)
return str(filepath)
except Exception as e:
logger.error(f"Error saving document: {str(e)}")
raise
async def load_document(self, filepath: str) -> Dict[str, Any]:
"""Load document from file"""
try:
path = Path(filepath)
if not path.exists():
raise FileNotFoundError(f"Document not found: {filepath}")
format = path.suffix[1:].lower()
if format == 'json':
with open(path, 'r', encoding='utf-8') as f:
document = json.load(f)
else:
mime_type = mimetypes.guess_type(filepath)[0]
if mime_type and mime_type.startswith('text/'):
with open(path, 'r', encoding='utf-8') as f:
content = f.read()
else:
with open(path, 'rb') as f:
content = f.read()
document = {
'id': path.stem,
'content': content,
'format': format,
'mime_type': mime_type
}
document['metadata'] = {
'loadedAt': datetime.now(UTC).isoformat(),
'filepath': str(path),
'size': path.stat().st_size
}
return document
except Exception as e:
logger.error(f"Error loading document: {str(e)}")
raise
async def convert_document(self, document: Dict[str, Any], target_format: str) -> Dict[str, Any]:
"""Convert document to target format"""
try:
current_format = document.get('format', 'json')
if current_format == 'json' and target_format == 'text':
content = json.dumps(document, indent=2)
return {
'id': document['id'],
'content': content,
'format': 'text',
'mime_type': 'text/plain'
}
elif current_format == 'text' and target_format == 'json':
try:
content = json.loads(document['content'])
return {
'id': document['id'],
'content': content,
'format': 'json',
'mime_type': 'application/json'
}
except json.JSONDecodeError:
return {
'id': document['id'],
'content': document['content'],
'format': 'json',
'mime_type': 'application/json'
}
else:
raise ValueError(f"Unsupported conversion: {current_format} to {target_format}")
except Exception as e:
logger.error(f"Error converting document: {str(e)}")
raise
def _generate_document_id(self, document: Dict[str, Any]) -> str:
"""Generate unique document ID"""
if 'content' in document:
content = str(document['content'])
return hashlib.md5(content.encode()).hexdigest()
return f"doc_{int(datetime.now(UTC).timestamp())}"
async def cleanup(self) -> None:
"""Clean up temporary files and cache"""
try:
self._clear_temp_directory()
self.document_cache.clear()
except Exception as e:
logger.error(f"Error during cleanup: {str(e)}")
# Module-level accessor for the shared DocumentManager
def getDocumentManager():
    """Return the shared DocumentManager obtained from ``DocumentManager.getInstance()``."""
    manager = DocumentManager.getInstance()
    return manager