478 lines
No EOL
18 KiB
Python
478 lines
No EOL
18 KiB
Python
from typing import Dict, Any, Optional, List
|
|
import logging
|
|
import json
|
|
import os
|
|
from datetime import datetime, UTC
|
|
from pathlib import Path
|
|
import mimetypes
|
|
import hashlib
|
|
import shutil
|
|
import uuid
|
|
import base64
|
|
|
|
from modules.workflow.processorDocument import DocumentProcessor
|
|
from modules.shared.configuration import APP_CONFIG
|
|
from modules.interfaces.serviceChatModel import ChatDocument, ChatContent
|
|
|
|
# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)
|
|
|
|
class DocumentManager:
|
|
"""Document manager with enhanced operations and file handling"""
|
|
|
|
_instance = None
|
|
|
|
@classmethod
|
|
def getInstance(cls):
|
|
"""Return a singleton instance of the document manager."""
|
|
if cls._instance is None:
|
|
cls._instance = cls()
|
|
return cls._instance
|
|
|
|
def __init__(self):
|
|
"""Initialize document manager"""
|
|
if DocumentManager._instance is not None:
|
|
raise RuntimeError("Singleton instance already exists - use getInstance()")
|
|
|
|
self.processor = DocumentProcessor()
|
|
self.document_cache = {}
|
|
self.temp_dir = Path(APP_CONFIG.get('temp_dir', 'temp'))
|
|
self.output_dir = Path(APP_CONFIG.get('output_dir', 'output'))
|
|
self.service = None
|
|
|
|
async def initialize(self, context: Dict[str, Any], service=None) -> None:
|
|
"""Initialize document manager with context and service"""
|
|
# Initialize processor
|
|
self.processor.initialize(context)
|
|
|
|
# Initialize service container
|
|
if service:
|
|
# Validate required interfaces
|
|
required_interfaces = ['base', 'msft', 'google']
|
|
missing_interfaces = []
|
|
for interface in required_interfaces:
|
|
if not hasattr(service, interface):
|
|
missing_interfaces.append(interface)
|
|
|
|
if missing_interfaces:
|
|
logger.warning(f"Service container missing required interfaces: {', '.join(missing_interfaces)}")
|
|
return False
|
|
|
|
self.service = service
|
|
|
|
# Create directories if they don't exist
|
|
self.temp_dir.mkdir(parents=True, exist_ok=True)
|
|
self.output_dir.mkdir(parents=True, exist_ok=True)
|
|
|
|
# Clear temporary directory
|
|
self._clear_temp_directory()
|
|
|
|
def _clear_temp_directory(self) -> None:
|
|
"""Clear temporary directory"""
|
|
try:
|
|
if self.temp_dir.exists():
|
|
shutil.rmtree(self.temp_dir)
|
|
self.temp_dir.mkdir(parents=True)
|
|
except Exception as e:
|
|
logger.error(f"Error clearing temp directory: {str(e)}")
|
|
|
|
async def process_document(self, document: Dict[str, Any], context: Dict[str, Any]) -> Dict[str, Any]:
|
|
"""Process a document with context"""
|
|
try:
|
|
# Generate document ID if not present
|
|
if 'id' not in document:
|
|
document['id'] = self._generate_document_id(document)
|
|
|
|
# Process document content
|
|
processed = await self.processor.process_with_context(document, context)
|
|
|
|
# Add metadata
|
|
processed['metadata'] = {
|
|
'processedAt': datetime.now(UTC).isoformat(),
|
|
'processor': 'DocumentManager',
|
|
'version': '1.0'
|
|
}
|
|
|
|
# Cache document
|
|
self.document_cache[document['id']] = processed
|
|
|
|
return processed
|
|
|
|
except Exception as e:
|
|
logger.error(f"Error processing document: {str(e)}")
|
|
return {
|
|
'id': document.get('id', ''),
|
|
'error': str(e),
|
|
'status': 'error'
|
|
}
|
|
|
|
async def extract_content(self, file_id: str) -> Optional[ChatDocument]:
|
|
"""Extract content from a file"""
|
|
try:
|
|
# Get file content
|
|
file_content = await self.get_file_content(file_id)
|
|
if not file_content:
|
|
return None
|
|
|
|
# Get file metadata
|
|
file_metadata = await self.get_file_metadata(file_id)
|
|
if not file_metadata:
|
|
return None
|
|
|
|
# Create ChatDocument
|
|
return ChatDocument(
|
|
id=str(uuid.uuid4()),
|
|
fileId=file_id,
|
|
filename=file_metadata.get("name", "Unknown"),
|
|
fileSize=file_metadata.get("size", 0),
|
|
content=file_content.decode('utf-8', errors='ignore'),
|
|
mimeType=file_metadata.get("mimeType", "text/plain")
|
|
)
|
|
except Exception as e:
|
|
logger.error(f"Error extracting content from file {file_id}: {str(e)}")
|
|
return None
|
|
|
|
async def get_file_content(self, file_id: str) -> Optional[bytes]:
|
|
"""Get file content"""
|
|
try:
|
|
if not self.service or not self.service.functions:
|
|
logger.error("Service or functions not initialized")
|
|
return None
|
|
return self.service.functions.getFileData(file_id)
|
|
except Exception as e:
|
|
logger.error(f"Error getting file content for {file_id}: {str(e)}")
|
|
return None
|
|
|
|
async def get_file_metadata(self, file_id: str) -> Optional[Dict[str, Any]]:
|
|
"""Get file metadata"""
|
|
try:
|
|
if not self.service or not self.service.functions:
|
|
logger.error("Service or functions not initialized")
|
|
return None
|
|
return self.service.functions.getFile(file_id)
|
|
except Exception as e:
|
|
logger.error(f"Error getting file metadata for {file_id}: {str(e)}")
|
|
return None
|
|
|
|
async def save_file(self, filename: str, content: bytes, mime_type: str) -> Optional[int]:
|
|
"""Save a new file"""
|
|
try:
|
|
if not self.service or not self.service.base:
|
|
logger.error("Service or base interface not initialized")
|
|
return None
|
|
return await self.service.base.saveFile(filename, content, mime_type)
|
|
except Exception as e:
|
|
logger.error(f"Error saving file {filename}: {str(e)}")
|
|
return None
|
|
|
|
async def delete_file(self, file_id: str) -> bool:
|
|
"""Delete a file"""
|
|
try:
|
|
if not self.service or not self.service.functions:
|
|
logger.error("Service or functions not initialized")
|
|
return False
|
|
return self.service.functions.deleteFile(file_id)
|
|
except Exception as e:
|
|
logger.error(f"Error deleting file {file_id}: {str(e)}")
|
|
return False
|
|
|
|
def convert_file_ref_to_id(self, ref: str) -> Optional[int]:
|
|
"""Convert file reference to ID"""
|
|
try:
|
|
if isinstance(ref, str) and ';' in ref:
|
|
return int(ref.split(';')[1])
|
|
return int(ref)
|
|
except Exception as e:
|
|
logger.error(f"Error converting file reference to ID: {str(e)}")
|
|
return None
|
|
|
|
def convert_file_id_to_ref(self, file_id: str) -> Optional[str]:
|
|
"""Convert file ID to reference"""
|
|
try:
|
|
if not self.service or not self.service.functions:
|
|
logger.error("Service or functions not initialized")
|
|
return None
|
|
|
|
file = self.service.functions.getFile(file_id)
|
|
if not file:
|
|
return None
|
|
return f"{file.filename};{file_id}"
|
|
except Exception as e:
|
|
logger.error(f"Error converting file ID to reference: {str(e)}")
|
|
return None
|
|
|
|
async def convert_data_format(self, data: Any, format: str) -> Any:
|
|
"""Convert data between formats"""
|
|
try:
|
|
if format == 'json':
|
|
if isinstance(data, str):
|
|
return json.loads(data)
|
|
return json.dumps(data)
|
|
elif format == 'base64':
|
|
if isinstance(data, str):
|
|
return base64.b64encode(data.encode('utf-8')).decode('utf-8')
|
|
return base64.b64encode(data).decode('utf-8')
|
|
return data
|
|
except Exception as e:
|
|
logger.error(f"Error converting data format: {str(e)}")
|
|
return data
|
|
|
|
async def create_agent_input_file_list(self, files: List[str]) -> List[Dict[str, Any]]:
|
|
"""Create list of input files for agent processing"""
|
|
try:
|
|
input_files = []
|
|
for file in files:
|
|
file_id = await self.convert_file_ref_to_id(file)
|
|
if file_id:
|
|
file_data = await self.get_file_metadata(file_id)
|
|
if file_data:
|
|
content = await self.get_file_content(file_id)
|
|
input_files.append({
|
|
'id': file_id,
|
|
'name': file_data['name'],
|
|
'mimeType': file_data['mimeType'],
|
|
'content': content
|
|
})
|
|
return input_files
|
|
except Exception as e:
|
|
logger.error(f"Error creating agent input file list: {str(e)}")
|
|
return []
|
|
|
|
async def save_agent_output_files(self, files: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
|
|
"""Save output files from agent processing"""
|
|
try:
|
|
saved_files = []
|
|
for file in files:
|
|
file_meta = await self.save_file(
|
|
filename=file['name'],
|
|
content=file['content'],
|
|
mimeType=file.get('mimeType', 'application/octet-stream')
|
|
)
|
|
|
|
if file_meta:
|
|
saved_files.append({
|
|
'id': file_meta,
|
|
'name': file['name'],
|
|
'mimeType': file.get('mimeType', 'application/octet-stream')
|
|
})
|
|
return saved_files
|
|
except Exception as e:
|
|
logger.error(f"Error saving agent output files: {str(e)}")
|
|
return []
|
|
|
|
async def content_with_prompt(self, document: Dict[str, Any], prompt: str) -> Optional[Dict[str, Any]]:
|
|
"""Extract content using AI with specific prompt"""
|
|
try:
|
|
# Get document content
|
|
chat_doc = await self.extract_content(document.get('id'))
|
|
if not chat_doc:
|
|
return None
|
|
|
|
# Prepare content
|
|
content = chat_doc.content
|
|
mime_type = chat_doc.mimeType
|
|
|
|
# Process large files in chunks
|
|
if len(content) > 100000:
|
|
chunks = self._split_content_into_chunks(content, mime_type)
|
|
extracted_chunks = []
|
|
|
|
for chunk in chunks:
|
|
chunk_result = await self._process_content_chunk(chunk, prompt)
|
|
if chunk_result:
|
|
extracted_chunks.append(chunk_result)
|
|
|
|
return {
|
|
"content": self._merge_chunk_results(extracted_chunks),
|
|
"metadata": {
|
|
"original_size": len(content),
|
|
"chunks_processed": len(chunks),
|
|
"mime_type": mime_type
|
|
}
|
|
}
|
|
else:
|
|
result = await self._process_content_chunk(content, prompt)
|
|
return {
|
|
"content": result,
|
|
"metadata": {
|
|
"original_size": len(content),
|
|
"chunks_processed": 1,
|
|
"mime_type": mime_type
|
|
}
|
|
}
|
|
|
|
except Exception as e:
|
|
logger.error(f"Error in content_with_prompt: {str(e)}")
|
|
return None
|
|
|
|
def _split_content_into_chunks(self, content: str, mime_type: str) -> List[str]:
|
|
"""Split content into manageable chunks"""
|
|
try:
|
|
if mime_type.startswith('text/'):
|
|
return [chunk.strip() for chunk in content.split('\n\n') if chunk.strip()]
|
|
elif mime_type == 'application/json':
|
|
data = json.loads(content)
|
|
if isinstance(data, list):
|
|
return [json.dumps(item) for item in data]
|
|
return [content]
|
|
else:
|
|
return [content[i:i+10000] for i in range(0, len(content), 10000)]
|
|
except Exception as e:
|
|
logger.error(f"Error splitting content: {str(e)}")
|
|
return [content]
|
|
|
|
async def _process_content_chunk(self, chunk: str, prompt: str) -> Optional[str]:
|
|
"""Process content chunk with AI"""
|
|
try:
|
|
if not self.service or not self.service.base:
|
|
logger.error("Service or base interface not initialized")
|
|
return None
|
|
|
|
ai_prompt = f"""
|
|
Extract relevant information from this content based on the following prompt:
|
|
|
|
PROMPT: {prompt}
|
|
|
|
CONTENT:
|
|
{chunk}
|
|
|
|
Return ONLY the extracted information in a clear, concise format.
|
|
"""
|
|
|
|
response = await self.service.base.callAi([
|
|
{"role": "system", "content": "You are an expert at extracting relevant information from documents."},
|
|
{"role": "user", "content": ai_prompt}
|
|
])
|
|
|
|
return response.strip()
|
|
|
|
except Exception as e:
|
|
logger.error(f"Error processing content chunk: {str(e)}")
|
|
return None
|
|
|
|
def _merge_chunk_results(self, chunks: List[str]) -> str:
|
|
"""Merge processed content chunks"""
|
|
try:
|
|
chunks = [chunk for chunk in chunks if chunk and chunk.strip()]
|
|
return "\n\n".join(chunks)
|
|
except Exception as e:
|
|
logger.error(f"Error merging chunk results: {str(e)}")
|
|
return ""
|
|
|
|
async def save_document(self, document: Dict[str, Any], format: str = 'json') -> str:
|
|
"""Save document to output directory"""
|
|
try:
|
|
filename = f"{document['id']}.{format}"
|
|
filepath = self.output_dir / filename
|
|
|
|
if format == 'json':
|
|
with open(filepath, 'w', encoding='utf-8') as f:
|
|
json.dump(document, f, indent=2)
|
|
else:
|
|
content = document.get('content', '')
|
|
if isinstance(content, str):
|
|
with open(filepath, 'w', encoding='utf-8') as f:
|
|
f.write(content)
|
|
else:
|
|
with open(filepath, 'wb') as f:
|
|
f.write(content)
|
|
|
|
return str(filepath)
|
|
|
|
except Exception as e:
|
|
logger.error(f"Error saving document: {str(e)}")
|
|
raise
|
|
|
|
async def load_document(self, filepath: str) -> Dict[str, Any]:
|
|
"""Load document from file"""
|
|
try:
|
|
path = Path(filepath)
|
|
if not path.exists():
|
|
raise FileNotFoundError(f"Document not found: {filepath}")
|
|
|
|
format = path.suffix[1:].lower()
|
|
|
|
if format == 'json':
|
|
with open(path, 'r', encoding='utf-8') as f:
|
|
document = json.load(f)
|
|
else:
|
|
mime_type = mimetypes.guess_type(filepath)[0]
|
|
if mime_type and mime_type.startswith('text/'):
|
|
with open(path, 'r', encoding='utf-8') as f:
|
|
content = f.read()
|
|
else:
|
|
with open(path, 'rb') as f:
|
|
content = f.read()
|
|
|
|
document = {
|
|
'id': path.stem,
|
|
'content': content,
|
|
'format': format,
|
|
'mime_type': mime_type
|
|
}
|
|
|
|
document['metadata'] = {
|
|
'loadedAt': datetime.now(UTC).isoformat(),
|
|
'filepath': str(path),
|
|
'size': path.stat().st_size
|
|
}
|
|
|
|
return document
|
|
|
|
except Exception as e:
|
|
logger.error(f"Error loading document: {str(e)}")
|
|
raise
|
|
|
|
async def convert_document(self, document: Dict[str, Any], target_format: str) -> Dict[str, Any]:
|
|
"""Convert document to target format"""
|
|
try:
|
|
current_format = document.get('format', 'json')
|
|
|
|
if current_format == 'json' and target_format == 'text':
|
|
content = json.dumps(document, indent=2)
|
|
return {
|
|
'id': document['id'],
|
|
'content': content,
|
|
'format': 'text',
|
|
'mime_type': 'text/plain'
|
|
}
|
|
elif current_format == 'text' and target_format == 'json':
|
|
try:
|
|
content = json.loads(document['content'])
|
|
return {
|
|
'id': document['id'],
|
|
'content': content,
|
|
'format': 'json',
|
|
'mime_type': 'application/json'
|
|
}
|
|
except json.JSONDecodeError:
|
|
return {
|
|
'id': document['id'],
|
|
'content': document['content'],
|
|
'format': 'json',
|
|
'mime_type': 'application/json'
|
|
}
|
|
else:
|
|
raise ValueError(f"Unsupported conversion: {current_format} to {target_format}")
|
|
|
|
except Exception as e:
|
|
logger.error(f"Error converting document: {str(e)}")
|
|
raise
|
|
|
|
def _generate_document_id(self, document: Dict[str, Any]) -> str:
|
|
"""Generate unique document ID"""
|
|
if 'content' in document:
|
|
content = str(document['content'])
|
|
return hashlib.md5(content.encode()).hexdigest()
|
|
return f"doc_{int(datetime.now(UTC).timestamp())}"
|
|
|
|
async def cleanup(self) -> None:
|
|
"""Clean up temporary files and cache"""
|
|
try:
|
|
self._clear_temp_directory()
|
|
self.document_cache.clear()
|
|
except Exception as e:
|
|
logger.error(f"Error during cleanup: {str(e)}")
|
|
|
|
# Module-level accessor for the shared document manager
def getDocumentManager():
    """Return the instance provided by ``DocumentManager.getInstance()``."""
    manager = DocumentManager.getInstance()
    return manager