from typing import Dict, Any, Optional, List
import logging
import json
import os
from datetime import datetime, UTC
from pathlib import Path
import mimetypes
import hashlib
import shutil
import uuid
import base64

from modules.workflow.processorDocument import DocumentProcessor
from modules.shared.configuration import APP_CONFIG
from modules.interfaces.serviceChatModel import ChatDocument, ChatContent

logger = logging.getLogger(__name__)


class DocumentManager:
    """Document manager with enhanced operations and file handling.

    Singleton: obtain the shared instance via ``getInstance()`` (or the
    module-level ``getDocumentManager()`` factory); constructing a second
    instance directly raises ``RuntimeError``.
    """

    _instance = None  # singleton storage; populated lazily by getInstance()

    @classmethod
    def getInstance(cls):
        """Return a singleton instance of the document manager."""
        if cls._instance is None:
            cls._instance = cls()
        return cls._instance

    def __init__(self):
        """Initialize document manager.

        Raises:
            RuntimeError: if the singleton instance already exists.
        """
        if DocumentManager._instance is not None:
            raise RuntimeError("Singleton instance already exists - use getInstance()")
        self.processor = DocumentProcessor()
        # In-memory cache of processed documents, keyed by document id.
        self.document_cache: Dict[str, Dict[str, Any]] = {}
        self.temp_dir = Path(APP_CONFIG.get('temp_dir', 'temp'))
        self.output_dir = Path(APP_CONFIG.get('output_dir', 'output'))
        # Service container; set by initialize() when validation succeeds.
        self.service = None

    async def initialize(self, context: Dict[str, Any], service=None) -> None:
        """Initialize document manager with context and service.

        Args:
            context: processor initialization context (forwarded verbatim).
            service: optional service container expected to expose the
                interfaces listed below.

        NOTE(review): despite the ``-> None`` annotation this returns False
        when interfaces are missing, and in that case also skips directory
        creation and temp cleanup — confirm callers depend on this early exit.
        NOTE(review): validation checks 'base'/'msft'/'google', but other
        methods access ``service.functions``, which is never validated —
        confirm the expected container shape.
        """
        # Initialize processor first; it does not depend on the service.
        self.processor.initialize(context)

        if service:
            # Validate required interfaces before accepting the container.
            required_interfaces = ['base', 'msft', 'google']
            missing_interfaces = [
                interface for interface in required_interfaces
                if not hasattr(service, interface)
            ]
            if missing_interfaces:
                logger.warning(f"Service container missing required interfaces: {', '.join(missing_interfaces)}")
                return False
            self.service = service

        # Create working directories if they don't exist.
        self.temp_dir.mkdir(parents=True, exist_ok=True)
        self.output_dir.mkdir(parents=True, exist_ok=True)

        # Start from a clean temporary directory.
        self._clear_temp_directory()

    def _clear_temp_directory(self) -> None:
        """Remove and recreate the temp directory; failures are logged only."""
        try:
            if self.temp_dir.exists():
                shutil.rmtree(self.temp_dir)
                self.temp_dir.mkdir(parents=True)
        except Exception as e:
            logger.error(f"Error clearing temp directory: {str(e)}")

    async def process_document(self, document: Dict[str, Any], context: Dict[str, Any]) -> Dict[str, Any]:
        """Process a document with context.

        Ensures the document has an id, delegates to the processor, stamps
        processing metadata, and caches the result. On failure returns an
        error dict instead of raising.

        Returns:
            The processed document dict, or ``{'id', 'error', 'status'}`` on
            failure.
        """
        try:
            # Generate a content-derived id if the caller did not supply one.
            if 'id' not in document:
                document['id'] = self._generate_document_id(document)

            processed = await self.processor.process_with_context(document, context)

            processed['metadata'] = {
                'processedAt': datetime.now(UTC).isoformat(),
                'processor': 'DocumentManager',
                'version': '1.0'
            }

            self.document_cache[document['id']] = processed
            return processed
        except Exception as e:
            logger.error(f"Error processing document: {str(e)}")
            return {
                'id': document.get('id', ''),
                'error': str(e),
                'status': 'error'
            }

    async def extract_content(self, file_id: str) -> Optional[ChatDocument]:
        """Extract content from a file into a ChatDocument.

        Returns None when content or metadata cannot be fetched, or on error.
        Binary content is decoded as UTF-8 with undecodable bytes dropped.
        """
        try:
            file_content = await self.get_file_content(file_id)
            if not file_content:
                return None

            file_metadata = await self.get_file_metadata(file_id)
            if not file_metadata:
                return None

            return ChatDocument(
                id=str(uuid.uuid4()),
                fileId=file_id,
                filename=file_metadata.get("name", "Unknown"),
                fileSize=file_metadata.get("size", 0),
                content=file_content.decode('utf-8', errors='ignore'),
                mimeType=file_metadata.get("mimeType", "text/plain")
            )
        except Exception as e:
            logger.error(f"Error extracting content from file {file_id}: {str(e)}")
            return None

    async def get_file_content(self, file_id: str) -> Optional[bytes]:
        """Get file content bytes via the service, or None on failure.

        NOTE(review): getFileData is called without await — presumably a
        synchronous API; confirm against the service interface.
        """
        try:
            if not self.service or not self.service.functions:
                logger.error("Service or functions not initialized")
                return None
            return self.service.functions.getFileData(file_id)
        except Exception as e:
            logger.error(f"Error getting file content for {file_id}: {str(e)}")
            return None

    async def get_file_metadata(self, file_id: str) -> Optional[Dict[str, Any]]:
        """Get file metadata via the service, or None on failure."""
        try:
            if not self.service or not self.service.functions:
                logger.error("Service or functions not initialized")
                return None
            return self.service.functions.getFile(file_id)
        except Exception as e:
            logger.error(f"Error getting file metadata for {file_id}: {str(e)}")
            return None

    async def save_file(self, filename: str, content: bytes, mime_type: str) -> Optional[int]:
        """Save a new file via the base service interface.

        Returns:
            The new file's id, or None on failure.
        """
        try:
            if not self.service or not self.service.base:
                logger.error("Service or base interface not initialized")
                return None
            return await self.service.base.saveFile(filename, content, mime_type)
        except Exception as e:
            logger.error(f"Error saving file (unknown): {str(e)}")
            return None

    async def delete_file(self, file_id: str) -> bool:
        """Delete a file via the service; returns True on success."""
        try:
            if not self.service or not self.service.functions:
                logger.error("Service or functions not initialized")
                return False
            return self.service.functions.deleteFile(file_id)
        except Exception as e:
            logger.error(f"Error deleting file {file_id}: {str(e)}")
            return False

    def convert_file_ref_to_id(self, ref: str) -> Optional[int]:
        """Convert a file reference to its numeric id.

        Accepts either a bare id string or a "name;id" reference; returns
        None when parsing fails.
        """
        try:
            if isinstance(ref, str) and ';' in ref:
                return int(ref.split(';')[1])
            return int(ref)
        except Exception as e:
            logger.error(f"Error converting file reference to ID: {str(e)}")
            return None

    def convert_file_id_to_ref(self, file_id: str) -> Optional[str]:
        """Convert a file id to a "filename;id" reference, or None on failure.

        NOTE(review): reads ``file.filename`` as an attribute while
        create_agent_input_file_list treats getFile()'s result as a dict
        (``file_data['name']``) — one of these is likely wrong; confirm the
        getFile return type.
        """
        try:
            if not self.service or not self.service.functions:
                logger.error("Service or functions not initialized")
                return None
            file = self.service.functions.getFile(file_id)
            if not file:
                return None
            return f"{file.filename};{file_id}"
        except Exception as e:
            logger.error(f"Error converting file ID to reference: {str(e)}")
            return None

    async def convert_data_format(self, data: Any, format: str) -> Any:
        """Convert data between formats ('json' or 'base64').

        For 'json': str input is parsed, anything else is serialized.
        For 'base64': str input is UTF-8 encoded first, bytes are encoded
        directly. Unknown formats (and errors) return the input unchanged.
        """
        try:
            if format == 'json':
                if isinstance(data, str):
                    return json.loads(data)
                return json.dumps(data)
            elif format == 'base64':
                if isinstance(data, str):
                    return base64.b64encode(data.encode('utf-8')).decode('utf-8')
                return base64.b64encode(data).decode('utf-8')
            return data
        except Exception as e:
            logger.error(f"Error converting data format: {str(e)}")
            return data

    async def create_agent_input_file_list(self, files: List[str]) -> List[Dict[str, Any]]:
        """Create list of input files for agent processing.

        Unresolvable references and files without metadata are skipped
        silently; returns [] on unexpected errors.
        """
        try:
            input_files = []
            for file in files:
                # BUGFIX: convert_file_ref_to_id is synchronous — the original
                # awaited it, which raises TypeError on a non-awaitable.
                file_id = self.convert_file_ref_to_id(file)
                if file_id:
                    file_data = await self.get_file_metadata(file_id)
                    if file_data:
                        content = await self.get_file_content(file_id)
                        input_files.append({
                            'id': file_id,
                            'name': file_data['name'],
                            'mimeType': file_data['mimeType'],
                            'content': content
                        })
            return input_files
        except Exception as e:
            logger.error(f"Error creating agent input file list: {str(e)}")
            return []

    async def save_agent_output_files(self, files: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """Save output files from agent processing.

        Each input dict needs 'name' and 'content'; 'mimeType' defaults to
        application/octet-stream. Returns descriptors of successfully saved
        files, or [] on unexpected errors.
        """
        try:
            saved_files = []
            for file in files:
                # BUGFIX: save_file's parameter is mime_type; the original
                # passed mimeType=, which raises TypeError.
                file_meta = await self.save_file(
                    filename=file['name'],
                    content=file['content'],
                    mime_type=file.get('mimeType', 'application/octet-stream')
                )
                if file_meta:
                    saved_files.append({
                        'id': file_meta,
                        'name': file['name'],
                        'mimeType': file.get('mimeType', 'application/octet-stream')
                    })
            return saved_files
        except Exception as e:
            logger.error(f"Error saving agent output files: {str(e)}")
            return []

    async def content_with_prompt(self, document: Dict[str, Any], prompt: str) -> Optional[Dict[str, Any]]:
        """Extract content using AI with a specific prompt.

        Large documents (>100k chars) are split into chunks, processed
        individually, and the results merged. Returns a dict with 'content'
        and 'metadata', or None on failure.
        """
        try:
            chat_doc = await self.extract_content(document.get('id'))
            if not chat_doc:
                return None

            content = chat_doc.content
            mime_type = chat_doc.mimeType

            # Process large files in chunks to stay within model limits.
            if len(content) > 100000:
                chunks = self._split_content_into_chunks(content, mime_type)
                extracted_chunks = []
                for chunk in chunks:
                    chunk_result = await self._process_content_chunk(chunk, prompt)
                    if chunk_result:
                        extracted_chunks.append(chunk_result)
                return {
                    "content": self._merge_chunk_results(extracted_chunks),
                    "metadata": {
                        "original_size": len(content),
                        "chunks_processed": len(chunks),
                        "mime_type": mime_type
                    }
                }
            else:
                result = await self._process_content_chunk(content, prompt)
                return {
                    "content": result,
                    "metadata": {
                        "original_size": len(content),
                        "chunks_processed": 1,
                        "mime_type": mime_type
                    }
                }
        except Exception as e:
            logger.error(f"Error in content_with_prompt: {str(e)}")
            return None

    def _split_content_into_chunks(self, content: str, mime_type: str) -> List[str]:
        """Split content into manageable chunks.

        text/* splits on blank lines, JSON arrays split per element, and
        everything else is cut into fixed 10k-char slices. Falls back to a
        single-chunk list on error.
        """
        try:
            if mime_type.startswith('text/'):
                # Paragraph-level split; drop empty fragments.
                return [chunk.strip() for chunk in content.split('\n\n') if chunk.strip()]
            elif mime_type == 'application/json':
                data = json.loads(content)
                if isinstance(data, list):
                    return [json.dumps(item) for item in data]
                return [content]
            else:
                return [content[i:i + 10000] for i in range(0, len(content), 10000)]
        except Exception as e:
            logger.error(f"Error splitting content: {str(e)}")
            return [content]

    async def _process_content_chunk(self, chunk: str, prompt: str) -> Optional[str]:
        """Process one content chunk with the AI service; None on failure."""
        try:
            if not self.service or not self.service.base:
                logger.error("Service or base interface not initialized")
                return None

            ai_prompt = f"""
Extract relevant information from this content based on the following prompt:

PROMPT: {prompt}

CONTENT: {chunk}

Return ONLY the extracted information in a clear, concise format.
"""
            response = await self.service.base.callAi([
                {"role": "system", "content": "You are an expert at extracting relevant information from documents."},
                {"role": "user", "content": ai_prompt}
            ])
            return response.strip()
        except Exception as e:
            logger.error(f"Error processing content chunk: {str(e)}")
            return None

    def _merge_chunk_results(self, chunks: List[str]) -> str:
        """Merge processed chunks, dropping blanks; "" on error."""
        try:
            chunks = [chunk for chunk in chunks if chunk and chunk.strip()]
            return "\n\n".join(chunks)
        except Exception as e:
            logger.error(f"Error merging chunk results: {str(e)}")
            return ""

    async def save_document(self, document: Dict[str, Any], format: str = 'json') -> str:
        """Save document to the output directory as '<id>.<format>'.

        'json' serializes the whole document; any other format writes only
        document['content'] (text or bytes).

        Returns:
            The saved file path as a string.

        Raises:
            Exception: re-raised after logging when the write fails.
        """
        try:
            filename = f"{document['id']}.{format}"
            filepath = self.output_dir / filename

            if format == 'json':
                with open(filepath, 'w', encoding='utf-8') as f:
                    json.dump(document, f, indent=2)
            else:
                content = document.get('content', '')
                if isinstance(content, str):
                    with open(filepath, 'w', encoding='utf-8') as f:
                        f.write(content)
                else:
                    with open(filepath, 'wb') as f:
                        f.write(content)

            return str(filepath)
        except Exception as e:
            logger.error(f"Error saving document: {str(e)}")
            raise

    async def load_document(self, filepath: str) -> Dict[str, Any]:
        """Load a document from a file.

        JSON files are parsed into a dict directly; other files are wrapped
        in a {'id', 'content', 'format', 'mime_type'} dict, read as text when
        the guessed MIME type is text/*, otherwise as bytes. Load metadata is
        stamped onto the result.

        Raises:
            FileNotFoundError: if the path does not exist.
            Exception: re-raised after logging on any other failure.
        """
        try:
            path = Path(filepath)
            if not path.exists():
                raise FileNotFoundError(f"Document not found: {filepath}")

            format = path.suffix[1:].lower()

            if format == 'json':
                with open(path, 'r', encoding='utf-8') as f:
                    document = json.load(f)
            else:
                mime_type = mimetypes.guess_type(filepath)[0]
                if mime_type and mime_type.startswith('text/'):
                    with open(path, 'r', encoding='utf-8') as f:
                        content = f.read()
                else:
                    with open(path, 'rb') as f:
                        content = f.read()
                document = {
                    'id': path.stem,
                    'content': content,
                    'format': format,
                    'mime_type': mime_type
                }

            document['metadata'] = {
                'loadedAt': datetime.now(UTC).isoformat(),
                'filepath': str(path),
                'size': path.stat().st_size
            }
            return document
        except Exception as e:
            logger.error(f"Error loading document: {str(e)}")
            raise

    async def convert_document(self, document: Dict[str, Any], target_format: str) -> Dict[str, Any]:
        """Convert a document between 'json' and 'text' formats.

        json -> text serializes the whole document; text -> json parses the
        content (keeping it as a raw string if parsing fails).

        Raises:
            ValueError: for unsupported conversion pairs.
        """
        try:
            current_format = document.get('format', 'json')

            if current_format == 'json' and target_format == 'text':
                content = json.dumps(document, indent=2)
                return {
                    'id': document['id'],
                    'content': content,
                    'format': 'text',
                    'mime_type': 'text/plain'
                }
            elif current_format == 'text' and target_format == 'json':
                try:
                    content = json.loads(document['content'])
                    return {
                        'id': document['id'],
                        'content': content,
                        'format': 'json',
                        'mime_type': 'application/json'
                    }
                except json.JSONDecodeError:
                    # Not valid JSON — keep the raw text under the json label.
                    return {
                        'id': document['id'],
                        'content': document['content'],
                        'format': 'json',
                        'mime_type': 'application/json'
                    }
            else:
                raise ValueError(f"Unsupported conversion: {current_format} to {target_format}")
        except Exception as e:
            logger.error(f"Error converting document: {str(e)}")
            raise

    def _generate_document_id(self, document: Dict[str, Any]) -> str:
        """Generate a unique document id.

        Uses an MD5 content fingerprint (non-cryptographic use) when content
        is present, otherwise a timestamp-based id.
        """
        if 'content' in document:
            content = str(document['content'])
            return hashlib.md5(content.encode()).hexdigest()
        return f"doc_{int(datetime.now(UTC).timestamp())}"

    async def cleanup(self) -> None:
        """Clean up temporary files and the document cache."""
        try:
            self._clear_temp_directory()
            self.document_cache.clear()
        except Exception as e:
            logger.error(f"Error during cleanup: {str(e)}")


# Singleton factory for the document manager
def getDocumentManager():
    return DocumentManager.getInstance()