""" Enhanced document handling module for the Agentservice (continued). """ import os import logging import uuid from datetime import datetime from typing import List, Dict, Any, Optional, Tuple, Union logger = logging.getLogger(__name__) class DocumentHandler: """ Centralized document handler for consistent document management across the system. """ def __init__(self, workflow_id: str = None, lucydom_interface = None, ai_service = None): """Initialize the document handler.""" self.workflow_id = workflow_id self.lucydom_interface = lucydom_interface self.ai_service = ai_service # Import necessary utilities from modules.agentservice_filemanager import get_file_manager self.file_manager = get_file_manager() def set_workflow_id(self, workflow_id: str): """Set or update the workflow ID.""" self.workflow_id = workflow_id def set_lucydom_interface(self, lucydom_interface): """Set or update the LucyDOM interface.""" self.lucydom_interface = lucydom_interface def set_ai_service(self, ai_service): """Set or update the AI service.""" self.ai_service = ai_service async def add_file_to_message(self, message: Dict[str, Any], file_id: int, extraction_prompt: str = None) -> Dict[str, Any]: """ Add a file to a message with optional contextual extraction. Args: message: The message to add the file to file_id: ID of the file to add extraction_prompt: Optional prompt for contextual extraction (e.g., for images) Returns: Updated message with the file added """ if not self.lucydom_interface: logger.error("LucyDOM interface not available") return message try: # Get file metadata file = self.lucydom_interface.get_file(file_id) if not file: logger.warning(f"File with ID {file_id} not found") return message # Get necessary file information file_name = file.get("name", "unnamed_file") file_type = file.get("type", "unknown") content_type = file.get("content_type") # Initialize documents array if needed if "documents" not in message: message["documents"] = [] # Check if file is already in the message file_already_added = any( doc.get("source", {}).get("id") == str(file_id) for doc in message.get("documents", []) ) if file_already_added: logger.info(f"File {file_name} already exists in message, skipping") return message # Create a unique document ID doc_id = f"doc_{uuid.uuid4()}" # Create document structure document = { "id": doc_id, "source": { "type": "file", "id": str(file_id), "name": file_name, "content_type": content_type, "size": file.get("size"), "upload_date": file.get("upload_date", datetime.now().isoformat()) }, "contents": [] } # Only read content if we have extraction prompt or specific types if (extraction_prompt or file_type in ["document", "text"] or (content_type and content_type.startswith("text/"))): # Read file content file_content = await self.lucydom_interface.read_file_content(file_id) if file_content: # Process based on file type if file_type == "image" or (content_type and content_type.startswith("image/")): # Image analysis if prompt provided if extraction_prompt and self.ai_service and hasattr(self.ai_service, "analyze_image"): try: image_analysis = await self.ai_service.analyze_image( image_data=file_content, prompt=extraction_prompt or "Describe this image in detail", mime_type=content_type ) # Add the analysis as text content document["contents"].append({ "type": "text", "text": f"Image Analysis:\n{image_analysis}", "is_extracted": True, "extraction_context": extraction_prompt }) logger.info(f"Added image analysis for {file_name} to message") except Exception as e: logger.error(f"Error analyzing image {file_name}: {str(e)}") document["contents"].append({ "type": "text", "text": f"Image file: {file_name} (Analysis failed: {str(e)})", "is_extracted": False }) else: # Just add placeholder if no analysis available document["contents"].append({ "type": "text", "text": f"Image file: {file_name} (no analysis requested)", "is_extracted": False }) else: # For other file types, extract text from modules.agentservice_utils import extract_text_from_file_content content, is_extracted = extract_text_from_file_content( file_content, file_name, content_type ) document["contents"].append({ "type": "text", "text": content, "is_extracted": is_extracted, "extraction_context": extraction_prompt }) logger.info(f"Added text content for {file_name} to message (extracted: {is_extracted})") else: # No content available document["contents"].append({ "type": "text", "text": f"File content not available for {file_name}", "is_extracted": False }) else: # Just add reference without content document["contents"].append({ "type": "text", "text": f"File: {file_name} (content not loaded)", "is_extracted": False }) # Add document to message message["documents"].append(document) logger.info(f"File {file_name} successfully added to message") return message except Exception as e: logger.error(f"Error adding file {file_id} to message: {str(e)}") return message async def add_files_to_message(self, message: Dict[str, Any], file_ids: List[int], extraction_prompt: str = None) -> Dict[str, Any]: """ Add multiple files to a message. Args: message: The message to add files to file_ids: List of file IDs to add extraction_prompt: Optional prompt for contextual extraction Returns: Updated message with files added """ updated_message = message.copy() for file_id in file_ids: updated_message = await self.add_file_to_message(updated_message, file_id, extraction_prompt) return updated_message async def extract_document_content(self, doc_id: str, message: Dict[str, Any], extraction_prompt: str) -> Dict[str, Any]: """ Extract or update document content with contextual extraction. Args: doc_id: ID of the document to extract message: Message containing the document extraction_prompt: Contextual prompt for extraction Returns: Updated message with extracted content """ if not message or "documents" not in message: return message updated_message = message.copy() # Find the document for i, document in enumerate(updated_message.get("documents", [])): if document.get("id") == doc_id: # Get file ID from source source = document.get("source", {}) file_id = source.get("id") if file_id and self.lucydom_interface: # Get file metadata file = self.lucydom_interface.get_file(int(file_id)) if not file: continue # Get file content file_content = await self.lucydom_interface.read_file_content(int(file_id)) if not file_content: continue # Process based on file type file_name = file.get("name", "unnamed_file") file_type = file.get("type", "unknown") content_type = file.get("content_type") # Update content based on file type if file_type == "image" or (content_type and content_type.startswith("image/")): if self.ai_service and hasattr(self.ai_service, "analyze_image"): try: image_analysis = await self.ai_service.analyze_image( image_data=file_content, prompt=extraction_prompt, mime_type=content_type ) # Create or update content new_content = { "type": "text", "text": f"Image Analysis:\n{image_analysis}", "is_extracted": True, "extraction_context": extraction_prompt } # Update or add content contents = document.get("contents", []) contents_updated = False for j, content in enumerate(contents): if content.get("type") == "text": updated_message["documents"][i]["contents"][j] = new_content contents_updated = True break if not contents_updated: if not updated_message["documents"][i].get("contents"): updated_message["documents"][i]["contents"] = [] updated_message["documents"][i]["contents"].append(new_content) logger.info(f"Updated image analysis for {file_name} with new context: {extraction_prompt}") except Exception as e: logger.error(f"Error updating image analysis for {file_name}: {str(e)}") else: # For other file types, extract text with new context from modules.agentservice_utils import extract_text_from_file_content content, is_extracted = extract_text_from_file_content( file_content, file_name, content_type ) new_content = { "type": "text", "text": content, "is_extracted": is_extracted, "extraction_context": extraction_prompt } # Update or add content contents = document.get("contents", []) contents_updated = False for j, content_item in enumerate(contents): if content_item.get("type") == "text": updated_message["documents"][i]["contents"][j] = new_content contents_updated = True break if not contents_updated: if not updated_message["documents"][i].get("contents"): updated_message["documents"][i]["contents"] = [] updated_message["documents"][i]["contents"].append(new_content) logger.info(f"Updated text extraction for {file_name} with new context: {extraction_prompt}") # Found and processed the document, stop searching break return updated_message async def extract_files_from_workflow(self, workflow: Dict[str, Any], extraction_prompt: str, file_filter: str = None) -> Dict[str, Any]: """ Extract all relevant files from a workflow with context-aware extraction. Args: workflow: The workflow object extraction_prompt: Contextual prompt for extraction file_filter: Optional filter for file types (e.g., "csv", "image") Returns: Dictionary with extracted content """ # Import for data extraction from modules.agentservice_dataextraction import data_extraction # Get all files from the workflow files = [] # Process all messages for message in workflow.get("messages", []): # Extract documents from the message for doc in message.get("documents", []): source = doc.get("source", {}) # Only include file documents if source.get("type") == "file": file_info = { "id": source.get("id", ""), "name": source.get("name", ""), "type": source.get("type", ""), "content_type": source.get("content_type", ""), "size": source.get("size", 0) } # Apply filter if provided if file_filter: file_name = file_info.get("name", "").lower() content_type = file_info.get("content_type", "").lower() if (file_filter.lower() in file_name or file_filter.lower() in content_type): # Check if file is already in the list if not any(f.get("id") == file_info["id"] for f in files): files.append(file_info) else: # No filter, include all files if not any(f.get("id") == file_info["id"] for f in files): files.append(file_info) # If no files found, return empty result if not files: return { "prompt": extraction_prompt, "files_processed": 0, "extracted_content": [] } # Get all messages from the workflow workflow_messages = workflow.get("messages", []) # Extract data using the dataextraction module extracted_data = await data_extraction( prompt=extraction_prompt, files=files, messages=workflow_messages, ai_service=self.ai_service, lucydom_interface=self.lucydom_interface, workflow_id=self.workflow_id, add_log_func=None # We don't have access to add_log_func here ) return extracted_data def get_file_content_from_message(self, message: Dict[str, Any], file_id: int = None, doc_id: str = None) -> str: """ Get file content from a message. Args: message: The message containing the document file_id: Optional file ID to search for doc_id: Optional document ID to search for Returns: Text content of the file if available """ if not message or "documents" not in message: return "" # Search for the document for document in message.get("documents", []): # Match by document ID or file ID source = document.get("source", {}) source_file_id = source.get("id") if ((doc_id and document.get("id") == doc_id) or (file_id and source_file_id and str(file_id) == str(source_file_id))): # Get text content from document for content in document.get("contents", []): if content.get("type") == "text": return content.get("text", "") return "" def create_text_document(self, message: Dict[str, Any], content: str, title: str = "Generated Text") -> Dict[str, Any]: """ Create a new text document in a message. Args: message: The message to add the document to content: Text content title: Document title Returns: Updated message with the new document """ # Initialize documents array if needed updated_message = message.copy() if "documents" not in updated_message: updated_message["documents"] = [] # Create document ID doc_id = f"doc_{uuid.uuid4()}" # Create document structure document = { "id": doc_id, "source": { "type": "generated", "id": doc_id, "name": title, "content_type": "text/plain", "size": len(content) }, "contents": [ { "type": "text", "text": content, "is_extracted": True } ] } # Add document to message updated_message["documents"].append(document) logger.info(f"Created text document '{title}' in message") return updated_message def merge_document_contents(self, message: Dict[str, Any]) -> str: """ Merge all document contents from a message into a single text. Args: message: The message containing documents Returns: Combined text content from all documents """ if not message or "documents" not in message: return "" combined_text = "" for document in message.get("documents", []): source = document.get("source", {}) doc_name = source.get("name", "Unnamed Document") # Extract text content doc_text = "" for content in document.get("contents", []): if content.get("type") == "text": doc_text = content.get("text", "") break if doc_text: combined_text += f"\n\n--- {doc_name} ---\n\n{doc_text}" return combined_text.strip() # Factory function def get_document_handler(workflow_id: str = None, lucydom_interface = None, ai_service = None) -> DocumentHandler: """Get a document handler instance.""" return DocumentHandler(workflow_id, lucydom_interface, ai_service)