""" Central file management module for the Agentservice. """ import os import logging import base64 import json import uuid from datetime import datetime from typing import List, Dict, Any, Optional, Tuple, Union, BinaryIO from io import BytesIO # Import utilities from agentservice_utils from modules.agentservice_utils import extract_text_from_file_content, is_text_extractable logger = logging.getLogger(__name__) # Helper function for adding logs def _log(add_log_func, workflow_id, message, level="info"): """Helper function for adding logs with standardized formatting.""" if add_log_func and workflow_id: add_log_func(workflow_id, message, level) # Also log to standard logger if level == "info": logger.info(message) elif level == "warning": logger.warning(message) elif level == "error": logger.error(message) class FileExtractionError(Exception): """Exception for file extraction errors.""" pass class FileManager: """Central file management for the Agentservice.""" _instance = None @classmethod def get_instance(cls): """Get the singleton instance of FileManager.""" if cls._instance is None: cls._instance = cls() return cls._instance def __init__(self): """Initialize the FileManager.""" # Ensure singleton pattern if FileManager._instance is not None: raise RuntimeError("Singleton instance already exists - use get_instance()") # Import utilities # Instead of storing file_utils, we'll use the imported functions directly async def read_file_contents(self, file_contexts: List[Dict[str, Any]], lucydom_interface, workflow_id: str = None, add_log_func = None, ai_service = None, extraction_context: str = None # Add this parameter ) -> Dict[str, Dict[str, Any]]: """ Read file contents with optional contextual extraction. Args: file_contexts: List of file contexts with metadata lucydom_interface: LucyDOM interface for file access workflow_id: Optional workflow ID for logging add_log_func: Optional function for adding logs ai_service: AI service for image analysis extraction_context: Optional context prompt for extraction Returns: Dictionary with file contents and metadata """ file_contents = {} # Add debug logging logger.info(f"Reading contents of {len(file_contexts)} files for workflow {workflow_id}") for file in file_contexts: file_id = file["id"] file_name = file["name"] file_type = file.get("type", "unknown") content_type = file.get("content_type") try: # Dateiinhalt über LucyDOM-Interface abrufen file_data = await lucydom_interface.read_file_content(file_id) if not file_data: _log(add_log_func, workflow_id, f"Datei {file_name} nicht gefunden", "warning") file_contents[file_id] = { "content": f"File content not available (File not found)", "is_extracted": False, "name": file_name, "type": file_type, "content_type": content_type } continue logger.info(f"Successfully read file: {file_name} (ID: {file_id}, Type: {file_type})") # For image analysis, add extraction context if file_type == "image" or file_name.lower().endswith(('.jpg', '.jpeg', '.png', '.gif', '.webp')): if ai_service and hasattr(ai_service, "analyze_image"): try: # Use extraction context if provided prompt = extraction_context or "Describe this image in detail" image_analysis = await ai_service.analyze_image( image_data=file_data, prompt=prompt, # Use contextual prompt mime_type=content_type ) file_contents[file_id] = { "content": f"Image Analysis:\n{image_analysis}", "is_extracted": True, # Mark as extracted "name": file_name, "type": file_type, "content_type": content_type, "extraction_context": prompt # Store the used prompt } _log(add_log_func, workflow_id, f"Image {file_name} analyzed successfully", "info") except Exception as e: logger.error(f"Error analyzing image {file_name}: {str(e)}") _log(add_log_func, workflow_id, f"Error analyzing image {file_name}: {str(e)}", "error") file_contents[file_id] = { "content": f"Image file: {file_name} (Analysis failed: {str(e)})", "is_extracted": False, "name": file_name, "type": file_type, "content_type": content_type } else: file_contents[file_id] = { "content": f"Image file: {file_name} (AI analysis not available)", "is_extracted": False, "name": file_name, "type": file_type, "content_type": content_type } # Dokument- und Textdateien elif (file_type == "document" or not file_type or file_name.lower().endswith(('.csv', '.txt', '.json', '.xml')) or (content_type and content_type.startswith('text/'))): # Verwende die zentrale Textextraktionsfunktion mit Dateiinhalt content, is_extracted = extract_text_from_file_content( file_data, file_name, content_type ) file_contents[file_id] = { "content": content, "is_extracted": is_extracted, "name": file_name, "type": file_type, "content_type": content_type } _log(add_log_func, workflow_id, f"File {file_name} read successfully (extracted: {is_extracted})", "info") # Andere Dateitypen - nur Metadaten speichern else: file_contents[file_id] = { "content": f"File: {file_name} (Type: {file_type}, content not available)", "is_extracted": False, "name": file_name, "type": file_type, "content_type": content_type } _log(add_log_func, workflow_id, f"Unsupported file type: {file_type} for {file_name}", "warning") except Exception as e: logger.error(f"Error reading file {file_name}: {str(e)}") _log(add_log_func, workflow_id, f"Error reading file {file_name}: {str(e)}", "error") file_contents[file_id] = { "content": f"File content not available (Error: {str(e)})", "is_extracted": False, "name": file_name, "type": file_type, "content_type": content_type } return file_contents @staticmethod def add_file_to_message(message: Dict[str, Any], file_data: Dict[str, Any]) -> Dict[str, Any]: """ Add a file to a message with consistent document structure. Args: message: The message to add the file to file_data: File metadata and content Returns: Updated message with the file added """ logger.info(f"Adding file to message: {file_data.get('name', 'unnamed_file')} (ID: {file_data.get('id', 'unknown')})") # Initialize documents array if needed if "documents" not in message: message["documents"] = [] # Create a unique ID for the document if not provided doc_id = file_data.get("id", f"file_{uuid.uuid4()}") # Extract metadata file_size = file_data.get("size") if isinstance(file_size, str) and file_size.isdigit(): file_size = int(file_size) elif file_size is None and file_data.get("content"): file_size = len(file_data.get("content", "")) # Determine if content is already extracted content = file_data.get("content", "No content available") file_name = file_data.get("name", "unnamed_file") content_type = file_data.get("content_type") is_extracted = file_data.get("is_extracted", False) # Create standard document structure that follows the data model document = { "id": f"doc_{uuid.uuid4()}", # Unique document ID separate from file ID "source": { "type": "file", "id": doc_id, "name": file_name, "content_type": content_type, "size": file_size, "upload_date": file_data.get("upload_date", datetime.now().isoformat()) }, "contents": [ { "type": "text", "text": content, "is_extracted": is_extracted, "extraction_context": file_data.get("extraction_context", None) } ] } # Check if file is already in the message file_already_added = any( doc.get("source", {}).get("id") == doc_id for doc in message.get("documents", []) ) if not file_already_added: message["documents"].append(document) logger.info(f"File {file_name} added to message (total: {len(message.get('documents', []))} files)") else: logger.info(f"File {file_name} already exists in message, skipping") return message async def analyze_file(self, file_id: int, prompt: str, lucydom_interface, ai_service) -> Dict[str, Any]: """ Analyze a file using the appropriate method based on file type. Args: file_id: ID of the file to analyze prompt: Analysis prompt lucydom_interface: Interface for database access ai_service: Service for AI requests Returns: Analysis result """ if not lucydom_interface: raise ValueError("LucyDOM interface not available") if not ai_service: raise ValueError("AI service not available") try: # Get file metadata file = lucydom_interface.get_file(file_id) if not file: raise ValueError(f"File with ID {file_id} not found") # Get file content file_content = await lucydom_interface.read_file_content(file_id) if not file_content: raise ValueError(f"Content for file {file_id} not found") # Extract metadata file_name = file.get("name", "unnamed") content_type = file.get("content_type") file_type = file.get("type") # Process based on file type if file_type == "image" or (content_type and content_type.startswith("image/")): # Image analysis if hasattr(ai_service, "analyze_image"): analysis = await ai_service.analyze_image( image_data=file_content, prompt=prompt, mime_type=content_type ) return { "file_id": file_id, "file_name": file_name, "analysis_type": "image", "result": analysis } else: raise ValueError("AI service does not support image analysis") elif file_name.endswith(".pdf"): # PDF analysis - first extract text, then analyze try: # Extract text text_content, is_extracted = extract_text_from_file_content( file_content, file_name, content_type ) if not is_extracted: raise ValueError(f"Failed to extract text from PDF {file_name}") # Analyze text with AI pdf_analysis_prompt = f""" Analyze the following PDF content based on this request: REQUEST: {prompt} PDF CONTENT: {text_content} # In a future release to split into tokensets, if too big file """ analysis = await ai_service.call_api([{"role": "user", "content": pdf_analysis_prompt}]) # Also check for images in the PDF has_images = False image_analysis = None try: # Extract and analyze images image_results = await self.extract_and_analyze_pdf_images( file_content, f"Analyze images with respect to: {prompt}", ai_service ) if image_results and len(image_results) > 0: has_images = True image_analysis = "\n\nPDF IMAGES ANALYSIS:\n" for img in image_results: image_analysis += f"- Image on page {img.get('page')}: {img.get('response')}\n" except Exception as img_err: logger.warning(f"Could not analyze images in PDF {file_name}: {str(img_err)}") # Combine text and image analysis if available if has_images and image_analysis: analysis += image_analysis return { "file_id": file_id, "file_name": file_name, "analysis_type": "pdf", "result": analysis, "has_images": has_images } except Exception as pdf_err: logger.error(f"Error analyzing PDF {file_name}: {str(pdf_err)}") raise elif file_name.endswith(('.xlsx', '.xls', '.csv')): # Tabular data analysis try: # Extract text content text_content, is_extracted = extract_text_from_file_content( file_content, file_name, content_type ) if not is_extracted: raise ValueError(f"Failed to extract data from {file_name}") # Analyze with AI data_analysis_prompt = f""" Analyze the following tabular data based on this request: REQUEST: {prompt} DATA CONTENT: {text_content} # In a future release to split into tokensets to limit storage Provide a structured analysis including: 1. Data overview 2. Key insights 3. Patterns and trends 4. Answers to the specific request """ analysis = await ai_service.call_api([{"role": "user", "content": data_analysis_prompt}]) return { "file_id": file_id, "file_name": file_name, "analysis_type": "tabular_data", "result": analysis } except Exception as data_err: logger.error(f"Error analyzing tabular data {file_name}: {str(data_err)}") raise else: # Default to text analysis for all other file types try: # Extract text content text_content, is_extracted = extract_text_from_file_content( file_content, file_name, content_type ) if not is_extracted: raise ValueError(f"Failed to extract text from {file_name}") # Analyze with AI text_analysis_prompt = f""" Analyze the following document content based on this request: REQUEST: {prompt} DOCUMENT CONTENT: {text_content} # In a future release to split into tokensets """ analysis = await ai_service.call_api([{"role": "user", "content": text_analysis_prompt}]) return { "file_id": file_id, "file_name": file_name, "analysis_type": "text", "result": analysis } except Exception as text_err: logger.error(f"Error analyzing text content {file_name}: {str(text_err)}") raise except Exception as e: logger.error(f"Error analyzing file {file_id}: {str(e)}") raise async def extract_and_analyze_pdf_images(self, pdf_content: bytes, prompt: str, ai_service ) -> List[Dict[str, Any]]: """ Extract images from a PDF file and analyze them. Works with binary data instead of file paths. Args: pdf_content: Binary data of the PDF file prompt: Prompt for image analysis ai_service: AI service for image analysis Returns: List with analysis results for each image """ image_responses = [] temp_files = [] # List of temporary files for cleanup try: # Import required libraries try: import fitz # PyMuPDF from io import BytesIO import tempfile logger.info(f"Starting PDF image extraction with PyMuPDF") except ImportError: logger.error("PyMuPDF (fitz) is not installed. Install it with 'pip install pymupdf'") return [] # Open PDF in memory try: doc = fitz.open(stream=pdf_content, filetype="pdf") page_count = len(doc) logger.info(f"PDF opened with {page_count} pages") except Exception as pdf_err: logger.error(f"Error opening PDF: {str(pdf_err)}") return [] # Process each page with multiple extraction methods for page_num, page in enumerate(doc, 1): logger.info(f"Processing page {page_num}/{page_count}") # Method 1: Standard extraction using get_images try: image_list = page.get_images(full=True) if image_list: logger.info(f"Method 1: Found {len(image_list)} images on page {page_num}") for img_index, img in enumerate(image_list): try: xref = img[0] # Get image reference # Extract image data base_image = doc.extract_image(xref) image_bytes = base_image["image"] image_ext = base_image["ext"] # Check for valid image data if not image_bytes or len(image_bytes) < 100: logger.warning(f"Empty or very small image data for image {img_index+1} on page {page_num}") continue # Analyze image analysis_result = await ai_service.analyze_image( image_data=image_bytes, prompt=prompt, mime_type=f"image/{image_ext}" ) # Store image size image_size = f"{base_image.get('width', 0)}x{base_image.get('height', 0)}" # Add result image_responses.append({ "page": page_num, "image_index": img_index, "format": image_ext, "image_size": image_size, "method": "get_images", "response": analysis_result }) logger.info(f"Successfully analyzed image {img_index+1} on page {page_num} using method 1") except Exception as e: logger.warning(f"Error processing image {img_index} on page {page_num} (Method 1): {str(e)}") else: logger.info(f"Method 1: No images found on page {page_num} using get_images") except Exception as m1_err: logger.warning(f"Error in Method 1 for page {page_num}: {str(m1_err)}") # Method 2: Extract embedded images using page.get_drawings() try: drawings = page.get_drawings() drawing_images = 0 for drawing_index, drawing in enumerate(drawings): try: # Check if drawing contains an image if "image" in str(drawing).lower(): drawing_images += 1 rect = drawing["rect"] # Get rectangle of the drawing # Extract the area as an image pix = page.get_pixmap(matrix=fitz.Matrix(2, 2), clip=rect) img_bytes = pix.tobytes("png") # Analyze the image analysis_result = await ai_service.analyze_image( image_data=img_bytes, prompt=f"{prompt} (Page {page_num}, Drawing {drawing_index+1})", mime_type="image/png" ) # Add result image_responses.append({ "page": page_num, "image_index": drawing_index, "format": "png", "image_size": f"{pix.width}x{pix.height}", "method": "get_drawings", "response": analysis_result }) logger.info(f"Successfully analyzed drawing image {drawing_index+1} on page {page_num} using method 2") except Exception as drawing_err: logger.warning(f"Error processing drawing {drawing_index} on page {page_num}: {str(drawing_err)}") if drawing_images > 0: logger.info(f"Method 2: Processed {drawing_images} images from drawings on page {page_num}") else: logger.info(f"Method 2: No images found in drawings on page {page_num}") except Exception as m2_err: logger.warning(f"Error in Method 2 for page {page_num}: {str(m2_err)}") # Method 3: Extract using blocks detection try: blocks = page.get_text("dict")["blocks"] img_blocks = [b for b in blocks if b.get("type") == 1] # type 1 = image if img_blocks: logger.info(f"Method 3: Found {len(img_blocks)} image blocks on page {page_num}") for block_index, block in enumerate(img_blocks): try: # Extract using pixmap for the block region rect = block["bbox"] pix = page.get_pixmap(matrix=fitz.Matrix(2, 2), clip=rect) img_bytes = pix.tobytes("png") # Analyze image analysis_result = await ai_service.analyze_image( image_data=img_bytes, prompt=f"{prompt} (Page {page_num}, Block {block_index+1})", mime_type="image/png" ) # Add result image_responses.append({ "page": page_num, "image_index": block_index, "format": "png", "image_size": f"{pix.width}x{pix.height}", "method": "block_extraction", "response": analysis_result }) logger.info(f"Successfully analyzed image block {block_index+1} on page {page_num} using method 3") except Exception as block_err: logger.warning(f"Error processing block {block_index} on page {page_num}: {str(block_err)}") else: logger.info(f"Method 3: No image blocks found on page {page_num}") except Exception as m3_err: logger.warning(f"Error in Method 3 for page {page_num}: {str(m3_err)}") # Method 4: Last resort - render the entire page as an image and analyze if not image_responses or not any(resp.get("page") == page_num for resp in image_responses): try: logger.info(f"Method 4: Rendering entire page {page_num} as image") # Render the entire page as an image pix = page.get_pixmap(matrix=fitz.Matrix(2, 2)) img_bytes = pix.tobytes("png") # Analyze the page as an image analysis_result = await ai_service.analyze_image( image_data=img_bytes, prompt=f"{prompt} (Full page {page_num})", mime_type="image/png" ) # Add result image_responses.append({ "page": page_num, "image_index": 0, "format": "png", "image_size": f"{pix.width}x{pix.height}", "method": "full_page_render", "response": analysis_result }) logger.info(f"Successfully analyzed full page {page_num} as image using method 4") except Exception as m4_err: logger.warning(f"Error in Method 4 for page {page_num}: {str(m4_err)}") # Close the document doc.close() # Deduplicate results (different methods might extract the same image) deduplicated_responses = [] seen_areas = set() for response in image_responses: # Create a unique identifier for the image area area_key = f"{response['page']}_{response['image_size']}" if area_key not in seen_areas: seen_areas.add(area_key) deduplicated_responses.append(response) logger.info(f"PDF image extraction complete: Found {len(image_responses)} images, deduplicated to {len(deduplicated_responses)}") return deduplicated_responses except ImportError as imp_err: logger.error(f"Required library not available for PDF image extraction: {str(imp_err)}") return [] except Exception as e: logger.error(f"Error extracting images from PDF: {str(e)}") return [] finally: # Clean up temporary files for temp_file in temp_files: try: if os.path.exists(temp_file): os.remove(temp_file) except Exception as e: logger.warning(f"Could not remove temporary file: {temp_file} - {str(e)}") async def analyze_multiple_files( self, file_ids: List[int], prompt: str, lucydom_interface, ai_service ) -> Dict[str, Any]: """ Analyze multiple files and synthesize a combined result. Args: file_ids: List of file IDs to analyze prompt: Analysis prompt lucydom_interface: Interface for database access ai_service: Service for AI requests Returns: Combined analysis result """ results = [] # Analyze each file for file_id in file_ids: try: analysis = await self.analyze_file(file_id, prompt, lucydom_interface, ai_service) results.append(analysis) except Exception as e: logger.error(f"Error analyzing file {file_id}: {str(e)}") results.append({ "file_id": file_id, "error": str(e), "analysis_type": "error" }) # Now synthesize a combined analysis if results: try: # Prepare prompt for synthesis synthesis_prompt = f""" Synthesize a combined analysis based on these individual file analyses: ORIGINAL REQUEST: {prompt} INDIVIDUAL ANALYSES: """ for i, result in enumerate(results, 1): file_name = result.get("file_name", f"File {i}") analysis_type = result.get("analysis_type", "unknown") analysis_result = result.get("result", "No analysis available") synthesis_prompt += f""" ## {file_name} ({analysis_type}) {analysis_result} --- """ synthesis_prompt += """ Please provide a comprehensive synthesis that: 1. Combines insights from all files 2. Addresses the original request 3. Highlights connections between different files 4. Provides a unified conclusion """ # Call AI for synthesis synthesis = await ai_service.call_api([{"role": "user", "content": synthesis_prompt}]) return { "synthesis": synthesis, "individual_results": results, "files_analyzed": len(results) } except Exception as e: logger.error(f"Error synthesizing combined analysis: {str(e)}") return { "error": str(e), "individual_results": results, "files_analyzed": len(results) } else: return { "synthesis": "No files were successfully analyzed.", "individual_results": [], "files_analyzed": 0 } def determine_file_type(self, file_name: str, content_type: str = None) -> str: """ Determine the file type based on name and content type. Args: file_name: Name of the file content_type: MIME type (optional) Returns: File type string ('document', 'image', etc.) """ # Check content type first if content_type: if content_type.startswith('image/'): return "image" elif content_type in ['application/pdf']: return "document" elif content_type in ['application/vnd.ms-excel', 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet', 'text/csv']: return "spreadsheet" # Check file extension lower_name = file_name.lower() # Images if lower_name.endswith(('.jpg', '.jpeg', '.png', '.gif', '.bmp', '.webp', '.svg')): return "image" # Documents if lower_name.endswith(('.pdf', '.doc', '.docx', '.txt', '.md', '.rtf')): return "document" # Spreadsheets if lower_name.endswith(('.xlsx', '.xls', '.csv')): return "spreadsheet" # Presentations if lower_name.endswith(('.pptx', '.ppt')): return "presentation" # Data files if lower_name.endswith(('.json', '.xml', '.yaml', '.yml')): return "data" # Default to document return "document" def get_mime_type(self, file_name: str) -> str: """Get MIME type based on file name.""" # Import from lucydom_interface from lucydom_interface import LucyDOMInterface temp_interface = LucyDOMInterface(0, 0) # Default values return temp_interface.get_mime_type(file_name) def prepare_file_contexts(self, files: List[Dict[str, Any]]) -> List[Dict[str, Any]]: """ Bereitet die Dateikontexte basierend auf Metadaten vor. Akzeptiert keine Pfade mehr, sondern nur Metadaten aus der Datenbank. Args: files: Liste von Dateien mit Metadaten (Dict mit id, name, type, content_type) Returns: Liste von Dateikontexten für die Verarbeitung """ file_contexts = [] logger.info(f"Preparing file contexts for {len(files)} files") for file in files: file_id = file.get("id") file_name = file.get("name") file_type = file.get("type") # Create a comprehensive context with all available metadata context = { "id": file_id, "name": file_name, "type": file_type, "size": file.get("size", "Unbekannt"), "content_type": file.get("content_type"), "path": file.get("path"), "upload_date": file.get("upload_date"), "hash": file.get("hash"), "mandate_id": file.get("mandate_id"), "user_id": file.get("user_id") } # Log for debugging logger.info(f"Created file context: {file_name} (ID: {file_id}, Type: {file_type})") file_contexts.append(context) return file_contexts def create_document_reference(self, message: Dict[str, Any], file_id: int, reference_type: str = "reference") -> Dict[str, Any]: """ Create a document reference without loading content. Args: message: The message to add the reference to file_id: ID of the file to reference reference_type: Type of reference (reference, citation, etc.) Returns: Updated message with the document reference """ if not self.lucydom_interface: logger.warning("LucyDOM interface not available for document reference") return message # Get file metadata file = self.lucydom_interface.get_file(file_id) if not file: logger.warning(f"File with ID {file_id} not found for reference") return message # Create document structure with just the reference document = { "id": f"ref_{uuid.uuid4()}", "source": { "type": "file", "id": str(file_id), "name": file.get("name", "referenced_file"), "content_type": file.get("content_type"), "size": file.get("size"), "reference_type": reference_type }, "contents": [] # Empty contents - will be loaded on demand } # Add to message updated_message = message.copy() if "documents" not in updated_message: updated_message["documents"] = [] updated_message["documents"].append(document) logger.info(f"Added document reference for file {file.get('name')} (ID: {file_id})") return updated_message def should_extract_document(self, document: Dict[str, Any], context_prompt: str = None) -> bool: """ Determine if a document needs content extraction. Args: document: The document object context_prompt: Current context prompt Returns: True if extraction is needed, False otherwise """ # If document has no contents, extraction is needed if not document.get("contents"): return True # If document has contents but extraction status is False, extraction may be needed for content in document.get("contents", []): if content.get("type") == "text": # If already extracted, check if context has changed if content.get("is_extracted", False): # If context prompt is different from what was used previously, # we may need to re-extract with the new context prev_context = content.get("extraction_context") if context_prompt and prev_context != context_prompt: return True return False return True # Default to needing extraction return True # Factory method @staticmethod def get_instance(): """Get the singleton instance of FileManager.""" if FileManager._instance is None: FileManager._instance = FileManager() return FileManager._instance # Create a singleton instance for module-level access file_manager = FileManager.get_instance() def get_file_manager(): """Get the singleton instance of FileManager.""" return file_manager class WorkflowFileManager: """ Specialized file manager for workflow operations. Handles workflow-specific file operations and document management. """ def __init__(self, workflow_id: str = None, lucydom_interface = None): """ Initialize the workflow file manager. Args: workflow_id: Optional workflow ID for context lucydom_interface: LucyDOM interface for database operations """ self.workflow_id = workflow_id self.lucydom_interface = lucydom_interface self.file_manager = get_file_manager() self.document_handler = None def set_workflow_id(self, workflow_id: str): """Set or update the workflow ID.""" self.workflow_id = workflow_id def set_lucydom_interface(self, lucydom_interface): """Set or update the LucyDOM interface.""" self.lucydom_interface = lucydom_interface async def add_files_to_message(self, message: Dict[str, Any], file_ids: List[int], add_log_func = None) -> Dict[str, Any]: """ Add multiple files to a message. Args: message: The message to add files to file_ids: List of file IDs to add add_log_func: Optional logging function Returns: Updated message """ # If document handler is available, use it if self.document_handler: return await self.document_handler.add_files_to_message( message, file_ids, extraction_prompt=None # Default to no extraction ) if not self.lucydom_interface: _log(add_log_func, self.workflow_id, "LucyDOM interface not available", "error") return message updated_message = message.copy() # Get file metadata files = [] for file_id in file_ids: file = self.lucydom_interface.get_file(file_id) if file: files.append(file) else: _log(add_log_func, self.workflow_id, f"File not found: {file_id}", "warning") # Prepare file contexts file_contexts = self.file_manager.prepare_file_contexts(files) # Read file contents file_contents = await self.file_manager.read_file_contents( file_contexts, self.lucydom_interface, self.workflow_id, add_log_func ) # Add files to message for file_id, content_data in file_contents.items(): # Add file to message updated_message = FileManager.add_file_to_message(updated_message, content_data) return updated_message def get_files_from_message(self, message: Dict[str, Any]) -> List[Dict[str, Any]]: """ Extract file references from a message. Args: message: The message to extract files from Returns: List of file metadata """ files = [] # Process documents for doc in message.get("documents", []): source = doc.get("source", {}) # Only include file documents if source.get("type") == "file": file_info = { "id": source.get("id", ""), "name": source.get("name", ""), "type": source.get("content_type", ""), "content_type": source.get("content_type", ""), "size": source.get("size", 0) } files.append(file_info) return files def get_document_text_content(self, message: Dict[str, Any]) -> str: """ Extract text content from all documents in a message. Args: message: The message to extract content from Returns: Combined text content """ content = "" # Process all documents for doc in message.get("documents", []): for doc_content in doc.get("contents", []): if doc_content.get("type") == "text": content += "\n\n" + doc_content.get("text", "") return content async def extract_document_info(self, workflow: Dict[str, Any], message_id: str = None) -> Dict[str, Any]: """ Extract document information from a workflow or specific message. Args: workflow: The workflow object message_id: Optional message ID to focus on a specific message Returns: Document information """ result = { "documents": [], "file_count": 0, "extracted_text": "" } if message_id: # Process only the specified message for message in workflow.get("messages", []): if message.get("id") == message_id: files = self.get_files_from_message(message) result["documents"].extend(files) result["file_count"] = len(files) result["extracted_text"] = self.get_document_text_content(message) break else: # Process all messages for message in workflow.get("messages", []): files = self.get_files_from_message(message) result["documents"].extend(files) result["extracted_text"] += self.get_document_text_content(message) # De-duplicate files unique_files = {} for file in result["documents"]: file_id = file.get("id") if file_id and file_id not in unique_files: unique_files[file_id] = file result["documents"] = list(unique_files.values()) result["file_count"] = len(result["documents"]) return result async def analyze_workflow_documents(self, workflow: Dict[str, Any], prompt: str, ai_service, message_id: str = None) -> Dict[str, Any]: """ Analyze documents in a workflow. Args: workflow: The workflow object prompt: Analysis prompt ai_service: Service for AI analysis message_id: Optional message ID to focus on specific message Returns: Analysis result """ if not self.lucydom_interface: raise ValueError("LucyDOM interface not available") if not ai_service: raise ValueError("AI service not available") # Extract document info doc_info = await self.extract_document_info(workflow, message_id) if doc_info["file_count"] == 0: return { "result": "No documents found for analysis", "files_analyzed": 0 } # Get file IDs file_ids = [doc.get("id") for doc in doc_info["documents"] if doc.get("id")] # Analyze files analysis = await self.file_manager.analyze_multiple_files( file_ids, prompt, self.lucydom_interface, ai_service ) return analysis # Export the workflow file manager factory function def get_workflow_file_manager(workflow_id: str = None, lucydom_interface = None): """Get a workflow file manager instance.""" return WorkflowFileManager(workflow_id, lucydom_interface)