"""Enhanced document handling module for the Agentservice (continued)."""
import os
import logging
import uuid
from datetime import datetime
from typing import List, Dict, Any, Optional, Tuple, Union

logger = logging.getLogger(__name__)


class DocumentHandler:
    """Centralized document handler for consistent document management across the system.

    Documents are stored on messages under ``message["documents"]``; each document
    carries an ``id``, a ``source`` descriptor (file / generated / extracted) and a
    ``contents`` list of typed entries (currently text entries, optionally produced
    by AI extraction).
    """

    def __init__(self, workflow_id: Optional[str] = None, lucydom_interface=None,
                 ai_service=None):
        """Initialize the document handler.

        Args:
            workflow_id: Optional workflow this handler operates on.
            lucydom_interface: Interface used to look up file metadata and content.
            ai_service: Service exposing ``analyze_image`` for AI-based extraction.
        """
        self.workflow_id = workflow_id
        self.lucydom_interface = lucydom_interface
        self.ai_service = ai_service
        # Import necessary utilities (local import avoids import cycles at module load)
        from modules.agentservice_filemanager import get_file_manager
        self.file_manager = get_file_manager()

    def set_workflow_id(self, workflow_id: str):
        """Set or update the workflow ID."""
        self.workflow_id = workflow_id

    def set_lucydom_interface(self, lucydom_interface):
        """Set or update the LucyDOM interface."""
        self.lucydom_interface = lucydom_interface

    def set_ai_service(self, ai_service):
        """Set or update the AI service."""
        self.ai_service = ai_service

    def _append_extracted_text(self, document: Dict[str, Any], file_content,
                               file_name: str, content_type: Optional[str],
                               extraction_prompt: Optional[str]):
        """Extract text from raw file content and append it to the document contents."""
        from modules.agentservice_utils import extract_text_from_file_content
        text_content, is_extracted = extract_text_from_file_content(
            file_content, file_name, content_type
        )
        document["contents"].append({
            "type": "text",
            "text": text_content,
            "is_extracted": is_extracted,
            "extraction_context": extraction_prompt
        })

    async def _add_image_analysis(self, document: Dict[str, Any], file_content,
                                  file_name: str, content_type: Optional[str],
                                  extraction_prompt: Optional[str]):
        """Analyze a standalone image file (if an AI service is available) and store the result."""
        if self.ai_service and hasattr(self.ai_service, "analyze_image"):
            try:
                # Use provided prompt or default one
                image_prompt = extraction_prompt or "Describe this image in detail"
                logger.info(f"Analyzing image {file_name} with prompt: {image_prompt}")
                image_analysis = await self.ai_service.analyze_image(
                    image_data=file_content,
                    prompt=image_prompt,
                    mime_type=content_type
                )
                # Add the analysis as text content
                document["contents"].append({
                    "type": "text",
                    "text": f"Image Analysis:\n{image_analysis}",
                    "is_extracted": True,
                    "extraction_context": extraction_prompt
                })
                logger.info(f"Added image analysis for {file_name} to message")
            except Exception as e:
                logger.error(f"Error analyzing image {file_name}: {str(e)}")
                document["contents"].append({
                    "type": "text",
                    "text": f"Image file: {file_name} (Analysis failed: {str(e)})",
                    "is_extracted": False
                })
        else:
            # Just add placeholder if no analysis available
            document["contents"].append({
                "type": "text",
                "text": f"Image file: {file_name} (no analysis requested)",
                "is_extracted": False
            })

    async def _add_pdf_contents(self, message: Dict[str, Any], document: Dict[str, Any],
                                file_id: int, file_content, file_name: str,
                                content_type: Optional[str],
                                extraction_prompt: Optional[str]):
        """Extract text and embedded images from a PDF and attach them to message/document."""
        logger.info(f"Processing PDF file: {file_name}")
        # Extract text content first
        self._append_extracted_text(document, file_content, file_name, content_type,
                                    extraction_prompt)
        logger.info(f"Extracted text content from PDF {file_name}")
        # Extract and analyze images from PDF if we have AI service
        if not (self.ai_service and hasattr(self.ai_service, "analyze_image")):
            return
        try:
            import fitz  # PyMuPDF
            from io import BytesIO
            logger.info(f"Starting PDF image extraction for {file_name}")
            # Check if extraction prompt is available or use default
            image_prompt = extraction_prompt or "Describe this image from the PDF document"
            # Open PDF from memory stream with detailed error checking
            try:
                pdf_document = fitz.open(stream=file_content, filetype="pdf")
                logger.info(f"Successfully opened PDF with {len(pdf_document)} pages")
            except Exception as pdf_open_error:
                logger.error(f"Failed to open PDF: {str(pdf_open_error)}")
                raise
            images_analysis = []
            image_count = 0
            # Process each page
            for page_num, page in enumerate(pdf_document, 1):
                image_list = page.get_images(full=True)
                if image_list:
                    logger.info(f"Found {len(image_list)} images on page {page_num}")
                    for img_index, img in enumerate(image_list):
                        try:
                            xref = img[0]  # Get image reference
                            base_image = pdf_document.extract_image(xref)
                            image_bytes = base_image["image"]
                            image_ext = base_image["ext"]
                            image_analysis = await self.ai_service.analyze_image(
                                image_data=image_bytes,
                                prompt=f"{image_prompt} (Page {page_num}, Image {img_index+1})",
                                mime_type=f"image/{image_ext}"
                            )
                            images_analysis.append({
                                "page": page_num,
                                "index": img_index + 1,
                                "analysis": image_analysis
                            })
                            image_count += 1
                            logger.info(f"Analyzed image {img_index+1} on page {page_num}")
                            # Create a separate document for each extracted image
                            # (was wrapped in a dead `if True:` — always executed)
                            img_doc_id = f"img_doc_{uuid.uuid4()}"
                            image_filename = f"page{page_num}_image{img_index+1}.{image_ext}"
                            image_document = {
                                "id": img_doc_id,
                                "source": {
                                    "type": "extracted",
                                    "parent_id": str(file_id),
                                    "id": img_doc_id,
                                    "name": image_filename,
                                    "content_type": f"image/{image_ext}",
                                    "size": len(image_bytes)
                                },
                                "contents": [{
                                    "type": "text",
                                    "text": f"Image Analysis (PDF Page {page_num}, Image {img_index+1}):\n{image_analysis}",
                                    "is_extracted": True,
                                    "extraction_context": image_prompt
                                }]
                            }
                            message["documents"].append(image_document)
                            logger.info(f"Added extracted image document {image_filename} to message")
                        except Exception as img_err:
                            logger.warning(f"Error processing image {img_index} on page {page_num}: {str(img_err)}")
            pdf_document.close()
            # Add combined image analysis to the main document
            if images_analysis:
                combined_analysis = "\n\n## Embedded Images Analysis\n\n"
                for img in images_analysis:
                    combined_analysis += f"### Page {img['page']}, Image {img['index']}\n{img['analysis']}\n\n"
                document["contents"].append({
                    "type": "text",
                    "text": combined_analysis,
                    "is_extracted": True,
                    "extraction_context": f"Analysis of {image_count} images embedded in the PDF"
                })
                logger.info(f"Added combined analysis of {image_count} PDF images to document")
        except ImportError:
            logger.warning("PyMuPDF (fitz) is not installed, skipping PDF image extraction")
            document["contents"].append({
                "type": "text",
                "text": "\n\nNote: PDF may contain images that were not extracted due to missing libraries.",
                "is_extracted": False
            })
        except Exception as e:
            logger.error(f"Error extracting images from PDF {file_name}: {str(e)}")
            document["contents"].append({
                "type": "text",
                "text": f"\n\nError extracting images from PDF: {str(e)}",
                "is_extracted": False
            })

    async def _add_word_contents(self, message: Dict[str, Any], document: Dict[str, Any],
                                 file_id: int, file_content, file_name: str,
                                 content_type: Optional[str],
                                 extraction_prompt: Optional[str]):
        """Extract text and embedded images from a Word document (.docx images only)."""
        logger.info(f"Processing Word document: {file_name}")
        # Extract text content first
        self._append_extracted_text(document, file_content, file_name, content_type,
                                    extraction_prompt)
        logger.info(f"Extracted text content from Word document {file_name}")
        # Attempt to extract and analyze images from Word document
        if not (self.ai_service and hasattr(self.ai_service, "analyze_image")):
            return
        try:
            # For .docx documents: a .docx is a zip archive; images live in word/media/
            if file_name.lower().endswith('.docx'):
                import zipfile
                from io import BytesIO
                # Check if extraction prompt is available or use default
                image_prompt = extraction_prompt or "Describe this image from the Word document"
                docx_zip = zipfile.ZipFile(BytesIO(file_content))
                image_files = [f for f in docx_zip.namelist() if f.startswith('word/media/')]
                if image_files:
                    logger.info(f"Found {len(image_files)} images in Word document {file_name}")
                    images_analysis = []
                    for i, img_path in enumerate(image_files):
                        try:
                            image_bytes = docx_zip.read(img_path)
                            # Determine image type from filename
                            image_ext = img_path.split('.')[-1] if '.' in img_path else 'png'
                            image_analysis = await self.ai_service.analyze_image(
                                image_data=image_bytes,
                                prompt=f"{image_prompt} (Image {i+1})",
                                mime_type=f"image/{image_ext}"
                            )
                            images_analysis.append({
                                "index": i + 1,
                                "path": img_path,
                                "analysis": image_analysis
                            })
                            logger.info(f"Analyzed image {i+1} ({img_path}) from Word document")
                            # Create a separate document for each extracted image
                            img_doc_id = f"img_doc_{uuid.uuid4()}"
                            image_filename = f"word_image{i+1}.{image_ext}"
                            image_document = {
                                "id": img_doc_id,
                                "source": {
                                    "type": "extracted",
                                    "parent_id": str(file_id),
                                    "id": img_doc_id,
                                    "name": image_filename,
                                    "content_type": f"image/{image_ext}",
                                    "size": len(image_bytes)
                                },
                                "contents": [{
                                    "type": "text",
                                    "text": f"Image Analysis (Word Document Image {i+1}):\n{image_analysis}",
                                    "is_extracted": True,
                                    "extraction_context": image_prompt
                                }]
                            }
                            message["documents"].append(image_document)
                            logger.info(f"Added extracted image document {image_filename} to message")
                        except Exception as img_err:
                            logger.warning(f"Error processing image {img_path}: {str(img_err)}")
                    # Add combined image analysis to the main document
                    if images_analysis:
                        combined_analysis = "\n\n## Embedded Images Analysis\n\n"
                        for img in images_analysis:
                            combined_analysis += f"### Image {img['index']}\n{img['analysis']}\n\n"
                        document["contents"].append({
                            "type": "text",
                            "text": combined_analysis,
                            "is_extracted": True,
                            "extraction_context": f"Analysis of {len(images_analysis)} images embedded in the Word document"
                        })
                        logger.info(f"Added combined analysis of {len(images_analysis)} Word document images")
                docx_zip.close()
            # Note: For .doc (older format) we would need additional libraries
            # This could be implemented with libraries like antiword or pywin32
            elif file_name.lower().endswith('.doc'):
                logger.warning("Image extraction from .doc files is not supported yet")
                document["contents"].append({
                    "type": "text",
                    "text": "\n\nNote: This is an older .doc format document. Images may be present but could not be extracted.",
                    "is_extracted": False
                })
        except Exception as e:
            logger.error(f"Error extracting images from Word document {file_name}: {str(e)}")
            document["contents"].append({
                "type": "text",
                "text": f"\n\nError extracting images from Word document: {str(e)}",
                "is_extracted": False
            })

    async def _add_excel_contents(self, message: Dict[str, Any], document: Dict[str, Any],
                                  file_id: int, file_content, file_name: str,
                                  content_type: Optional[str],
                                  extraction_prompt: Optional[str]):
        """Extract tabular text and embedded charts/images from an Excel workbook (.xlsx only)."""
        logger.info(f"Processing Excel document: {file_name}")
        # Extract text representation of spreadsheet data
        self._append_extracted_text(document, file_content, file_name, content_type,
                                    extraction_prompt)
        logger.info(f"Extracted data from Excel document {file_name}")
        # Try to extract charts and images if available
        if not (self.ai_service and hasattr(self.ai_service, "analyze_image")):
            return
        try:
            # For .xlsx files (newer format): also a zip archive
            if file_name.lower().endswith('.xlsx'):
                import zipfile
                from io import BytesIO
                xlsx_zip = zipfile.ZipFile(BytesIO(file_content))
                # Charts and images can be in various directories
                media_paths = ['xl/media/', 'xl/drawings/', 'xl/charts/']
                media_files = []
                for path in media_paths:
                    media_files.extend([f for f in xlsx_zip.namelist() if f.startswith(path)])
                if media_files:
                    logger.info(f"Found {len(media_files)} media files in Excel document {file_name}")
                    # Process image files (skip XML and other non-image files)
                    image_extensions = ['png', 'jpeg', 'jpg', 'gif', 'bmp', 'tiff', 'emf', 'wmf']
                    image_files = [f for f in media_files
                                   if f.split('.')[-1].lower() in image_extensions]
                    if image_files:
                        logger.info(f"Found {len(image_files)} images/charts in Excel document {file_name}")
                        image_prompt = extraction_prompt or "Describe this chart/image from the Excel document"
                        images_analysis = []
                        for i, img_path in enumerate(image_files):
                            try:
                                image_bytes = xlsx_zip.read(img_path)
                                # Determine image type from filename
                                image_ext = img_path.split('.')[-1] if '.' in img_path else 'png'
                                image_analysis = await self.ai_service.analyze_image(
                                    image_data=image_bytes,
                                    prompt=f"{image_prompt} (Describe what this chart or image shows, including any data trends or patterns visible)",
                                    mime_type=f"image/{image_ext}"
                                )
                                images_analysis.append({
                                    "index": i + 1,
                                    "path": img_path,
                                    "analysis": image_analysis
                                })
                                logger.info(f"Analyzed image/chart {i+1} from Excel document")
                                # Create a separate document for each extracted image
                                img_doc_id = f"img_doc_{uuid.uuid4()}"
                                image_filename = f"excel_image{i+1}.{image_ext}"
                                image_document = {
                                    "id": img_doc_id,
                                    "source": {
                                        "type": "extracted",
                                        "parent_id": str(file_id),
                                        "id": img_doc_id,
                                        "name": image_filename,
                                        "content_type": f"image/{image_ext}",
                                        "size": len(image_bytes)
                                    },
                                    "contents": [{
                                        "type": "text",
                                        "text": f"Chart/Image Analysis (Excel Document Item {i+1}):\n{image_analysis}",
                                        "is_extracted": True,
                                        "extraction_context": image_prompt
                                    }]
                                }
                                message["documents"].append(image_document)
                            except Exception as img_err:
                                logger.warning(f"Error processing image {img_path}: {str(img_err)}")
                        # Add combined image analysis to the main document
                        if images_analysis:
                            combined_analysis = "\n\n## Embedded Charts and Images Analysis\n\n"
                            for img in images_analysis:
                                combined_analysis += f"### Chart/Image {img['index']}\n{img['analysis']}\n\n"
                            document["contents"].append({
                                "type": "text",
                                "text": combined_analysis,
                                "is_extracted": True,
                                "extraction_context": f"Analysis of {len(images_analysis)} charts/images from the Excel document"
                            })
                xlsx_zip.close()
        except Exception as e:
            logger.error(f"Error extracting charts/images from Excel document {file_name}: {str(e)}")

    def _add_plain_text_content(self, document: Dict[str, Any], file_content,
                                file_name: str, content_type: Optional[str],
                                extraction_prompt: Optional[str]):
        """Fallback for any other file type: extract text content only."""
        from modules.agentservice_utils import extract_text_from_file_content
        content, is_extracted = extract_text_from_file_content(
            file_content, file_name, content_type
        )
        document["contents"].append({
            "type": "text",
            "text": content,
            "is_extracted": is_extracted,
            "extraction_context": extraction_prompt
        })
        logger.info(f"Added text content for {file_name} to message (extracted: {is_extracted})")

    async def add_file_to_message(self, message: Dict[str, Any], file_id: int,
                                  extraction_prompt: Optional[str] = None) -> Dict[str, Any]:
        """Add a file to a message with contextual extraction.

        Args:
            message: The message to add the file to.
            file_id: ID of the file to add.
            extraction_prompt: Optional prompt for contextual extraction (e.g., for images).

        Returns:
            Updated message with the file added (the message is mutated in place;
            on any failure the message is returned unchanged).
        """
        if not self.lucydom_interface:
            logger.error("LucyDOM interface not available")
            return message
        try:
            # Get file metadata
            file = self.lucydom_interface.get_file(file_id)
            if not file:
                logger.warning(f"File with ID {file_id} not found")
                return message
            file_name = file.get("name", "unnamed_file")
            file_type = file.get("type", "unknown")
            content_type = file.get("content_type")
            # Initialize documents array if needed
            if "documents" not in message:
                message["documents"] = []
            # Skip if this file was already attached to the message
            file_already_added = any(
                doc.get("source", {}).get("id") == str(file_id)
                for doc in message.get("documents", [])
            )
            if file_already_added:
                logger.info(f"File {file_name} already exists in message, skipping")
                return message
            # Create document skeleton with a unique ID
            doc_id = f"doc_{uuid.uuid4()}"
            document = {
                "id": doc_id,
                "source": {
                    "type": "file",
                    "id": str(file_id),
                    "name": file_name,
                    "content_type": content_type,
                    "size": file.get("size"),
                    "upload_date": file.get("upload_date", datetime.now().isoformat())
                },
                "contents": []
            }
            # Only read content if we have an extraction prompt or specific types
            if (extraction_prompt or file_type in ["document", "text"]
                    or (content_type and content_type.startswith("text/"))):
                file_content = await self.lucydom_interface.read_file_content(file_id)
                if file_content:
                    # Dispatch on file type; each helper fills document["contents"]
                    # (and may append per-image documents directly to the message)
                    if file_type == "image" or (content_type and content_type.startswith("image/")):
                        await self._add_image_analysis(document, file_content, file_name,
                                                       content_type, extraction_prompt)
                    elif file_name.lower().endswith('.pdf'):
                        await self._add_pdf_contents(message, document, file_id, file_content,
                                                     file_name, content_type, extraction_prompt)
                    elif file_name.lower().endswith(('.docx', '.doc')):
                        await self._add_word_contents(message, document, file_id, file_content,
                                                      file_name, content_type, extraction_prompt)
                    elif file_name.lower().endswith(('.xlsx', '.xls')):
                        await self._add_excel_contents(message, document, file_id, file_content,
                                                       file_name, content_type, extraction_prompt)
                    else:
                        self._add_plain_text_content(document, file_content, file_name,
                                                     content_type, extraction_prompt)
                else:
                    # No content available
                    document["contents"].append({
                        "type": "text",
                        "text": f"File content not available for {file_name}",
                        "is_extracted": False
                    })
            else:
                # Just add reference without content
                document["contents"].append({
                    "type": "text",
                    "text": f"File: {file_name} (content not loaded)",
                    "is_extracted": False
                })
            # Add document to message
            message["documents"].append(document)
            logger.info(f"File {file_name} successfully added to message")
            return message
        except Exception as e:
            logger.error(f"Error adding file {file_id} to message: {str(e)}")
            return message

    def _replace_first_text_content(self, updated_message: Dict[str, Any], doc_index: int,
                                    new_content: Dict[str, Any]):
        """Replace the first text entry of a document's contents, or append if none exists."""
        contents = updated_message["documents"][doc_index].get("contents", [])
        for j, content_item in enumerate(contents):
            if content_item.get("type") == "text":
                updated_message["documents"][doc_index]["contents"][j] = new_content
                return
        if not updated_message["documents"][doc_index].get("contents"):
            updated_message["documents"][doc_index]["contents"] = []
        updated_message["documents"][doc_index]["contents"].append(new_content)

    async def extract_document_content(self, doc_id: str, message: Dict[str, Any],
                                       extraction_prompt: str) -> Dict[str, Any]:
        """Extract or update document content with contextual extraction.

        Args:
            doc_id: ID of the document to extract.
            message: Message containing the document.
            extraction_prompt: Contextual prompt for extraction.

        Returns:
            Updated message with extracted content.
        """
        if not message or "documents" not in message:
            return message
        updated_message = message.copy()
        # Find the document
        for i, document in enumerate(updated_message.get("documents", [])):
            if document.get("id") == doc_id:
                source = document.get("source", {})
                file_id = source.get("id")
                # Only file-backed documents can be re-extracted: generated/extracted
                # documents carry non-numeric ids and int(file_id) would raise
                if source.get("type") == "file" and file_id and self.lucydom_interface:
                    # Get file metadata
                    file = self.lucydom_interface.get_file(int(file_id))
                    if not file:
                        continue
                    # Get file content
                    file_content = await self.lucydom_interface.read_file_content(int(file_id))
                    if not file_content:
                        continue
                    file_name = file.get("name", "unnamed_file")
                    file_type = file.get("type", "unknown")
                    content_type = file.get("content_type")
                    # Update content based on file type
                    if file_type == "image" or (content_type and content_type.startswith("image/")):
                        if self.ai_service and hasattr(self.ai_service, "analyze_image"):
                            try:
                                image_analysis = await self.ai_service.analyze_image(
                                    image_data=file_content,
                                    prompt=extraction_prompt,
                                    mime_type=content_type
                                )
                                new_content = {
                                    "type": "text",
                                    "text": f"Image Analysis:\n{image_analysis}",
                                    "is_extracted": True,
                                    "extraction_context": extraction_prompt
                                }
                                self._replace_first_text_content(updated_message, i, new_content)
                                logger.info(f"Updated image analysis for {file_name} with new context: {extraction_prompt}")
                            except Exception as e:
                                logger.error(f"Error updating image analysis for {file_name}: {str(e)}")
                    else:
                        # For other file types, extract text with new context
                        from modules.agentservice_utils import extract_text_from_file_content
                        content, is_extracted = extract_text_from_file_content(
                            file_content, file_name, content_type
                        )
                        new_content = {
                            "type": "text",
                            "text": content,
                            "is_extracted": is_extracted,
                            "extraction_context": extraction_prompt
                        }
                        self._replace_first_text_content(updated_message, i, new_content)
                        logger.info(f"Updated text extraction for {file_name} with new context: {extraction_prompt}")
                # Found and processed the document, stop searching
                break
        return updated_message

    async def extract_files_from_workflow(self, workflow: Dict[str, Any],
                                          extraction_prompt: str,
                                          file_filter: Optional[str] = None) -> Dict[str, Any]:
        """Extract all relevant files from a workflow with context-aware extraction.

        Args:
            workflow: The workflow object.
            extraction_prompt: Contextual prompt for extraction.
            file_filter: Optional filter for file types (e.g., "csv", "image") —
                matched as a substring of file name or content type.

        Returns:
            Dictionary with extracted content.
        """
        # Import for data extraction (local import avoids import cycles)
        from modules.agentservice_dataextraction import data_extraction
        # Collect unique file-backed documents from all messages
        files: List[Dict[str, Any]] = []
        for message in workflow.get("messages", []):
            for doc in message.get("documents", []):
                source = doc.get("source", {})
                if source.get("type") != "file":
                    continue
                file_info = {
                    "id": source.get("id", ""),
                    "name": source.get("name", ""),
                    "type": source.get("type", ""),
                    "content_type": source.get("content_type", ""),
                    "size": source.get("size", 0)
                }
                # Apply filter if provided (substring match on name or content type)
                if file_filter:
                    file_name = file_info.get("name", "").lower()
                    content_type = file_info.get("content_type", "").lower()
                    if not (file_filter.lower() in file_name
                            or file_filter.lower() in content_type):
                        continue
                # Deduplicate by file id
                if not any(f.get("id") == file_info["id"] for f in files):
                    files.append(file_info)
        # If no files found, return empty result
        if not files:
            return {
                "prompt": extraction_prompt,
                "files_processed": 0,
                "extracted_content": []
            }
        workflow_messages = workflow.get("messages", [])
        # Extract data using the dataextraction module
        extracted_data = await data_extraction(
            prompt=extraction_prompt,
            files=files,
            messages=workflow_messages,
            ai_service=self.ai_service,
            lucydom_interface=self.lucydom_interface,
            workflow_id=self.workflow_id,
            add_log_func=None  # We don't have access to add_log_func here
        )
        return extracted_data

    def get_file_content_from_message(self, message: Dict[str, Any],
                                      file_id: Optional[int] = None,
                                      doc_id: Optional[str] = None) -> str:
        """Get file content from a message.

        Args:
            message: The message containing the document.
            file_id: Optional file ID to search for.
            doc_id: Optional document ID to search for.

        Returns:
            Text content of the first matching document's first text entry,
            or "" if nothing matches.
        """
        if not message or "documents" not in message:
            return ""
        for document in message.get("documents", []):
            # Match by document ID or file ID
            source = document.get("source", {})
            source_file_id = source.get("id")
            if ((doc_id and document.get("id") == doc_id)
                    or (file_id and source_file_id and str(file_id) == str(source_file_id))):
                for content in document.get("contents", []):
                    if content.get("type") == "text":
                        return content.get("text", "")
        return ""

    def create_text_document(self, message: Dict[str, Any], content: str,
                             title: str = "Generated Text") -> Dict[str, Any]:
        """Create a new text document in a message.

        Args:
            message: The message to add the document to.
            content: Text content.
            title: Document title.

        Returns:
            Updated message (shallow copy) with the new document appended.
        """
        updated_message = message.copy()
        if "documents" not in updated_message:
            updated_message["documents"] = []
        doc_id = f"doc_{uuid.uuid4()}"
        document = {
            "id": doc_id,
            "source": {
                "type": "generated",
                "id": doc_id,
                "name": title,
                "content_type": "text/plain",
                "size": len(content)
            },
            "contents": [
                {
                    "type": "text",
                    "text": content,
                    "is_extracted": True
                }
            ]
        }
        updated_message["documents"].append(document)
        logger.info(f"Created text document '{title}' in message")
        return updated_message

    def merge_document_contents(self, message: Dict[str, Any]) -> str:
        """Merge all document contents from a message into a single text.

        Args:
            message: The message containing documents.

        Returns:
            Combined text content from all documents, each prefixed with a
            "--- <name> ---" separator.
        """
        if not message or "documents" not in message:
            return ""
        combined_text = ""
        for document in message.get("documents", []):
            source = document.get("source", {})
            doc_name = source.get("name", "Unnamed Document")
            # Use only the first text entry of each document
            doc_text = ""
            for content in document.get("contents", []):
                if content.get("type") == "text":
                    doc_text = content.get("text", "")
                    break
            if doc_text:
                combined_text += f"\n\n--- {doc_name} ---\n\n{doc_text}"
        return combined_text.strip()


# Factory function
def get_document_handler(workflow_id: Optional[str] = None, lucydom_interface=None,
                         ai_service=None) -> DocumentHandler:
    """Get a document handler instance."""
    return DocumentHandler(workflow_id, lucydom_interface, ai_service)