"""Enhanced document handling module for the Agentservice (continued)."""
import os
import logging
import uuid
from datetime import datetime
from typing import List, Dict, Any, Optional, Tuple, Union

logger = logging.getLogger(__name__)


class DocumentHandler:
    """Centralized document handler for consistent document management across the system.

    Documents are stored on messages under ``message["documents"]``; each document
    carries an ``id``, a ``source`` descriptor (file / generated / extracted) and a
    ``contents`` list of typed entries (currently text entries, optionally produced
    by AI extraction).
    """

    def __init__(self, workflow_id: Optional[str] = None, lucydom_interface=None,
                 ai_service=None):
        """Initialize the document handler.

        Args:
            workflow_id: Optional workflow this handler operates on.
            lucydom_interface: Interface used to look up file metadata and content.
            ai_service: Service exposing ``analyze_image`` for AI-based extraction.
        """
        self.workflow_id = workflow_id
        self.lucydom_interface = lucydom_interface
        self.ai_service = ai_service
        # Import necessary utilities (local import avoids import cycles at module load)
        from modules.agentservice_filemanager import get_file_manager
        self.file_manager = get_file_manager()

    def set_workflow_id(self, workflow_id: str):
        """Set or update the workflow ID."""
        self.workflow_id = workflow_id

    def set_lucydom_interface(self, lucydom_interface):
        """Set or update the LucyDOM interface."""
        self.lucydom_interface = lucydom_interface

    def set_ai_service(self, ai_service):
        """Set or update the AI service."""
        self.ai_service = ai_service

    def _append_extracted_text(self, document: Dict[str, Any], file_content,
                               file_name: str, content_type: Optional[str],
                               extraction_prompt: Optional[str]):
        """Extract text from raw file content and append it to the document contents."""
        from modules.agentservice_utils import extract_text_from_file_content
        text_content, is_extracted = extract_text_from_file_content(
            file_content, file_name, content_type
        )
        document["contents"].append({
            "type": "text",
            "text": text_content,
            "is_extracted": is_extracted,
            "extraction_context": extraction_prompt
        })

    async def _add_image_analysis(self, document: Dict[str, Any], file_content,
                                  file_name: str, content_type: Optional[str],
                                  extraction_prompt: Optional[str]):
        """Analyze a standalone image file (if an AI service is available) and store the result."""
        if self.ai_service and hasattr(self.ai_service, "analyze_image"):
            try:
                # Use provided prompt or default one
                image_prompt = extraction_prompt or "Describe this image in detail"
                logger.info(f"Analyzing image {file_name} with prompt: {image_prompt}")
                image_analysis = await self.ai_service.analyze_image(
                    image_data=file_content,
                    prompt=image_prompt,
                    mime_type=content_type
                )
                # Add the analysis as text content
                document["contents"].append({
                    "type": "text",
                    "text": f"Image Analysis:\n{image_analysis}",
                    "is_extracted": True,
                    "extraction_context": extraction_prompt
                })
                logger.info(f"Added image analysis for {file_name} to message")
            except Exception as e:
                logger.error(f"Error analyzing image {file_name}: {str(e)}")
                document["contents"].append({
                    "type": "text",
                    "text": f"Image file: {file_name} (Analysis failed: {str(e)})",
                    "is_extracted": False
                })
        else:
            # Just add placeholder if no analysis available
            document["contents"].append({
                "type": "text",
                "text": f"Image file: {file_name} (no analysis requested)",
                "is_extracted": False
            })

    async def _add_pdf_contents(self, message: Dict[str, Any], document: Dict[str, Any],
                                file_id: int, file_content, file_name: str,
                                content_type: Optional[str],
                                extraction_prompt: Optional[str]):
        """Extract text and embedded images from a PDF and attach them to message/document."""
        logger.info(f"Processing PDF file: {file_name}")
        # Extract text content first
        self._append_extracted_text(document, file_content, file_name, content_type,
                                    extraction_prompt)
        logger.info(f"Extracted text content from PDF {file_name}")
        # Extract and analyze images from PDF if we have AI service
        if not (self.ai_service and hasattr(self.ai_service, "analyze_image")):
            return
        try:
            import fitz  # PyMuPDF
            from io import BytesIO
            logger.info(f"Starting PDF image extraction for {file_name}")
            # Check if extraction prompt is available or use default
            image_prompt = extraction_prompt or "Describe this image from the PDF document"
            # Open PDF from memory stream with detailed error checking
            try:
                pdf_document = fitz.open(stream=file_content, filetype="pdf")
                logger.info(f"Successfully opened PDF with {len(pdf_document)} pages")
            except Exception as pdf_open_error:
                logger.error(f"Failed to open PDF: {str(pdf_open_error)}")
                raise
            images_analysis = []
            image_count = 0
            # Process each page
            for page_num, page in enumerate(pdf_document, 1):
                image_list = page.get_images(full=True)
                if image_list:
                    logger.info(f"Found {len(image_list)} images on page {page_num}")
                    for img_index, img in enumerate(image_list):
                        try:
                            xref = img[0]  # Get image reference
                            base_image = pdf_document.extract_image(xref)
                            image_bytes = base_image["image"]
                            image_ext = base_image["ext"]
                            image_analysis = await self.ai_service.analyze_image(
                                image_data=image_bytes,
                                prompt=f"{image_prompt} (Page {page_num}, Image {img_index+1})",
                                mime_type=f"image/{image_ext}"
                            )
                            images_analysis.append({
                                "page": page_num,
                                "index": img_index + 1,
                                "analysis": image_analysis
                            })
                            image_count += 1
                            logger.info(f"Analyzed image {img_index+1} on page {page_num}")
                            # Create a separate document for each extracted image
                            # (was wrapped in a dead `if True:` — always executed)
                            img_doc_id = f"img_doc_{uuid.uuid4()}"
                            image_filename = f"page{page_num}_image{img_index+1}.{image_ext}"
                            image_document = {
                                "id": img_doc_id,
                                "source": {
                                    "type": "extracted",
                                    "parent_id": str(file_id),
                                    "id": img_doc_id,
                                    "name": image_filename,
                                    "content_type": f"image/{image_ext}",
                                    "size": len(image_bytes)
                                },
                                "contents": [{
                                    "type": "text",
                                    "text": f"Image Analysis (PDF Page {page_num}, Image {img_index+1}):\n{image_analysis}",
                                    "is_extracted": True,
                                    "extraction_context": image_prompt
                                }]
                            }
                            message["documents"].append(image_document)
                            logger.info(f"Added extracted image document {image_filename} to message")
                        except Exception as img_err:
                            logger.warning(f"Error processing image {img_index} on page {page_num}: {str(img_err)}")
            pdf_document.close()
            # Add combined image analysis to the main document
            if images_analysis:
                combined_analysis = "\n\n## Embedded Images Analysis\n\n"
                for img in images_analysis:
                    combined_analysis += f"### Page {img['page']}, Image {img['index']}\n{img['analysis']}\n\n"
                document["contents"].append({
                    "type": "text",
                    "text": combined_analysis,
                    "is_extracted": True,
                    "extraction_context": f"Analysis of {image_count} images embedded in the PDF"
                })
                logger.info(f"Added combined analysis of {image_count} PDF images to document")
        except ImportError:
            logger.warning("PyMuPDF (fitz) is not installed, skipping PDF image extraction")
            document["contents"].append({
                "type": "text",
                "text": "\n\nNote: PDF may contain images that were not extracted due to missing libraries.",
                "is_extracted": False
            })
        except Exception as e:
            logger.error(f"Error extracting images from PDF {file_name}: {str(e)}")
            document["contents"].append({
                "type": "text",
                "text": f"\n\nError extracting images from PDF: {str(e)}",
                "is_extracted": False
            })

    async def _add_word_contents(self, message: Dict[str, Any], document: Dict[str, Any],
                                 file_id: int, file_content, file_name: str,
                                 content_type: Optional[str],
                                 extraction_prompt: Optional[str]):
        """Extract text and embedded images from a Word document (.docx images only)."""
        logger.info(f"Processing Word document: {file_name}")
        # Extract text content first
        self._append_extracted_text(document, file_content, file_name, content_type,
                                    extraction_prompt)
        logger.info(f"Extracted text content from Word document {file_name}")
        # Attempt to extract and analyze images from Word document
        if not (self.ai_service and hasattr(self.ai_service, "analyze_image")):
            return
        try:
            # For .docx documents: a .docx is a zip archive; images live in word/media/
            if file_name.lower().endswith('.docx'):
                import zipfile
                from io import BytesIO
                # Check if extraction prompt is available or use default
                image_prompt = extraction_prompt or "Describe this image from the Word document"
                docx_zip = zipfile.ZipFile(BytesIO(file_content))
                image_files = [f for f in docx_zip.namelist() if f.startswith('word/media/')]
                if image_files:
                    logger.info(f"Found {len(image_files)} images in Word document {file_name}")
                    images_analysis = []
                    for i, img_path in enumerate(image_files):
                        try:
                            image_bytes = docx_zip.read(img_path)
                            # Determine image type from filename
                            image_ext = img_path.split('.')[-1] if '.' in img_path else 'png'
                            image_analysis = await self.ai_service.analyze_image(
                                image_data=image_bytes,
                                prompt=f"{image_prompt} (Image {i+1})",
                                mime_type=f"image/{image_ext}"
                            )
                            images_analysis.append({
                                "index": i + 1,
                                "path": img_path,
                                "analysis": image_analysis
                            })
                            logger.info(f"Analyzed image {i+1} ({img_path}) from Word document")
                            # Create a separate document for each extracted image
                            img_doc_id = f"img_doc_{uuid.uuid4()}"
                            image_filename = f"word_image{i+1}.{image_ext}"
                            image_document = {
                                "id": img_doc_id,
                                "source": {
                                    "type": "extracted",
                                    "parent_id": str(file_id),
                                    "id": img_doc_id,
                                    "name": image_filename,
                                    "content_type": f"image/{image_ext}",
                                    "size": len(image_bytes)
                                },
                                "contents": [{
                                    "type": "text",
                                    "text": f"Image Analysis (Word Document Image {i+1}):\n{image_analysis}",
                                    "is_extracted": True,
                                    "extraction_context": image_prompt
                                }]
                            }
                            message["documents"].append(image_document)
                            logger.info(f"Added extracted image document {image_filename} to message")
                        except Exception as img_err:
                            logger.warning(f"Error processing image {img_path}: {str(img_err)}")
                    # Add combined image analysis to the main document
                    if images_analysis:
                        combined_analysis = "\n\n## Embedded Images Analysis\n\n"
                        for img in images_analysis:
                            combined_analysis += f"### Image {img['index']}\n{img['analysis']}\n\n"
                        document["contents"].append({
                            "type": "text",
                            "text": combined_analysis,
                            "is_extracted": True,
                            "extraction_context": f"Analysis of {len(images_analysis)} images embedded in the Word document"
                        })
                        logger.info(f"Added combined analysis of {len(images_analysis)} Word document images")
                docx_zip.close()
            # Note: For .doc (older format) we would need additional libraries
            # This could be implemented with libraries like antiword or pywin32
            elif file_name.lower().endswith('.doc'):
                logger.warning("Image extraction from .doc files is not supported yet")
                document["contents"].append({
                    "type": "text",
                    "text": "\n\nNote: This is an older .doc format document. Images may be present but could not be extracted.",
                    "is_extracted": False
                })
        except Exception as e:
            logger.error(f"Error extracting images from Word document {file_name}: {str(e)}")
            document["contents"].append({
                "type": "text",
                "text": f"\n\nError extracting images from Word document: {str(e)}",
                "is_extracted": False
            })

    async def _add_excel_contents(self, message: Dict[str, Any], document: Dict[str, Any],
                                  file_id: int, file_content, file_name: str,
                                  content_type: Optional[str],
                                  extraction_prompt: Optional[str]):
        """Extract tabular text and embedded charts/images from an Excel workbook (.xlsx only)."""
        logger.info(f"Processing Excel document: {file_name}")
        # Extract text representation of spreadsheet data
        self._append_extracted_text(document, file_content, file_name, content_type,
                                    extraction_prompt)
        logger.info(f"Extracted data from Excel document {file_name}")
        # Try to extract charts and images if available
        if not (self.ai_service and hasattr(self.ai_service, "analyze_image")):
            return
        try:
            # For .xlsx files (newer format): also a zip archive
            if file_name.lower().endswith('.xlsx'):
                import zipfile
                from io import BytesIO
                xlsx_zip = zipfile.ZipFile(BytesIO(file_content))
                # Charts and images can be in various directories
                media_paths = ['xl/media/', 'xl/drawings/', 'xl/charts/']
                media_files = []
                for path in media_paths:
                    media_files.extend([f for f in xlsx_zip.namelist() if f.startswith(path)])
                if media_files:
                    logger.info(f"Found {len(media_files)} media files in Excel document {file_name}")
                    # Process image files (skip XML and other non-image files)
                    image_extensions = ['png', 'jpeg', 'jpg', 'gif', 'bmp', 'tiff', 'emf', 'wmf']
                    image_files = [f for f in media_files
                                   if f.split('.')[-1].lower() in image_extensions]
                    if image_files:
                        logger.info(f"Found {len(image_files)} images/charts in Excel document {file_name}")
                        image_prompt = extraction_prompt or "Describe this chart/image from the Excel document"
                        images_analysis = []
                        for i, img_path in enumerate(image_files):
                            try:
                                image_bytes = xlsx_zip.read(img_path)
                                # Determine image type from filename
                                image_ext = img_path.split('.')[-1] if '.' in img_path else 'png'
                                image_analysis = await self.ai_service.analyze_image(
                                    image_data=image_bytes,
                                    prompt=f"{image_prompt} (Describe what this chart or image shows, including any data trends or patterns visible)",
                                    mime_type=f"image/{image_ext}"
                                )
                                images_analysis.append({
                                    "index": i + 1,
                                    "path": img_path,
                                    "analysis": image_analysis
                                })
                                logger.info(f"Analyzed image/chart {i+1} from Excel document")
                                # Create a separate document for each extracted image
                                img_doc_id = f"img_doc_{uuid.uuid4()}"
                                image_filename = f"excel_image{i+1}.{image_ext}"
                                image_document = {
                                    "id": img_doc_id,
                                    "source": {
                                        "type": "extracted",
                                        "parent_id": str(file_id),
                                        "id": img_doc_id,
                                        "name": image_filename,
                                        "content_type": f"image/{image_ext}",
                                        "size": len(image_bytes)
                                    },
                                    "contents": [{
                                        "type": "text",
                                        "text": f"Chart/Image Analysis (Excel Document Item {i+1}):\n{image_analysis}",
                                        "is_extracted": True,
                                        "extraction_context": image_prompt
                                    }]
                                }
                                message["documents"].append(image_document)
                            except Exception as img_err:
                                logger.warning(f"Error processing image {img_path}: {str(img_err)}")
                        # Add combined image analysis to the main document
                        if images_analysis:
                            combined_analysis = "\n\n## Embedded Charts and Images Analysis\n\n"
                            for img in images_analysis:
                                combined_analysis += f"### Chart/Image {img['index']}\n{img['analysis']}\n\n"
                            document["contents"].append({
                                "type": "text",
                                "text": combined_analysis,
                                "is_extracted": True,
                                "extraction_context": f"Analysis of {len(images_analysis)} charts/images from the Excel document"
                            })
                xlsx_zip.close()
        except Exception as e:
            logger.error(f"Error extracting charts/images from Excel document {file_name}: {str(e)}")

    def _add_plain_text_content(self, document: Dict[str, Any], file_content,
                                file_name: str, content_type: Optional[str],
                                extraction_prompt: Optional[str]):
        """Fallback for any other file type: extract text content only."""
        from modules.agentservice_utils import extract_text_from_file_content
        content, is_extracted = extract_text_from_file_content(
            file_content, file_name, content_type
        )
        document["contents"].append({
            "type": "text",
            "text": content,
            "is_extracted": is_extracted,
            "extraction_context": extraction_prompt
        })
        logger.info(f"Added text content for {file_name} to message (extracted: {is_extracted})")

    async def add_file_to_message(self, message: Dict[str, Any], file_id: int,
                                  extraction_prompt: Optional[str] = None) -> Dict[str, Any]:
        """Add a file to a message with contextual extraction.

        Args:
            message: The message to add the file to.
            file_id: ID of the file to add.
            extraction_prompt: Optional prompt for contextual extraction (e.g., for images).

        Returns:
            Updated message with the file added (the message is mutated in place;
            on any failure the message is returned unchanged).
        """
        if not self.lucydom_interface:
            logger.error("LucyDOM interface not available")
            return message
        try:
            # Get file metadata
            file = self.lucydom_interface.get_file(file_id)
            if not file:
                logger.warning(f"File with ID {file_id} not found")
                return message
            file_name = file.get("name", "unnamed_file")
            file_type = file.get("type", "unknown")
            content_type = file.get("content_type")
            # Initialize documents array if needed
            if "documents" not in message:
                message["documents"] = []
            # Skip if this file was already attached to the message
            file_already_added = any(
                doc.get("source", {}).get("id") == str(file_id)
                for doc in message.get("documents", [])
            )
            if file_already_added:
                logger.info(f"File {file_name} already exists in message, skipping")
                return message
            # Create document skeleton with a unique ID
            doc_id = f"doc_{uuid.uuid4()}"
            document = {
                "id": doc_id,
                "source": {
                    "type": "file",
                    "id": str(file_id),
                    "name": file_name,
                    "content_type": content_type,
                    "size": file.get("size"),
                    "upload_date": file.get("upload_date", datetime.now().isoformat())
                },
                "contents": []
            }
            # Only read content if we have an extraction prompt or specific types
            if (extraction_prompt or file_type in ["document", "text"]
                    or (content_type and content_type.startswith("text/"))):
                file_content = await self.lucydom_interface.read_file_content(file_id)
                if file_content:
                    # Dispatch on file type; each helper fills document["contents"]
                    # (and may append per-image documents directly to the message)
                    if file_type == "image" or (content_type and content_type.startswith("image/")):
                        await self._add_image_analysis(document, file_content, file_name,
                                                       content_type, extraction_prompt)
                    elif file_name.lower().endswith('.pdf'):
                        await self._add_pdf_contents(message, document, file_id, file_content,
                                                     file_name, content_type, extraction_prompt)
                    elif file_name.lower().endswith(('.docx', '.doc')):
                        await self._add_word_contents(message, document, file_id, file_content,
                                                      file_name, content_type, extraction_prompt)
                    elif file_name.lower().endswith(('.xlsx', '.xls')):
                        await self._add_excel_contents(message, document, file_id, file_content,
                                                       file_name, content_type, extraction_prompt)
                    else:
                        self._add_plain_text_content(document, file_content, file_name,
                                                     content_type, extraction_prompt)
                else:
                    # No content available
                    document["contents"].append({
                        "type": "text",
                        "text": f"File content not available for {file_name}",
                        "is_extracted": False
                    })
            else:
                # Just add reference without content
                document["contents"].append({
                    "type": "text",
                    "text": f"File: {file_name} (content not loaded)",
                    "is_extracted": False
                })
            # Add document to message
            message["documents"].append(document)
            logger.info(f"File {file_name} successfully added to message")
            return message
        except Exception as e:
            logger.error(f"Error adding file {file_id} to message: {str(e)}")
            return message

    def _replace_first_text_content(self, updated_message: Dict[str, Any], doc_index: int,
                                    new_content: Dict[str, Any]):
        """Replace the first text entry of a document's contents, or append if none exists."""
        contents = updated_message["documents"][doc_index].get("contents", [])
        for j, content_item in enumerate(contents):
            if content_item.get("type") == "text":
                updated_message["documents"][doc_index]["contents"][j] = new_content
                return
        if not updated_message["documents"][doc_index].get("contents"):
            updated_message["documents"][doc_index]["contents"] = []
        updated_message["documents"][doc_index]["contents"].append(new_content)

    async def extract_document_content(self, doc_id: str, message: Dict[str, Any],
                                       extraction_prompt: str) -> Dict[str, Any]:
        """Extract or update document content with contextual extraction.

        Args:
            doc_id: ID of the document to extract.
            message: Message containing the document.
            extraction_prompt: Contextual prompt for extraction.

        Returns:
            Updated message with extracted content.
        """
        if not message or "documents" not in message:
            return message
        updated_message = message.copy()
        # Find the document
        for i, document in enumerate(updated_message.get("documents", [])):
            if document.get("id") == doc_id:
                source = document.get("source", {})
                file_id = source.get("id")
                # Only file-backed documents can be re-extracted: generated/extracted
                # documents carry non-numeric ids and int(file_id) would raise
                if source.get("type") == "file" and file_id and self.lucydom_interface:
                    # Get file metadata
                    file = self.lucydom_interface.get_file(int(file_id))
                    if not file:
                        continue
                    # Get file content
                    file_content = await self.lucydom_interface.read_file_content(int(file_id))
                    if not file_content:
                        continue
                    file_name = file.get("name", "unnamed_file")
                    file_type = file.get("type", "unknown")
                    content_type = file.get("content_type")
                    # Update content based on file type
                    if file_type == "image" or (content_type and content_type.startswith("image/")):
                        if self.ai_service and hasattr(self.ai_service, "analyze_image"):
                            try:
                                image_analysis = await self.ai_service.analyze_image(
                                    image_data=file_content,
                                    prompt=extraction_prompt,
                                    mime_type=content_type
                                )
                                new_content = {
                                    "type": "text",
                                    "text": f"Image Analysis:\n{image_analysis}",
                                    "is_extracted": True,
                                    "extraction_context": extraction_prompt
                                }
                                self._replace_first_text_content(updated_message, i, new_content)
                                logger.info(f"Updated image analysis for {file_name} with new context: {extraction_prompt}")
                            except Exception as e:
                                logger.error(f"Error updating image analysis for {file_name}: {str(e)}")
                    else:
                        # For other file types, extract text with new context
                        from modules.agentservice_utils import extract_text_from_file_content
                        content, is_extracted = extract_text_from_file_content(
                            file_content, file_name, content_type
                        )
                        new_content = {
                            "type": "text",
                            "text": content,
                            "is_extracted": is_extracted,
                            "extraction_context": extraction_prompt
                        }
                        self._replace_first_text_content(updated_message, i, new_content)
                        logger.info(f"Updated text extraction for {file_name} with new context: {extraction_prompt}")
                # Found and processed the document, stop searching
                break
        return updated_message

    async def extract_files_from_workflow(self, workflow: Dict[str, Any],
                                          extraction_prompt: str,
                                          file_filter: Optional[str] = None) -> Dict[str, Any]:
        """Extract all relevant files from a workflow with context-aware extraction.

        Args:
            workflow: The workflow object.
            extraction_prompt: Contextual prompt for extraction.
            file_filter: Optional filter for file types (e.g., "csv", "image") —
                matched as a substring of file name or content type.

        Returns:
            Dictionary with extracted content.
        """
        # Import for data extraction (local import avoids import cycles)
        from modules.agentservice_dataextraction import data_extraction
        # Collect unique file-backed documents from all messages
        files: List[Dict[str, Any]] = []
        for message in workflow.get("messages", []):
            for doc in message.get("documents", []):
                source = doc.get("source", {})
                if source.get("type") != "file":
                    continue
                file_info = {
                    "id": source.get("id", ""),
                    "name": source.get("name", ""),
                    "type": source.get("type", ""),
                    "content_type": source.get("content_type", ""),
                    "size": source.get("size", 0)
                }
                # Apply filter if provided (substring match on name or content type)
                if file_filter:
                    file_name = file_info.get("name", "").lower()
                    content_type = file_info.get("content_type", "").lower()
                    if not (file_filter.lower() in file_name
                            or file_filter.lower() in content_type):
                        continue
                # Deduplicate by file id
                if not any(f.get("id") == file_info["id"] for f in files):
                    files.append(file_info)
        # If no files found, return empty result
        if not files:
            return {
                "prompt": extraction_prompt,
                "files_processed": 0,
                "extracted_content": []
            }
        workflow_messages = workflow.get("messages", [])
        # Extract data using the dataextraction module
        extracted_data = await data_extraction(
            prompt=extraction_prompt,
            files=files,
            messages=workflow_messages,
            ai_service=self.ai_service,
            lucydom_interface=self.lucydom_interface,
            workflow_id=self.workflow_id,
            add_log_func=None  # We don't have access to add_log_func here
        )
        return extracted_data

    def get_file_content_from_message(self, message: Dict[str, Any],
                                      file_id: Optional[int] = None,
                                      doc_id: Optional[str] = None) -> str:
        """Get file content from a message.

        Args:
            message: The message containing the document.
            file_id: Optional file ID to search for.
            doc_id: Optional document ID to search for.

        Returns:
            Text content of the first matching document's first text entry,
            or "" if nothing matches.
        """
        if not message or "documents" not in message:
            return ""
        for document in message.get("documents", []):
            # Match by document ID or file ID
            source = document.get("source", {})
            source_file_id = source.get("id")
            if ((doc_id and document.get("id") == doc_id)
                    or (file_id and source_file_id and str(file_id) == str(source_file_id))):
                for content in document.get("contents", []):
                    if content.get("type") == "text":
                        return content.get("text", "")
        return ""

    def create_text_document(self, message: Dict[str, Any], content: str,
                             title: str = "Generated Text") -> Dict[str, Any]:
        """Create a new text document in a message.

        Args:
            message: The message to add the document to.
            content: Text content.
            title: Document title.

        Returns:
            Updated message (shallow copy) with the new document appended.
        """
        updated_message = message.copy()
        if "documents" not in updated_message:
            updated_message["documents"] = []
        doc_id = f"doc_{uuid.uuid4()}"
        document = {
            "id": doc_id,
            "source": {
                "type": "generated",
                "id": doc_id,
                "name": title,
                "content_type": "text/plain",
                "size": len(content)
            },
            "contents": [
                {
                    "type": "text",
                    "text": content,
                    "is_extracted": True
                }
            ]
        }
        updated_message["documents"].append(document)
        logger.info(f"Created text document '{title}' in message")
        return updated_message

    def merge_document_contents(self, message: Dict[str, Any]) -> str:
        """Merge all document contents from a message into a single text.

        Args:
            message: The message containing documents.

        Returns:
            Combined text content from all documents, each prefixed with a
            "--- <name> ---" separator.
        """
        if not message or "documents" not in message:
            return ""
        combined_text = ""
        for document in message.get("documents", []):
            source = document.get("source", {})
            doc_name = source.get("name", "Unnamed Document")
            # Use only the first text entry of each document
            doc_text = ""
            for content in document.get("contents", []):
                if content.get("type") == "text":
                    doc_text = content.get("text", "")
                    break
            if doc_text:
                combined_text += f"\n\n--- {doc_name} ---\n\n{doc_text}"
        return combined_text.strip()


# Factory function
def get_document_handler(workflow_id: Optional[str] = None, lucydom_interface=None,
                         ai_service=None) -> DocumentHandler:
    """Get a document handler instance."""
    return DocumentHandler(workflow_id, lucydom_interface, ai_service)