"""
Central file management module for the Agentservice.
"""
|
|
|
|
import os
|
|
import logging
|
|
import base64
|
|
import json
|
|
import uuid
|
|
from datetime import datetime
|
|
from typing import List, Dict, Any, Optional, Tuple, Union, BinaryIO
|
|
from io import BytesIO
|
|
|
|
# Import utilities from agentservice_utils
|
|
from modules.agentservice_utils import extract_text_from_file_content, is_text_extractable
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
# Helper function for adding logs
|
|
def _log(add_log_func, workflow_id, message, level="info"):
|
|
"""Helper function for adding logs with standardized formatting."""
|
|
if add_log_func and workflow_id:
|
|
add_log_func(workflow_id, message, level)
|
|
|
|
# Also log to standard logger
|
|
if level == "info":
|
|
logger.info(message)
|
|
elif level == "warning":
|
|
logger.warning(message)
|
|
elif level == "error":
|
|
logger.error(message)
|
|
|
|
class FileExtractionError(Exception):
    """Raised when extracting content (text or images) from a file fails."""
|
|
|
|
class FileManager:
|
|
"""Central file management for the Agentservice."""
|
|
|
|
_instance = None
|
|
|
|
@classmethod
|
|
def get_instance(cls):
|
|
"""Get the singleton instance of FileManager."""
|
|
if cls._instance is None:
|
|
cls._instance = cls()
|
|
return cls._instance
|
|
|
|
def __init__(self):
|
|
"""Initialize the FileManager."""
|
|
# Ensure singleton pattern
|
|
if FileManager._instance is not None:
|
|
raise RuntimeError("Singleton instance already exists - use get_instance()")
|
|
|
|
# Import utilities
|
|
# Instead of storing file_utils, we'll use the imported functions directly
|
|
|
|
async def read_file_contents(self,
|
|
file_contexts: List[Dict[str, Any]],
|
|
lucydom_interface,
|
|
workflow_id: str = None,
|
|
add_log_func = None,
|
|
ai_service = None # AI service parameter for image analysis
|
|
) -> Dict[str, Dict[str, Any]]:
|
|
"""
|
|
Liest den Inhalt aller Dateien und führt bei Bildern und Dokumenten Analysen durch.
|
|
Verwendet LucyDOM-Interface statt direkter Dateizugriffe.
|
|
Gibt jetzt ein Dictionary mit Dateiinhalten und Extraktionsstatus zurück.
|
|
|
|
Args:
|
|
file_contexts: Liste der Dateikontexte mit Metadaten
|
|
lucydom_interface: LucyDOM-Interface für Dateizugriffe
|
|
workflow_id: Optionale ID des Workflows für Logging
|
|
add_log_func: Optionale Funktion für das Hinzufügen von Logs
|
|
ai_service: Optionaler AI-Service für die Bildanalyse
|
|
|
|
Returns:
|
|
Dictionary mit Dateiinhalten und Metadaten (file_id -> {content, is_extracted, ...})
|
|
"""
|
|
file_contents = {}
|
|
|
|
# Add debug logging
|
|
logger.info(f"Reading contents of {len(file_contexts)} files for workflow {workflow_id}")
|
|
|
|
for file in file_contexts:
|
|
file_id = file["id"]
|
|
file_name = file["name"]
|
|
file_type = file.get("type", "unknown")
|
|
content_type = file.get("content_type")
|
|
|
|
print("DEGUB5:",file_name,file_type)
|
|
|
|
try:
|
|
# Dateiinhalt über LucyDOM-Interface abrufen
|
|
file_data = await lucydom_interface.read_file_content(file_id)
|
|
|
|
if not file_data:
|
|
_log(add_log_func, workflow_id, f"Datei {file_name} nicht gefunden", "warning")
|
|
file_contents[file_id] = {
|
|
"content": f"File content not available (File not found)",
|
|
"is_extracted": False,
|
|
"name": file_name,
|
|
"type": file_type,
|
|
"content_type": content_type
|
|
}
|
|
continue
|
|
|
|
logger.info(f"Successfully read file: {file_name} (ID: {file_id}, Type: {file_type})")
|
|
|
|
# Bildverarbeitung - immer KI-Analyse verwenden, wenn verfügbar
|
|
if file_type == "image" or file_name.lower().endswith(('.jpg', '.jpeg', '.png', '.gif', '.webp')):
|
|
if ai_service and hasattr(ai_service, "analyze_image"):
|
|
try:
|
|
image_analysis = await ai_service.analyze_image(
|
|
image_data=file_data,
|
|
prompt="Describe this image in detail",
|
|
mime_type=content_type
|
|
)
|
|
|
|
logger.debug(f"Image analysis successfully generated for {file_name}")
|
|
|
|
file_contents[file_id] = {
|
|
"content": f"Image Analysis:\n{image_analysis}",
|
|
"is_extracted": False, # Bildanalyse gilt nicht als Text-Extraktion
|
|
"name": file_name,
|
|
"type": file_type,
|
|
"content_type": content_type
|
|
}
|
|
_log(add_log_func, workflow_id, f"Image {file_name} analyzed successfully", "info")
|
|
except Exception as e:
|
|
logger.error(f"Error analyzing image {file_name}: {str(e)}")
|
|
_log(add_log_func, workflow_id, f"Error analyzing image {file_name}: {str(e)}", "error")
|
|
file_contents[file_id] = {
|
|
"content": f"Image file: {file_name} (Analysis failed: {str(e)})",
|
|
"is_extracted": False,
|
|
"name": file_name,
|
|
"type": file_type,
|
|
"content_type": content_type
|
|
}
|
|
else:
|
|
file_contents[file_id] = {
|
|
"content": f"Image file: {file_name} (AI analysis not available)",
|
|
"is_extracted": False,
|
|
"name": file_name,
|
|
"type": file_type,
|
|
"content_type": content_type
|
|
}
|
|
|
|
# Dokument- und Textdateien
|
|
elif (file_type == "document" or not file_type or file_name.lower().endswith(('.csv', '.txt', '.json', '.xml')) or (content_type and content_type.startswith('text/'))):
|
|
# Verwende die zentrale Textextraktionsfunktion mit Dateiinhalt
|
|
content, is_extracted = extract_text_from_file_content(
|
|
file_data, file_name, content_type
|
|
)
|
|
file_contents[file_id] = {
|
|
"content": content,
|
|
"is_extracted": is_extracted,
|
|
"name": file_name,
|
|
"type": file_type,
|
|
"content_type": content_type
|
|
}
|
|
_log(add_log_func, workflow_id,
|
|
f"File {file_name} read successfully (extracted: {is_extracted})", "info")
|
|
|
|
# Andere Dateitypen - nur Metadaten speichern
|
|
else:
|
|
file_contents[file_id] = {
|
|
"content": f"File: {file_name} (Type: {file_type}, content not available)",
|
|
"is_extracted": False,
|
|
"name": file_name,
|
|
"type": file_type,
|
|
"content_type": content_type
|
|
}
|
|
_log(add_log_func, workflow_id, f"Unsupported file type: {file_type} for {file_name}", "warning")
|
|
|
|
except Exception as e:
|
|
logger.error(f"Error reading file {file_name}: {str(e)}")
|
|
_log(add_log_func, workflow_id, f"Error reading file {file_name}: {str(e)}", "error")
|
|
file_contents[file_id] = {
|
|
"content": f"File content not available (Error: {str(e)})",
|
|
"is_extracted": False,
|
|
"name": file_name,
|
|
"type": file_type,
|
|
"content_type": content_type
|
|
}
|
|
|
|
return file_contents
|
|
|
|
@staticmethod
|
|
def add_file_to_message(message: Dict[str, Any], file_data: Dict[str, Any]) -> Dict[str, Any]:
|
|
"""
|
|
Fügt eine Datei zu einer Nachricht hinzu mit Kennzeichnung, ob Text extrahiert wurde.
|
|
|
|
Args:
|
|
message: Die zu erweiternde Nachricht
|
|
file_data: Dateimetadaten und Inhalt
|
|
|
|
Returns:
|
|
Die aktualisierte Nachricht mit der Datei
|
|
"""
|
|
# Detailliertes Logging für Debugging
|
|
logger.info(f"Adding file to message: {file_data.get('name', 'unnamed_file')} (ID: {file_data.get('id', 'unknown')})")
|
|
|
|
# Initialize documents array if needed
|
|
if "documents" not in message:
|
|
message["documents"] = []
|
|
logger.debug("Initialized empty documents array in message")
|
|
|
|
# Create a unique ID for the document if not provided
|
|
doc_id = file_data.get("id", f"file_{uuid.uuid4()}")
|
|
|
|
# Extract file size if available
|
|
file_size = file_data.get("size")
|
|
if isinstance(file_size, str) and file_size.isdigit():
|
|
file_size = int(file_size)
|
|
elif file_size is None and file_data.get("content"):
|
|
# Estimate size from content if not provided
|
|
file_size = len(file_data.get("content", ""))
|
|
|
|
# Bestimmen, ob der Inhalt bereits extrahiert wurde
|
|
content = file_data.get("content", "No content available")
|
|
file_name = file_data.get("name", "unnamed_file")
|
|
content_type = file_data.get("content_type")
|
|
|
|
# Prüfen, ob der Inhalt als extrahiert markiert werden sollte
|
|
is_extracted = file_data.get("is_extracted", False)
|
|
if not is_extracted and isinstance(content, str) and content.strip() and file_name:
|
|
# Wenn nicht explizit markiert, aber Inhalt vorhanden ist, prüfen wir den Dateityp
|
|
is_extracted = is_text_extractable(file_name, content_type)
|
|
|
|
# Create standard document structure that matches the data model
|
|
document = {
|
|
"id": doc_id,
|
|
"source": {
|
|
"type": "file",
|
|
"id": file_data.get("id", doc_id),
|
|
"name": file_name,
|
|
"content_type": content_type,
|
|
"size": file_size,
|
|
"upload_date": file_data.get("upload_date", datetime.now().isoformat())
|
|
},
|
|
"contents": [
|
|
{
|
|
"type": "text",
|
|
"text": content,
|
|
"is_extracted": is_extracted # Flag für den Extraktionsstatus hinzufügen
|
|
}
|
|
]
|
|
}
|
|
|
|
# Log document structure for debugging
|
|
logger.debug(f"Created document structure: id={doc_id}, name={file_name}, is_extracted={is_extracted}")
|
|
|
|
# Check if file is already in the message to avoid duplicates
|
|
file_already_added = any(
|
|
doc.get("source", {}).get("id") == file_data.get("id")
|
|
for doc in message.get("documents", [])
|
|
)
|
|
|
|
if not file_already_added:
|
|
message["documents"].append(document)
|
|
logger.info(f"File {file_name} successfully added to message (total: {len(message.get('documents', []))} files)")
|
|
else:
|
|
logger.info(f"File {file_name} already exists in message, skipping")
|
|
|
|
return message
|
|
|
|
    async def analyze_file(self, file_id: int, prompt: str, lucydom_interface, ai_service) -> Dict[str, Any]:
        """
        Analyze a file using the appropriate method based on file type.

        Dispatch order: images go to the AI vision endpoint; PDFs get text
        extraction plus best-effort embedded-image analysis; spreadsheets/CSV
        and all remaining types get text extraction followed by an AI text
        analysis.

        Args:
            file_id: ID of the file to analyze
            prompt: Analysis prompt
            lucydom_interface: Interface for database access
            ai_service: Service for AI requests

        Returns:
            Analysis result dict with file_id, file_name, analysis_type, result
            (and has_images for PDFs)

        Raises:
            ValueError: When a dependency is missing, the file or its content
                cannot be found, or text extraction fails.
        """
        if not lucydom_interface:
            raise ValueError("LucyDOM interface not available")

        if not ai_service:
            raise ValueError("AI service not available")

        try:
            # Get file metadata
            file = lucydom_interface.get_file(file_id)
            if not file:
                raise ValueError(f"File with ID {file_id} not found")

            # Get file content
            file_content = await lucydom_interface.read_file_content(file_id)
            if not file_content:
                raise ValueError(f"Content for file {file_id} not found")

            # Extract metadata
            file_name = file.get("name", "unnamed")
            content_type = file.get("content_type")
            file_type = file.get("type")

            # Process based on file type
            if file_type == "image" or (content_type and content_type.startswith("image/")):
                # Image analysis
                if hasattr(ai_service, "analyze_image"):
                    analysis = await ai_service.analyze_image(
                        image_data=file_content,
                        prompt=prompt,
                        mime_type=content_type
                    )

                    return {
                        "file_id": file_id,
                        "file_name": file_name,
                        "analysis_type": "image",
                        "result": analysis
                    }
                else:
                    raise ValueError("AI service does not support image analysis")

            elif file_name.endswith(".pdf"):
                # PDF analysis - first extract text, then analyze
                try:
                    # Extract text
                    text_content, is_extracted = extract_text_from_file_content(
                        file_content, file_name, content_type
                    )

                    if not is_extracted:
                        raise ValueError(f"Failed to extract text from PDF {file_name}")

                    # Analyze text with AI
                    # NOTE(review): the "# Limit ..." text below is INSIDE the
                    # f-string literal, so it is sent to the model as part of
                    # the prompt — confirm whether that is intentional.
                    pdf_analysis_prompt = f"""
                    Analyze the following PDF content based on this request:

                    REQUEST: {prompt}

                    PDF CONTENT:
                    {text_content[:10000]} # Limit to first 10K chars to avoid token limits
                    """

                    analysis = await ai_service.call_api([{"role": "user", "content": pdf_analysis_prompt}])

                    # Also check for images in the PDF
                    has_images = False
                    image_analysis = None

                    try:
                        # Extract and analyze images
                        image_results = await self.extract_and_analyze_pdf_images(
                            file_content,
                            f"Analyze images with respect to: {prompt}",
                            ai_service
                        )

                        if image_results and len(image_results) > 0:
                            has_images = True
                            image_analysis = "\n\nPDF IMAGES ANALYSIS:\n"
                            for img in image_results:
                                image_analysis += f"- Image on page {img.get('page')}: {img.get('response')}\n"
                    except Exception as img_err:
                        # Image analysis is best-effort; the text analysis still counts
                        logger.warning(f"Could not analyze images in PDF {file_name}: {str(img_err)}")

                    # Combine text and image analysis if available
                    if has_images and image_analysis:
                        analysis += image_analysis

                    return {
                        "file_id": file_id,
                        "file_name": file_name,
                        "analysis_type": "pdf",
                        "result": analysis,
                        "has_images": has_images
                    }

                except Exception as pdf_err:
                    logger.error(f"Error analyzing PDF {file_name}: {str(pdf_err)}")
                    raise

            elif file_name.endswith(('.xlsx', '.xls', '.csv')):
                # Tabular data analysis
                try:
                    # Extract text content
                    text_content, is_extracted = extract_text_from_file_content(
                        file_content, file_name, content_type
                    )

                    if not is_extracted:
                        raise ValueError(f"Failed to extract data from {file_name}")

                    # Analyze with AI
                    # NOTE(review): as above, the "# Limit ..." text is part of the prompt.
                    data_analysis_prompt = f"""
                    Analyze the following tabular data based on this request:

                    REQUEST: {prompt}

                    DATA CONTENT:
                    {text_content[:10000]} # Limit to first 10K chars

                    Provide a structured analysis including:
                    1. Data overview
                    2. Key insights
                    3. Patterns and trends
                    4. Answers to the specific request
                    """

                    analysis = await ai_service.call_api([{"role": "user", "content": data_analysis_prompt}])

                    return {
                        "file_id": file_id,
                        "file_name": file_name,
                        "analysis_type": "tabular_data",
                        "result": analysis
                    }

                except Exception as data_err:
                    logger.error(f"Error analyzing tabular data {file_name}: {str(data_err)}")
                    raise

            else:
                # Default to text analysis for all other file types
                try:
                    # Extract text content
                    text_content, is_extracted = extract_text_from_file_content(
                        file_content, file_name, content_type
                    )

                    if not is_extracted:
                        raise ValueError(f"Failed to extract text from {file_name}")

                    # Analyze with AI
                    text_analysis_prompt = f"""
                    Analyze the following document content based on this request:

                    REQUEST: {prompt}

                    DOCUMENT CONTENT:
                    {text_content[:10000]} # Limit to first 10K chars
                    """

                    analysis = await ai_service.call_api([{"role": "user", "content": text_analysis_prompt}])

                    return {
                        "file_id": file_id,
                        "file_name": file_name,
                        "analysis_type": "text",
                        "result": analysis
                    }

                except Exception as text_err:
                    logger.error(f"Error analyzing text content {file_name}: {str(text_err)}")
                    raise

        except Exception as e:
            logger.error(f"Error analyzing file {file_id}: {str(e)}")
            raise
|
|
|
|
    async def extract_and_analyze_pdf_images(self,
                                             pdf_content: bytes,
                                             prompt: str,
                                             ai_service
                                             ) -> List[Dict[str, Any]]:
        """
        Extract embedded images from a PDF and run an AI analysis on each one.

        Works on the raw PDF bytes (no file paths). Requires PyMuPDF (fitz);
        per-image failures are recorded in the results instead of aborting the
        whole run.

        Args:
            pdf_content: Raw bytes of the PDF file
            prompt: Prompt used for the image analysis
            ai_service: AI service performing the image analysis

        Returns:
            List of analysis results, one entry per successfully extracted image

        Raises:
            FileExtractionError: When PyMuPDF is missing or the PDF cannot be
                processed.
        """
        image_responses = []
        temp_files = []  # temporary files registered here for cleanup in finally

        try:
            # Open the PDF with PyMuPDF (imported lazily: the module must load
            # even when this optional dependency is absent)
            import fitz  # PyMuPDF
            # BytesIO is already imported at the top level
            import tempfile

            # Open the PDF from the in-memory bytes
            doc = fitz.open(stream=pdf_content, filetype="pdf")
            logger.info(f"PDF geöffnet mit {len(doc)} Seiten")

            for page_num, page in enumerate(doc, 1):
                # Find all images referenced on this page
                image_list = page.get_images(full=True)

                if image_list:
                    logger.info(f"Seite {page_num}: {len(image_list)} Bilder gefunden")

                    for img_index, img in enumerate(image_list):
                        try:
                            # Image cross-reference number inside the PDF
                            xref = img[0]

                            # Extract the image bytes and metadata
                            base_image = doc.extract_image(xref)
                            image_bytes = base_image["image"]  # actual image data
                            image_ext = base_image["ext"]  # file extension (jpg, png, etc.)

                            # Write to a temporary file (only needed for the
                            # PIL size fallback further below)
                            fd, temp_img_path = tempfile.mkstemp(suffix=f".{image_ext}")
                            temp_files.append(temp_img_path)  # register for cleanup

                            with os.fdopen(fd, 'wb') as img_file:
                                img_file.write(image_bytes)

                            logger.debug(f"Bild temporär gespeichert: {temp_img_path}")

                            # Run the AI analysis on the raw image bytes
                            try:
                                analysis_result = await ai_service.analyze_image(
                                    image_data=image_bytes,  # pass the image data directly
                                    prompt=prompt,
                                    mime_type=f"image/{image_ext}"
                                )
                                logger.debug(f"Bildanalyse für Bild {img_index} auf Seite {page_num} abgeschlossen")
                            except Exception as analyze_error:
                                logger.error(f"Fehler bei der Bildanalyse: {str(analyze_error)}")
                                analysis_result = f"[Fehler bei der Bildanalyse: {str(analyze_error)}]"

                            # Determine the image dimensions for the result record
                            try:
                                # Prefer the size reported by PyMuPDF
                                if 'width' in base_image and 'height' in base_image:
                                    image_size = f"{base_image['width']}x{base_image['height']}"
                                else:
                                    # Fallback: open the temporary file with PIL
                                    from PIL import Image
                                    with Image.open(temp_img_path) as img:
                                        width, height = img.size
                                        image_size = f"{width}x{height}"
                            except Exception as e:
                                logger.warning(f"Konnte Bildgröße nicht ermitteln: {str(e)}")
                                image_size = "unbekannt"

                            # Record the result for this image
                            image_responses.append({
                                "page": page_num,
                                "image_index": img_index,
                                "format": image_ext,
                                "image_size": image_size,
                                "response": analysis_result
                            })

                        except Exception as e:
                            # Skip this image but keep processing the rest
                            logger.warning(f"Fehler bei der Extraktion von Bild {img_index} auf Seite {page_num}: {str(e)}")
                            continue

            logger.info(f"Extrahiert und analysiert: {len(image_responses)} Bilder aus PDF")

        except ImportError:
            logger.error("PyMuPDF (fitz) ist nicht installiert. Installiere es mit 'pip install pymupdf'")
            raise FileExtractionError("PyMuPDF (fitz) ist nicht installiert")
        except Exception as e:
            logger.error(f"Fehler beim Extrahieren von PDF-Bildern: {str(e)}")
            raise FileExtractionError(f"Fehler beim Extrahieren von PDF-Bildern: {str(e)}")
        finally:
            # Always remove the temporary files, even on failure
            for temp_file in temp_files:
                try:
                    if os.path.exists(temp_file):
                        os.remove(temp_file)
                except Exception as e:
                    logger.warning(f"Konnte temporäre Datei nicht entfernen: {temp_file} - {str(e)}")

        return image_responses
|
|
|
|
async def analyze_multiple_files(
|
|
self,
|
|
file_ids: List[int],
|
|
prompt: str,
|
|
lucydom_interface,
|
|
ai_service
|
|
) -> Dict[str, Any]:
|
|
"""
|
|
Analyze multiple files and synthesize a combined result.
|
|
|
|
Args:
|
|
file_ids: List of file IDs to analyze
|
|
prompt: Analysis prompt
|
|
lucydom_interface: Interface for database access
|
|
ai_service: Service for AI requests
|
|
|
|
Returns:
|
|
Combined analysis result
|
|
"""
|
|
results = []
|
|
|
|
# Analyze each file
|
|
for file_id in file_ids:
|
|
try:
|
|
analysis = await self.analyze_file(file_id, prompt, lucydom_interface, ai_service)
|
|
results.append(analysis)
|
|
except Exception as e:
|
|
logger.error(f"Error analyzing file {file_id}: {str(e)}")
|
|
results.append({
|
|
"file_id": file_id,
|
|
"error": str(e),
|
|
"analysis_type": "error"
|
|
})
|
|
|
|
# Now synthesize a combined analysis
|
|
if results:
|
|
try:
|
|
# Prepare prompt for synthesis
|
|
synthesis_prompt = f"""
|
|
Synthesize a combined analysis based on these individual file analyses:
|
|
|
|
ORIGINAL REQUEST: {prompt}
|
|
|
|
INDIVIDUAL ANALYSES:
|
|
"""
|
|
|
|
for i, result in enumerate(results, 1):
|
|
file_name = result.get("file_name", f"File {i}")
|
|
analysis_type = result.get("analysis_type", "unknown")
|
|
analysis_result = result.get("result", "No analysis available")
|
|
|
|
synthesis_prompt += f"""
|
|
## {file_name} ({analysis_type})
|
|
{analysis_result}
|
|
|
|
---
|
|
"""
|
|
|
|
synthesis_prompt += """
|
|
Please provide a comprehensive synthesis that:
|
|
1. Combines insights from all files
|
|
2. Addresses the original request
|
|
3. Highlights connections between different files
|
|
4. Provides a unified conclusion
|
|
"""
|
|
|
|
# Call AI for synthesis
|
|
synthesis = await ai_service.call_api([{"role": "user", "content": synthesis_prompt}])
|
|
|
|
return {
|
|
"synthesis": synthesis,
|
|
"individual_results": results,
|
|
"files_analyzed": len(results)
|
|
}
|
|
|
|
except Exception as e:
|
|
logger.error(f"Error synthesizing combined analysis: {str(e)}")
|
|
return {
|
|
"error": str(e),
|
|
"individual_results": results,
|
|
"files_analyzed": len(results)
|
|
}
|
|
else:
|
|
return {
|
|
"synthesis": "No files were successfully analyzed.",
|
|
"individual_results": [],
|
|
"files_analyzed": 0
|
|
}
|
|
|
|
def determine_file_type(self, file_name: str, content_type: str = None) -> str:
|
|
"""
|
|
Determine the file type based on name and content type.
|
|
|
|
Args:
|
|
file_name: Name of the file
|
|
content_type: MIME type (optional)
|
|
|
|
Returns:
|
|
File type string ('document', 'image', etc.)
|
|
"""
|
|
# Check content type first
|
|
if content_type:
|
|
if content_type.startswith('image/'):
|
|
return "image"
|
|
elif content_type in ['application/pdf']:
|
|
return "document"
|
|
elif content_type in ['application/vnd.ms-excel',
|
|
'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
|
|
'text/csv']:
|
|
return "spreadsheet"
|
|
|
|
# Check file extension
|
|
lower_name = file_name.lower()
|
|
|
|
# Images
|
|
if lower_name.endswith(('.jpg', '.jpeg', '.png', '.gif', '.bmp', '.webp', '.svg')):
|
|
return "image"
|
|
|
|
# Documents
|
|
if lower_name.endswith(('.pdf', '.doc', '.docx', '.txt', '.md', '.rtf')):
|
|
return "document"
|
|
|
|
# Spreadsheets
|
|
if lower_name.endswith(('.xlsx', '.xls', '.csv')):
|
|
return "spreadsheet"
|
|
|
|
# Presentations
|
|
if lower_name.endswith(('.pptx', '.ppt')):
|
|
return "presentation"
|
|
|
|
# Data files
|
|
if lower_name.endswith(('.json', '.xml', '.yaml', '.yml')):
|
|
return "data"
|
|
|
|
# Default to document
|
|
return "document"
|
|
|
|
    def get_mime_type(self, file_name: str) -> str:
        """Resolve the MIME type for *file_name* by delegating to LucyDOMInterface.

        NOTE(review): this instantiates a throwaway LucyDOMInterface(0, 0) on
        every call just to reach its get_mime_type helper — consider the stdlib
        ``mimetypes.guess_type`` or a module-level helper instead.
        """
        # Import from lucydom_interface (imported lazily here — presumably to
        # avoid a module-level import cycle; TODO confirm)
        from lucydom_interface import LucyDOMInterface
        temp_interface = LucyDOMInterface(0, 0)  # Default values
        return temp_interface.get_mime_type(file_name)
|
|
|
|
def prepare_file_contexts(self, files: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
|
|
"""
|
|
Bereitet die Dateikontexte basierend auf Metadaten vor.
|
|
Akzeptiert keine Pfade mehr, sondern nur Metadaten aus der Datenbank.
|
|
|
|
Args:
|
|
files: Liste von Dateien mit Metadaten (Dict mit id, name, type, content_type)
|
|
|
|
Returns:
|
|
Liste von Dateikontexten für die Verarbeitung
|
|
"""
|
|
file_contexts = []
|
|
|
|
logger.info(f"Preparing file contexts for {len(files)} files")
|
|
|
|
for file in files:
|
|
file_id = file.get("id")
|
|
file_name = file.get("name")
|
|
file_type = file.get("type")
|
|
|
|
# Create a comprehensive context with all available metadata
|
|
context = {
|
|
"id": file_id,
|
|
"name": file_name,
|
|
"type": file_type,
|
|
"size": file.get("size", "Unbekannt"),
|
|
"content_type": file.get("content_type"),
|
|
"path": file.get("path"),
|
|
"upload_date": file.get("upload_date"),
|
|
"hash": file.get("hash"),
|
|
"mandate_id": file.get("mandate_id"),
|
|
"user_id": file.get("user_id")
|
|
}
|
|
|
|
# Log for debugging
|
|
logger.info(f"Created file context: {file_name} (ID: {file_id}, Type: {file_type})")
|
|
|
|
file_contexts.append(context)
|
|
|
|
return file_contexts
|
|
|
|
    # Factory method
    @staticmethod
    def get_instance():
        """Get the singleton instance of FileManager.

        NOTE(review): this staticmethod is the SECOND definition of
        ``get_instance`` in the class body; when the class body executes, the
        later binding replaces the @classmethod of the same name defined
        earlier. The two implementations behave identically, but one of them
        should be removed.
        """
        if FileManager._instance is None:
            FileManager._instance = FileManager()
        return FileManager._instance
|
|
|
|
|
|
# Create a singleton instance for module-level access
# (instantiated eagerly at import time, so every importer shares it)
file_manager = FileManager.get_instance()


def get_file_manager():
    """Get the singleton instance of FileManager.

    Accessor for the module-level singleton created at import time.
    """
    return file_manager
|
|
|
|
|
|
class WorkflowFileManager:
|
|
"""
|
|
Specialized file manager for workflow operations.
|
|
Handles workflow-specific file operations and document management.
|
|
"""
|
|
|
|
def __init__(self, workflow_id: str = None, lucydom_interface = None):
|
|
"""
|
|
Initialize the workflow file manager.
|
|
|
|
Args:
|
|
workflow_id: Optional workflow ID for context
|
|
lucydom_interface: LucyDOM interface for database operations
|
|
"""
|
|
self.workflow_id = workflow_id
|
|
self.lucydom_interface = lucydom_interface
|
|
self.file_manager = get_file_manager()
|
|
|
|
def set_workflow_id(self, workflow_id: str):
|
|
"""Set or update the workflow ID."""
|
|
self.workflow_id = workflow_id
|
|
|
|
def set_lucydom_interface(self, lucydom_interface):
|
|
"""Set or update the LucyDOM interface."""
|
|
self.lucydom_interface = lucydom_interface
|
|
|
|
async def add_files_to_message(self,
|
|
message: Dict[str, Any],
|
|
file_ids: List[int],
|
|
add_log_func = None) -> Dict[str, Any]:
|
|
"""
|
|
Add multiple files to a message.
|
|
|
|
Args:
|
|
message: The message to add files to
|
|
file_ids: List of file IDs to add
|
|
add_log_func: Optional logging function
|
|
|
|
Returns:
|
|
Updated message
|
|
"""
|
|
if not self.lucydom_interface:
|
|
_log(add_log_func, self.workflow_id, "LucyDOM interface not available", "error")
|
|
return message
|
|
|
|
updated_message = message.copy()
|
|
|
|
# Get file metadata
|
|
files = []
|
|
for file_id in file_ids:
|
|
file = self.lucydom_interface.get_file(file_id)
|
|
if file:
|
|
files.append(file)
|
|
else:
|
|
_log(add_log_func, self.workflow_id, f"File not found: {file_id}", "warning")
|
|
|
|
# Prepare file contexts
|
|
file_contexts = self.file_manager.prepare_file_contexts(files)
|
|
|
|
# Read file contents
|
|
file_contents = await self.file_manager.read_file_contents(
|
|
file_contexts,
|
|
self.lucydom_interface,
|
|
self.workflow_id,
|
|
add_log_func
|
|
)
|
|
|
|
# Add files to message
|
|
for file_id, content_data in file_contents.items():
|
|
# Add file to message
|
|
updated_message = FileManager.add_file_to_message(updated_message, content_data)
|
|
|
|
return updated_message
|
|
|
|
def get_files_from_message(self, message: Dict[str, Any]) -> List[Dict[str, Any]]:
|
|
"""
|
|
Extract file references from a message.
|
|
|
|
Args:
|
|
message: The message to extract files from
|
|
|
|
Returns:
|
|
List of file metadata
|
|
"""
|
|
files = []
|
|
|
|
# Process documents
|
|
for doc in message.get("documents", []):
|
|
source = doc.get("source", {})
|
|
|
|
# Only include file documents
|
|
if source.get("type") == "file":
|
|
file_info = {
|
|
"id": source.get("id", ""),
|
|
"name": source.get("name", ""),
|
|
"type": source.get("content_type", ""),
|
|
"content_type": source.get("content_type", ""),
|
|
"size": source.get("size", 0)
|
|
}
|
|
|
|
files.append(file_info)
|
|
|
|
return files
|
|
|
|
def get_document_text_content(self, message: Dict[str, Any]) -> str:
|
|
"""
|
|
Extract text content from all documents in a message.
|
|
|
|
Args:
|
|
message: The message to extract content from
|
|
|
|
Returns:
|
|
Combined text content
|
|
"""
|
|
content = ""
|
|
|
|
# Process all documents
|
|
for doc in message.get("documents", []):
|
|
for doc_content in doc.get("contents", []):
|
|
if doc_content.get("type") == "text":
|
|
content += "\n\n" + doc_content.get("text", "")
|
|
|
|
return content
|
|
|
|
async def extract_document_info(self,
|
|
workflow: Dict[str, Any],
|
|
message_id: str = None) -> Dict[str, Any]:
|
|
"""
|
|
Extract document information from a workflow or specific message.
|
|
|
|
Args:
|
|
workflow: The workflow object
|
|
message_id: Optional message ID to focus on a specific message
|
|
|
|
Returns:
|
|
Document information
|
|
"""
|
|
result = {
|
|
"documents": [],
|
|
"file_count": 0,
|
|
"extracted_text": ""
|
|
}
|
|
|
|
if message_id:
|
|
# Process only the specified message
|
|
for message in workflow.get("messages", []):
|
|
if message.get("id") == message_id:
|
|
files = self.get_files_from_message(message)
|
|
result["documents"].extend(files)
|
|
result["file_count"] = len(files)
|
|
result["extracted_text"] = self.get_document_text_content(message)
|
|
break
|
|
else:
|
|
# Process all messages
|
|
for message in workflow.get("messages", []):
|
|
files = self.get_files_from_message(message)
|
|
result["documents"].extend(files)
|
|
result["extracted_text"] += self.get_document_text_content(message)
|
|
|
|
# De-duplicate files
|
|
unique_files = {}
|
|
for file in result["documents"]:
|
|
file_id = file.get("id")
|
|
if file_id and file_id not in unique_files:
|
|
unique_files[file_id] = file
|
|
|
|
result["documents"] = list(unique_files.values())
|
|
result["file_count"] = len(result["documents"])
|
|
|
|
return result
|
|
|
|
async def analyze_workflow_documents(self,
|
|
workflow: Dict[str, Any],
|
|
prompt: str,
|
|
ai_service,
|
|
message_id: str = None) -> Dict[str, Any]:
|
|
"""
|
|
Analyze documents in a workflow.
|
|
|
|
Args:
|
|
workflow: The workflow object
|
|
prompt: Analysis prompt
|
|
ai_service: Service for AI analysis
|
|
message_id: Optional message ID to focus on specific message
|
|
|
|
Returns:
|
|
Analysis result
|
|
"""
|
|
if not self.lucydom_interface:
|
|
raise ValueError("LucyDOM interface not available")
|
|
|
|
if not ai_service:
|
|
raise ValueError("AI service not available")
|
|
|
|
# Extract document info
|
|
doc_info = await self.extract_document_info(workflow, message_id)
|
|
|
|
if doc_info["file_count"] == 0:
|
|
return {
|
|
"result": "No documents found for analysis",
|
|
"files_analyzed": 0
|
|
}
|
|
|
|
# Get file IDs
|
|
file_ids = [doc.get("id") for doc in doc_info["documents"] if doc.get("id")]
|
|
|
|
# Analyze files
|
|
analysis = await self.file_manager.analyze_multiple_files(
|
|
file_ids,
|
|
prompt,
|
|
self.lucydom_interface,
|
|
ai_service
|
|
)
|
|
|
|
return analysis
|
|
|
|
|
|
# Export the workflow file manager factory function
def get_workflow_file_manager(workflow_id: str = None, lucydom_interface = None):
    """Create a fresh WorkflowFileManager bound to the given workflow and interface."""
    return WorkflowFileManager(workflow_id=workflow_id, lucydom_interface=lucydom_interface)