1206 lines
No EOL
49 KiB
Python
1206 lines
No EOL
49 KiB
Python
"""
|
|
Central file management module for the Agentservice.
|
|
"""
|
|
|
|
import os
|
|
import logging
|
|
import base64
|
|
import json
|
|
import uuid
|
|
from datetime import datetime
|
|
from typing import List, Dict, Any, Optional, Tuple, Union, BinaryIO
|
|
from io import BytesIO
|
|
|
|
# Import utilities from agentservice_utils
|
|
from modules.agentservice_utils import extract_text_from_file_content, is_text_extractable
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
# Helper function for adding logs
|
|
def _log(add_log_func, workflow_id, message, level="info"):
|
|
"""Helper function for adding logs with standardized formatting."""
|
|
if add_log_func and workflow_id:
|
|
add_log_func(workflow_id, message, level)
|
|
|
|
# Also log to standard logger
|
|
if level == "info":
|
|
logger.info(message)
|
|
elif level == "warning":
|
|
logger.warning(message)
|
|
elif level == "error":
|
|
logger.error(message)
|
|
|
|
class FileExtractionError(Exception):
    """Exception for file extraction errors.

    Raised when text or data cannot be extracted from an uploaded file.
    """
    pass
|
|
|
|
|
|
|
|
class FileManager:
|
|
"""Central file management for the Agentservice."""
|
|
|
|
_instance = None
|
|
|
|
@classmethod
|
|
def get_instance(cls):
|
|
"""Get the singleton instance of FileManager."""
|
|
if cls._instance is None:
|
|
cls._instance = cls()
|
|
return cls._instance
|
|
|
|
def __init__(self):
|
|
"""Initialize the FileManager."""
|
|
# Ensure singleton pattern
|
|
if FileManager._instance is not None:
|
|
raise RuntimeError("Singleton instance already exists - use get_instance()")
|
|
|
|
# Import utilities
|
|
# Instead of storing file_utils, we'll use the imported functions directly
|
|
|
|
    async def read_file_contents(self,
                                 file_contexts: List[Dict[str, Any]],
                                 lucydom_interface,
                                 workflow_id: str = None,
                                 add_log_func = None,
                                 ai_service = None,
                                 extraction_context: str = None
                                 ) -> Dict[str, Dict[str, Any]]:
        """
        Read file contents with optional contextual extraction.

        Images are sent to the AI service for analysis (using
        ``extraction_context`` as the prompt when provided); documents and
        text-like files go through ``extract_text_from_file_content``; all
        other types only get a metadata placeholder. Per-file errors are
        caught and recorded in the result rather than raised.

        Args:
            file_contexts: List of file contexts with metadata
                (each must contain "id" and "name"; "type" and
                "content_type" are optional)
            lucydom_interface: LucyDOM interface for file access
            workflow_id: Optional workflow ID for logging
            add_log_func: Optional function for adding logs
            ai_service: AI service for image analysis
            extraction_context: Optional context prompt for extraction

        Returns:
            Dictionary keyed by file id, each value a dict with "content",
            "is_extracted", "name", "type", "content_type" (and
            "extraction_context" for analyzed images)
        """
        file_contents = {}
        # Debug logging of the batch size before processing starts.
        logger.info(f"Reading contents of {len(file_contexts)} files for workflow {workflow_id}")

        for file in file_contexts:
            file_id = file["id"]
            file_name = file["name"]
            file_type = file.get("type", "unknown")
            content_type = file.get("content_type")

            try:
                # Fetch the raw file content via the LucyDOM interface.
                file_data = await lucydom_interface.read_file_content(file_id)

                if not file_data:
                    # No data returned: record a placeholder entry and continue.
                    _log(add_log_func, workflow_id, f"Datei {file_name} nicht gefunden", "warning")
                    file_contents[file_id] = {
                        "content": f"File content not available (File not found)",
                        "is_extracted": False,
                        "name": file_name,
                        "type": file_type,
                        "content_type": content_type
                    }
                    continue

                logger.info(f"Successfully read file: {file_name} (ID: {file_id}, Type: {file_type})")

                # Image files: delegate to AI image analysis with the
                # extraction context as prompt (if given).
                if file_type == "image" or file_name.lower().endswith(('.jpg', '.jpeg', '.png', '.gif', '.webp')):
                    if ai_service and hasattr(ai_service, "analyze_image"):
                        try:
                            # Use the extraction context if provided, otherwise
                            # a generic description prompt.
                            prompt = extraction_context or "Describe this image in detail"

                            image_analysis = await ai_service.analyze_image(
                                image_data=file_data,
                                prompt=prompt,  # contextual prompt
                                mime_type=content_type
                            )

                            file_contents[file_id] = {
                                "content": f"Image Analysis:\n{image_analysis}",
                                "is_extracted": True,  # mark as extracted
                                "name": file_name,
                                "type": file_type,
                                "content_type": content_type,
                                "extraction_context": prompt  # prompt actually used
                            }
                            _log(add_log_func, workflow_id, f"Image {file_name} analyzed successfully", "info")
                        except Exception as e:
                            # Analysis failure is recorded in the entry, not raised.
                            logger.error(f"Error analyzing image {file_name}: {str(e)}")
                            _log(add_log_func, workflow_id, f"Error analyzing image {file_name}: {str(e)}", "error")
                            file_contents[file_id] = {
                                "content": f"Image file: {file_name} (Analysis failed: {str(e)})",
                                "is_extracted": False,
                                "name": file_name,
                                "type": file_type,
                                "content_type": content_type
                            }
                    else:
                        # No AI service (or no analyze_image support): placeholder only.
                        file_contents[file_id] = {
                            "content": f"Image file: {file_name} (AI analysis not available)",
                            "is_extracted": False,
                            "name": file_name,
                            "type": file_type,
                            "content_type": content_type
                        }

                # Document and text files.
                elif (file_type == "document" or not file_type or file_name.lower().endswith(('.csv', '.txt', '.json', '.xml')) or (content_type and content_type.startswith('text/'))):
                    # Use the central text-extraction helper on the raw content.
                    content, is_extracted = extract_text_from_file_content(
                        file_data, file_name, content_type
                    )
                    file_contents[file_id] = {
                        "content": content,
                        "is_extracted": is_extracted,
                        "name": file_name,
                        "type": file_type,
                        "content_type": content_type
                    }
                    _log(add_log_func, workflow_id,
                         f"File {file_name} read successfully (extracted: {is_extracted})", "info")

                # Other file types: store metadata only.
                else:
                    file_contents[file_id] = {
                        "content": f"File: {file_name} (Type: {file_type}, content not available)",
                        "is_extracted": False,
                        "name": file_name,
                        "type": file_type,
                        "content_type": content_type
                    }
                    _log(add_log_func, workflow_id, f"Unsupported file type: {file_type} for {file_name}", "warning")

            except Exception as e:
                # Catch-all per file so one failure does not abort the batch.
                logger.error(f"Error reading file {file_name}: {str(e)}")
                _log(add_log_func, workflow_id, f"Error reading file {file_name}: {str(e)}", "error")
                file_contents[file_id] = {
                    "content": f"File content not available (Error: {str(e)})",
                    "is_extracted": False,
                    "name": file_name,
                    "type": file_type,
                    "content_type": content_type
                }

        return file_contents
|
|
|
|
@staticmethod
|
|
def add_file_to_message(message: Dict[str, Any], file_data: Dict[str, Any]) -> Dict[str, Any]:
|
|
"""
|
|
Add a file to a message with consistent document structure.
|
|
|
|
Args:
|
|
message: The message to add the file to
|
|
file_data: File metadata and content
|
|
|
|
Returns:
|
|
Updated message with the file added
|
|
"""
|
|
logger.info(f"Adding file to message: {file_data.get('name', 'unnamed_file')} (ID: {file_data.get('id', 'unknown')})")
|
|
|
|
# Initialize documents array if needed
|
|
if "documents" not in message:
|
|
message["documents"] = []
|
|
|
|
# Create a unique ID for the document if not provided
|
|
doc_id = file_data.get("id", f"file_{uuid.uuid4()}")
|
|
|
|
# Extract metadata
|
|
file_size = file_data.get("size")
|
|
if isinstance(file_size, str) and file_size.isdigit():
|
|
file_size = int(file_size)
|
|
elif file_size is None and file_data.get("content"):
|
|
file_size = len(file_data.get("content", ""))
|
|
|
|
# Determine if content is already extracted
|
|
content = file_data.get("content", "No content available")
|
|
file_name = file_data.get("name", "unnamed_file")
|
|
content_type = file_data.get("content_type")
|
|
is_extracted = file_data.get("is_extracted", False)
|
|
|
|
# Create standard document structure that follows the data model
|
|
document = {
|
|
"id": f"doc_{uuid.uuid4()}", # Unique document ID separate from file ID
|
|
"source": {
|
|
"type": "file",
|
|
"id": doc_id,
|
|
"name": file_name,
|
|
"content_type": content_type,
|
|
"size": file_size,
|
|
"upload_date": file_data.get("upload_date", datetime.now().isoformat())
|
|
},
|
|
"contents": [
|
|
{
|
|
"type": "text",
|
|
"text": content,
|
|
"is_extracted": is_extracted,
|
|
"extraction_context": file_data.get("extraction_context", None)
|
|
}
|
|
]
|
|
}
|
|
|
|
# Check if file is already in the message
|
|
file_already_added = any(
|
|
doc.get("source", {}).get("id") == doc_id
|
|
for doc in message.get("documents", [])
|
|
)
|
|
|
|
if not file_already_added:
|
|
message["documents"].append(document)
|
|
logger.info(f"File {file_name} added to message (total: {len(message.get('documents', []))} files)")
|
|
else:
|
|
logger.info(f"File {file_name} already exists in message, skipping")
|
|
|
|
return message
|
|
|
|
|
|
async def analyze_file(self, file_id: int, prompt: str, lucydom_interface, ai_service) -> Dict[str, Any]:
|
|
"""
|
|
Analyze a file using the appropriate method based on file type.
|
|
|
|
Args:
|
|
file_id: ID of the file to analyze
|
|
prompt: Analysis prompt
|
|
lucydom_interface: Interface for database access
|
|
ai_service: Service for AI requests
|
|
|
|
Returns:
|
|
Analysis result
|
|
"""
|
|
if not lucydom_interface:
|
|
raise ValueError("LucyDOM interface not available")
|
|
|
|
if not ai_service:
|
|
raise ValueError("AI service not available")
|
|
|
|
try:
|
|
# Get file metadata
|
|
file = lucydom_interface.get_file(file_id)
|
|
if not file:
|
|
raise ValueError(f"File with ID {file_id} not found")
|
|
|
|
# Get file content
|
|
file_content = await lucydom_interface.read_file_content(file_id)
|
|
if not file_content:
|
|
raise ValueError(f"Content for file {file_id} not found")
|
|
|
|
# Extract metadata
|
|
file_name = file.get("name", "unnamed")
|
|
content_type = file.get("content_type")
|
|
file_type = file.get("type")
|
|
|
|
# Process based on file type
|
|
if file_type == "image" or (content_type and content_type.startswith("image/")):
|
|
# Image analysis
|
|
if hasattr(ai_service, "analyze_image"):
|
|
analysis = await ai_service.analyze_image(
|
|
image_data=file_content,
|
|
prompt=prompt,
|
|
mime_type=content_type
|
|
)
|
|
|
|
return {
|
|
"file_id": file_id,
|
|
"file_name": file_name,
|
|
"analysis_type": "image",
|
|
"result": analysis
|
|
}
|
|
else:
|
|
raise ValueError("AI service does not support image analysis")
|
|
|
|
elif file_name.endswith(".pdf"):
|
|
# PDF analysis - first extract text, then analyze
|
|
try:
|
|
# Extract text
|
|
text_content, is_extracted = extract_text_from_file_content(
|
|
file_content, file_name, content_type
|
|
)
|
|
|
|
if not is_extracted:
|
|
raise ValueError(f"Failed to extract text from PDF {file_name}")
|
|
|
|
# Analyze text with AI
|
|
pdf_analysis_prompt = f"""
|
|
Analyze the following PDF content based on this request:
|
|
|
|
REQUEST: {prompt}
|
|
|
|
PDF CONTENT:
|
|
{text_content} # In a future release to split into tokensets, if too big file
|
|
"""
|
|
|
|
analysis = await ai_service.call_api([{"role": "user", "content": pdf_analysis_prompt}])
|
|
|
|
# Also check for images in the PDF
|
|
has_images = False
|
|
image_analysis = None
|
|
|
|
try:
|
|
# Extract and analyze images
|
|
image_results = await self.extract_and_analyze_pdf_images(
|
|
file_content,
|
|
f"Analyze images with respect to: {prompt}",
|
|
ai_service
|
|
)
|
|
|
|
if image_results and len(image_results) > 0:
|
|
has_images = True
|
|
image_analysis = "\n\nPDF IMAGES ANALYSIS:\n"
|
|
for img in image_results:
|
|
image_analysis += f"- Image on page {img.get('page')}: {img.get('response')}\n"
|
|
except Exception as img_err:
|
|
logger.warning(f"Could not analyze images in PDF {file_name}: {str(img_err)}")
|
|
|
|
# Combine text and image analysis if available
|
|
if has_images and image_analysis:
|
|
analysis += image_analysis
|
|
|
|
return {
|
|
"file_id": file_id,
|
|
"file_name": file_name,
|
|
"analysis_type": "pdf",
|
|
"result": analysis,
|
|
"has_images": has_images
|
|
}
|
|
|
|
except Exception as pdf_err:
|
|
logger.error(f"Error analyzing PDF {file_name}: {str(pdf_err)}")
|
|
raise
|
|
|
|
elif file_name.endswith(('.xlsx', '.xls', '.csv')):
|
|
# Tabular data analysis
|
|
try:
|
|
# Extract text content
|
|
text_content, is_extracted = extract_text_from_file_content(
|
|
file_content, file_name, content_type
|
|
)
|
|
|
|
if not is_extracted:
|
|
raise ValueError(f"Failed to extract data from {file_name}")
|
|
|
|
# Analyze with AI
|
|
data_analysis_prompt = f"""
|
|
Analyze the following tabular data based on this request:
|
|
|
|
REQUEST: {prompt}
|
|
|
|
DATA CONTENT:
|
|
{text_content} # In a future release to split into tokensets to limit storage
|
|
|
|
Provide a structured analysis including:
|
|
1. Data overview
|
|
2. Key insights
|
|
3. Patterns and trends
|
|
4. Answers to the specific request
|
|
"""
|
|
|
|
analysis = await ai_service.call_api([{"role": "user", "content": data_analysis_prompt}])
|
|
|
|
return {
|
|
"file_id": file_id,
|
|
"file_name": file_name,
|
|
"analysis_type": "tabular_data",
|
|
"result": analysis
|
|
}
|
|
|
|
except Exception as data_err:
|
|
logger.error(f"Error analyzing tabular data {file_name}: {str(data_err)}")
|
|
raise
|
|
|
|
else:
|
|
# Default to text analysis for all other file types
|
|
try:
|
|
# Extract text content
|
|
text_content, is_extracted = extract_text_from_file_content(
|
|
file_content, file_name, content_type
|
|
)
|
|
|
|
if not is_extracted:
|
|
raise ValueError(f"Failed to extract text from {file_name}")
|
|
|
|
# Analyze with AI
|
|
text_analysis_prompt = f"""
|
|
Analyze the following document content based on this request:
|
|
|
|
REQUEST: {prompt}
|
|
|
|
DOCUMENT CONTENT:
|
|
{text_content} # In a future release to split into tokensets
|
|
"""
|
|
|
|
analysis = await ai_service.call_api([{"role": "user", "content": text_analysis_prompt}])
|
|
|
|
return {
|
|
"file_id": file_id,
|
|
"file_name": file_name,
|
|
"analysis_type": "text",
|
|
"result": analysis
|
|
}
|
|
|
|
except Exception as text_err:
|
|
logger.error(f"Error analyzing text content {file_name}: {str(text_err)}")
|
|
raise
|
|
|
|
except Exception as e:
|
|
logger.error(f"Error analyzing file {file_id}: {str(e)}")
|
|
raise
|
|
|
|
|
|
    async def extract_and_analyze_pdf_images(self,
                                             pdf_content: bytes,
                                             prompt: str,
                                             ai_service
                                             ) -> List[Dict[str, Any]]:
        """
        Extract images from a PDF file and analyze them.
        Works with binary data instead of file paths.

        Tries four extraction strategies per page, in order:
        1. ``page.get_images`` (embedded image XObjects),
        2. ``page.get_drawings`` entries that look like images,
        3. image blocks from ``page.get_text("dict")``,
        4. rendering the whole page as a fallback when the page produced
           nothing so far.
        Every extracted image is sent to ``ai_service.analyze_image``; all
        failures are logged and swallowed so one bad image never aborts the
        extraction. Results are deduplicated by (page, pixel size).

        Args:
            pdf_content: Binary data of the PDF file
            prompt: Prompt for image analysis
            ai_service: AI service for image analysis

        Returns:
            List with analysis results for each image (dicts with "page",
            "image_index", "format", "image_size", "method", "response");
            empty list when PyMuPDF is missing or the PDF cannot be opened.
        """
        image_responses = []
        # NOTE(review): temp_files is never populated anywhere in this
        # method — the finally-block cleanup below is currently dead code.
        temp_files = []  # List of temporary files for cleanup

        try:
            # Import required libraries (PyMuPDF is an optional dependency).
            try:
                import fitz  # PyMuPDF
                from io import BytesIO
                import tempfile

                logger.info(f"Starting PDF image extraction with PyMuPDF")
            except ImportError:
                logger.error("PyMuPDF (fitz) is not installed. Install it with 'pip install pymupdf'")
                return []

            # Open the PDF entirely in memory.
            try:
                doc = fitz.open(stream=pdf_content, filetype="pdf")
                page_count = len(doc)
                logger.info(f"PDF opened with {page_count} pages")
            except Exception as pdf_err:
                logger.error(f"Error opening PDF: {str(pdf_err)}")
                return []

            # Process each page with multiple extraction methods.
            for page_num, page in enumerate(doc, 1):
                logger.info(f"Processing page {page_num}/{page_count}")

                # Method 1: standard extraction using get_images.
                try:
                    image_list = page.get_images(full=True)
                    if image_list:
                        logger.info(f"Method 1: Found {len(image_list)} images on page {page_num}")

                        for img_index, img in enumerate(image_list):
                            try:
                                xref = img[0]  # image xref number

                                # Extract the raw image data for this xref.
                                base_image = doc.extract_image(xref)
                                image_bytes = base_image["image"]
                                image_ext = base_image["ext"]

                                # Skip empty / implausibly small image payloads.
                                if not image_bytes or len(image_bytes) < 100:
                                    logger.warning(f"Empty or very small image data for image {img_index+1} on page {page_num}")
                                    continue

                                # Analyze the image via the AI service.
                                analysis_result = await ai_service.analyze_image(
                                    image_data=image_bytes,
                                    prompt=prompt,
                                    mime_type=f"image/{image_ext}"
                                )

                                # Record pixel size for later deduplication.
                                image_size = f"{base_image.get('width', 0)}x{base_image.get('height', 0)}"

                                image_responses.append({
                                    "page": page_num,
                                    "image_index": img_index,
                                    "format": image_ext,
                                    "image_size": image_size,
                                    "method": "get_images",
                                    "response": analysis_result
                                })

                                logger.info(f"Successfully analyzed image {img_index+1} on page {page_num} using method 1")
                            except Exception as e:
                                logger.warning(f"Error processing image {img_index} on page {page_num} (Method 1): {str(e)}")
                    else:
                        logger.info(f"Method 1: No images found on page {page_num} using get_images")
                except Exception as m1_err:
                    logger.warning(f"Error in Method 1 for page {page_num}: {str(m1_err)}")

                # Method 2: extract embedded images via page.get_drawings().
                try:
                    drawings = page.get_drawings()
                    drawing_images = 0

                    for drawing_index, drawing in enumerate(drawings):
                        try:
                            # Heuristic: treat a drawing as an image when its
                            # repr mentions "image".
                            if "image" in str(drawing).lower():
                                drawing_images += 1
                                rect = drawing["rect"]  # bounding rectangle

                                # Render the rectangle at 2x zoom to PNG bytes.
                                pix = page.get_pixmap(matrix=fitz.Matrix(2, 2), clip=rect)
                                img_bytes = pix.tobytes("png")

                                # Analyze the rendered region.
                                analysis_result = await ai_service.analyze_image(
                                    image_data=img_bytes,
                                    prompt=f"{prompt} (Page {page_num}, Drawing {drawing_index+1})",
                                    mime_type="image/png"
                                )

                                image_responses.append({
                                    "page": page_num,
                                    "image_index": drawing_index,
                                    "format": "png",
                                    "image_size": f"{pix.width}x{pix.height}",
                                    "method": "get_drawings",
                                    "response": analysis_result
                                })

                                logger.info(f"Successfully analyzed drawing image {drawing_index+1} on page {page_num} using method 2")
                        except Exception as drawing_err:
                            logger.warning(f"Error processing drawing {drawing_index} on page {page_num}: {str(drawing_err)}")

                    if drawing_images > 0:
                        logger.info(f"Method 2: Processed {drawing_images} images from drawings on page {page_num}")
                    else:
                        logger.info(f"Method 2: No images found in drawings on page {page_num}")
                except Exception as m2_err:
                    logger.warning(f"Error in Method 2 for page {page_num}: {str(m2_err)}")

                # Method 3: extract via text-layout block detection.
                try:
                    blocks = page.get_text("dict")["blocks"]
                    img_blocks = [b for b in blocks if b.get("type") == 1]  # type 1 = image

                    if img_blocks:
                        logger.info(f"Method 3: Found {len(img_blocks)} image blocks on page {page_num}")

                        for block_index, block in enumerate(img_blocks):
                            try:
                                # Render the block's bounding box at 2x zoom.
                                rect = block["bbox"]
                                pix = page.get_pixmap(matrix=fitz.Matrix(2, 2), clip=rect)
                                img_bytes = pix.tobytes("png")

                                # Analyze the rendered block.
                                analysis_result = await ai_service.analyze_image(
                                    image_data=img_bytes,
                                    prompt=f"{prompt} (Page {page_num}, Block {block_index+1})",
                                    mime_type="image/png"
                                )

                                image_responses.append({
                                    "page": page_num,
                                    "image_index": block_index,
                                    "format": "png",
                                    "image_size": f"{pix.width}x{pix.height}",
                                    "method": "block_extraction",
                                    "response": analysis_result
                                })

                                logger.info(f"Successfully analyzed image block {block_index+1} on page {page_num} using method 3")
                            except Exception as block_err:
                                logger.warning(f"Error processing block {block_index} on page {page_num}: {str(block_err)}")
                    else:
                        logger.info(f"Method 3: No image blocks found on page {page_num}")
                except Exception as m3_err:
                    logger.warning(f"Error in Method 3 for page {page_num}: {str(m3_err)}")

                # Method 4: last resort — render the entire page as one image
                # when methods 1-3 produced nothing for this page.
                if not image_responses or not any(resp.get("page") == page_num for resp in image_responses):
                    try:
                        logger.info(f"Method 4: Rendering entire page {page_num} as image")

                        # Render the whole page at 2x zoom.
                        pix = page.get_pixmap(matrix=fitz.Matrix(2, 2))
                        img_bytes = pix.tobytes("png")

                        # Analyze the page rendering.
                        analysis_result = await ai_service.analyze_image(
                            image_data=img_bytes,
                            prompt=f"{prompt} (Full page {page_num})",
                            mime_type="image/png"
                        )

                        image_responses.append({
                            "page": page_num,
                            "image_index": 0,
                            "format": "png",
                            "image_size": f"{pix.width}x{pix.height}",
                            "method": "full_page_render",
                            "response": analysis_result
                        })

                        logger.info(f"Successfully analyzed full page {page_num} as image using method 4")
                    except Exception as m4_err:
                        logger.warning(f"Error in Method 4 for page {page_num}: {str(m4_err)}")

            # Close the document
            doc.close()

            # Deduplicate results (different methods might extract the same
            # image). NOTE(review): the key is page + pixel size only, so two
            # *different* images of identical size on the same page would be
            # collapsed into one — confirm this is acceptable.
            deduplicated_responses = []
            seen_areas = set()

            for response in image_responses:
                # Unique identifier for the image area.
                area_key = f"{response['page']}_{response['image_size']}"

                if area_key not in seen_areas:
                    seen_areas.add(area_key)
                    deduplicated_responses.append(response)

            logger.info(f"PDF image extraction complete: Found {len(image_responses)} images, deduplicated to {len(deduplicated_responses)}")
            return deduplicated_responses

        except ImportError as imp_err:
            logger.error(f"Required library not available for PDF image extraction: {str(imp_err)}")
            return []
        except Exception as e:
            logger.error(f"Error extracting images from PDF: {str(e)}")
            return []
        finally:
            # Clean up temporary files (currently none are ever created).
            for temp_file in temp_files:
                try:
                    if os.path.exists(temp_file):
                        os.remove(temp_file)
                except Exception as e:
                    logger.warning(f"Could not remove temporary file: {temp_file} - {str(e)}")
|
|
|
|
|
|
    async def analyze_multiple_files(
        self,
        file_ids: List[int],
        prompt: str,
        lucydom_interface,
        ai_service
    ) -> Dict[str, Any]:
        """
        Analyze multiple files and synthesize a combined result.

        Each file is analyzed individually via ``analyze_file``; per-file
        failures are recorded as error entries instead of aborting the batch.
        A final AI call then synthesizes the individual analyses into one
        combined answer.

        Args:
            file_ids: List of file IDs to analyze
            prompt: Analysis prompt
            lucydom_interface: Interface for database access
            ai_service: Service for AI requests

        Returns:
            Dict with "synthesis" (or "error"), "individual_results" and
            "files_analyzed"
        """
        results = []

        # Analyze each file; errors become error entries in the results list.
        for file_id in file_ids:
            try:
                analysis = await self.analyze_file(file_id, prompt, lucydom_interface, ai_service)
                results.append(analysis)
            except Exception as e:
                logger.error(f"Error analyzing file {file_id}: {str(e)}")
                results.append({
                    "file_id": file_id,
                    "error": str(e),
                    "analysis_type": "error"
                })

        # Now synthesize a combined analysis across all files.
        if results:
            try:
                # Build the synthesis prompt from the individual results
                # (error entries are included with their placeholder text).
                synthesis_prompt = f"""
                Synthesize a combined analysis based on these individual file analyses:

                ORIGINAL REQUEST: {prompt}

                INDIVIDUAL ANALYSES:
                """

                for i, result in enumerate(results, 1):
                    file_name = result.get("file_name", f"File {i}")
                    analysis_type = result.get("analysis_type", "unknown")
                    analysis_result = result.get("result", "No analysis available")

                    synthesis_prompt += f"""
                    ## {file_name} ({analysis_type})
                    {analysis_result}

                    ---
                    """

                synthesis_prompt += """
                Please provide a comprehensive synthesis that:
                1. Combines insights from all files
                2. Addresses the original request
                3. Highlights connections between different files
                4. Provides a unified conclusion
                """

                # Call AI for synthesis
                synthesis = await ai_service.call_api([{"role": "user", "content": synthesis_prompt}])

                return {
                    "synthesis": synthesis,
                    "individual_results": results,
                    "files_analyzed": len(results)
                }

            except Exception as e:
                # Synthesis failed: still return the individual results.
                logger.error(f"Error synthesizing combined analysis: {str(e)}")
                return {
                    "error": str(e),
                    "individual_results": results,
                    "files_analyzed": len(results)
                }
        else:
            # No files were given / nothing was analyzed.
            return {
                "synthesis": "No files were successfully analyzed.",
                "individual_results": [],
                "files_analyzed": 0
            }
|
|
|
|
def determine_file_type(self, file_name: str, content_type: str = None) -> str:
|
|
"""
|
|
Determine the file type based on name and content type.
|
|
|
|
Args:
|
|
file_name: Name of the file
|
|
content_type: MIME type (optional)
|
|
|
|
Returns:
|
|
File type string ('document', 'image', etc.)
|
|
"""
|
|
# Check content type first
|
|
if content_type:
|
|
if content_type.startswith('image/'):
|
|
return "image"
|
|
elif content_type in ['application/pdf']:
|
|
return "document"
|
|
elif content_type in ['application/vnd.ms-excel',
|
|
'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
|
|
'text/csv']:
|
|
return "spreadsheet"
|
|
|
|
# Check file extension
|
|
lower_name = file_name.lower()
|
|
|
|
# Images
|
|
if lower_name.endswith(('.jpg', '.jpeg', '.png', '.gif', '.bmp', '.webp', '.svg')):
|
|
return "image"
|
|
|
|
# Documents
|
|
if lower_name.endswith(('.pdf', '.doc', '.docx', '.txt', '.md', '.rtf')):
|
|
return "document"
|
|
|
|
# Spreadsheets
|
|
if lower_name.endswith(('.xlsx', '.xls', '.csv')):
|
|
return "spreadsheet"
|
|
|
|
# Presentations
|
|
if lower_name.endswith(('.pptx', '.ppt')):
|
|
return "presentation"
|
|
|
|
# Data files
|
|
if lower_name.endswith(('.json', '.xml', '.yaml', '.yml')):
|
|
return "data"
|
|
|
|
# Default to document
|
|
return "document"
|
|
|
|
def get_mime_type(self, file_name: str) -> str:
|
|
"""Get MIME type based on file name."""
|
|
# Import from lucydom_interface
|
|
from lucydom_interface import LucyDOMInterface
|
|
temp_interface = LucyDOMInterface(0, 0) # Default values
|
|
return temp_interface.get_mime_type(file_name)
|
|
|
|
def prepare_file_contexts(self, files: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
|
|
"""
|
|
Bereitet die Dateikontexte basierend auf Metadaten vor.
|
|
Akzeptiert keine Pfade mehr, sondern nur Metadaten aus der Datenbank.
|
|
|
|
Args:
|
|
files: Liste von Dateien mit Metadaten (Dict mit id, name, type, content_type)
|
|
|
|
Returns:
|
|
Liste von Dateikontexten für die Verarbeitung
|
|
"""
|
|
file_contexts = []
|
|
|
|
logger.info(f"Preparing file contexts for {len(files)} files")
|
|
|
|
for file in files:
|
|
file_id = file.get("id")
|
|
file_name = file.get("name")
|
|
file_type = file.get("type")
|
|
|
|
# Create a comprehensive context with all available metadata
|
|
context = {
|
|
"id": file_id,
|
|
"name": file_name,
|
|
"type": file_type,
|
|
"size": file.get("size", "Unbekannt"),
|
|
"content_type": file.get("content_type"),
|
|
"path": file.get("path"),
|
|
"upload_date": file.get("upload_date"),
|
|
"hash": file.get("hash"),
|
|
"mandate_id": file.get("mandate_id"),
|
|
"user_id": file.get("user_id")
|
|
}
|
|
|
|
# Log for debugging
|
|
logger.info(f"Created file context: {file_name} (ID: {file_id}, Type: {file_type})")
|
|
|
|
file_contexts.append(context)
|
|
|
|
return file_contexts
|
|
|
|
def create_document_reference(self, message: Dict[str, Any], file_id: int, reference_type: str = "reference") -> Dict[str, Any]:
|
|
"""
|
|
Create a document reference without loading content.
|
|
|
|
Args:
|
|
message: The message to add the reference to
|
|
file_id: ID of the file to reference
|
|
reference_type: Type of reference (reference, citation, etc.)
|
|
|
|
Returns:
|
|
Updated message with the document reference
|
|
"""
|
|
if not self.lucydom_interface:
|
|
logger.warning("LucyDOM interface not available for document reference")
|
|
return message
|
|
|
|
# Get file metadata
|
|
file = self.lucydom_interface.get_file(file_id)
|
|
if not file:
|
|
logger.warning(f"File with ID {file_id} not found for reference")
|
|
return message
|
|
|
|
# Create document structure with just the reference
|
|
document = {
|
|
"id": f"ref_{uuid.uuid4()}",
|
|
"source": {
|
|
"type": "file",
|
|
"id": str(file_id),
|
|
"name": file.get("name", "referenced_file"),
|
|
"content_type": file.get("content_type"),
|
|
"size": file.get("size"),
|
|
"reference_type": reference_type
|
|
},
|
|
"contents": [] # Empty contents - will be loaded on demand
|
|
}
|
|
|
|
# Add to message
|
|
updated_message = message.copy()
|
|
if "documents" not in updated_message:
|
|
updated_message["documents"] = []
|
|
|
|
updated_message["documents"].append(document)
|
|
logger.info(f"Added document reference for file {file.get('name')} (ID: {file_id})")
|
|
|
|
return updated_message
|
|
|
|
def should_extract_document(self, document: Dict[str, Any], context_prompt: str = None) -> bool:
|
|
"""
|
|
Determine if a document needs content extraction.
|
|
|
|
Args:
|
|
document: The document object
|
|
context_prompt: Current context prompt
|
|
|
|
Returns:
|
|
True if extraction is needed, False otherwise
|
|
"""
|
|
# If document has no contents, extraction is needed
|
|
if not document.get("contents"):
|
|
return True
|
|
|
|
# If document has contents but extraction status is False, extraction may be needed
|
|
for content in document.get("contents", []):
|
|
if content.get("type") == "text":
|
|
# If already extracted, check if context has changed
|
|
if content.get("is_extracted", False):
|
|
# If context prompt is different from what was used previously,
|
|
# we may need to re-extract with the new context
|
|
prev_context = content.get("extraction_context")
|
|
if context_prompt and prev_context != context_prompt:
|
|
return True
|
|
return False
|
|
return True
|
|
|
|
# Default to needing extraction
|
|
return True
|
|
|
|
|
|
|
|
    # Factory method
    # NOTE(review): this @staticmethod duplicates the @classmethod
    # get_instance defined near the top of the class. Being defined later in
    # the class body, THIS one is what FileManager.get_instance resolves to.
    # Behavior is identical, but one of the two definitions should be removed.
    @staticmethod
    def get_instance():
        """Get the singleton instance of FileManager."""
        if FileManager._instance is None:
            FileManager._instance = FileManager()
        return FileManager._instance
|
|
|
|
|
|
|
|
# Create a singleton instance for module-level access (created at import time).
file_manager = FileManager.get_instance()


def get_file_manager():
    """Get the singleton instance of FileManager."""
    # Returns the module-level instance created at import time above.
    return file_manager
|
|
|
|
|
|
|
|
|
|
class WorkflowFileManager:
|
|
"""
|
|
Specialized file manager for workflow operations.
|
|
Handles workflow-specific file operations and document management.
|
|
"""
|
|
|
|
def __init__(self, workflow_id: str = None, lucydom_interface = None):
|
|
"""
|
|
Initialize the workflow file manager.
|
|
|
|
Args:
|
|
workflow_id: Optional workflow ID for context
|
|
lucydom_interface: LucyDOM interface for database operations
|
|
"""
|
|
self.workflow_id = workflow_id
|
|
self.lucydom_interface = lucydom_interface
|
|
self.file_manager = get_file_manager()
|
|
self.document_handler = None
|
|
|
|
    def set_workflow_id(self, workflow_id: str):
        """Set or update the workflow ID.

        Plain attribute update; no validation is performed.
        """
        self.workflow_id = workflow_id
|
|
|
|
    def set_lucydom_interface(self, lucydom_interface):
        """Set or update the LucyDOM interface.

        Plain attribute update; no validation is performed.
        """
        self.lucydom_interface = lucydom_interface
|
|
|
|
    async def add_files_to_message(self,
                                   message: Dict[str, Any],
                                   file_ids: List[int],
                                   add_log_func = None) -> Dict[str, Any]:
        """
        Add multiple files to a message.

        Delegates to ``self.document_handler`` when one is configured;
        otherwise loads metadata and contents through the LucyDOM interface
        and attaches each file via ``FileManager.add_file_to_message``.
        Missing files are logged and skipped.

        Args:
            message: The message to add files to
            file_ids: List of file IDs to add
            add_log_func: Optional logging function

        Returns:
            Updated message (the original message when no LucyDOM interface
            is available)
        """

        # If a document handler is available, it owns this operation.
        if self.document_handler:
            return await self.document_handler.add_files_to_message(
                message,
                file_ids,
                extraction_prompt=None  # default: no contextual extraction
            )

        # Without a database interface there is nothing we can load.
        if not self.lucydom_interface:
            _log(add_log_func, self.workflow_id, "LucyDOM interface not available", "error")
            return message

        updated_message = message.copy()

        # Resolve file metadata; unknown ids are logged and skipped.
        files = []
        for file_id in file_ids:
            file = self.lucydom_interface.get_file(file_id)
            if file:
                files.append(file)
            else:
                _log(add_log_func, self.workflow_id, f"File not found: {file_id}", "warning")

        # Prepare file contexts from the metadata records.
        file_contexts = self.file_manager.prepare_file_contexts(files)

        # Read the actual file contents.
        file_contents = await self.file_manager.read_file_contents(
            file_contexts,
            self.lucydom_interface,
            self.workflow_id,
            add_log_func
        )

        # Attach each loaded file to the message.
        for file_id, content_data in file_contents.items():
            updated_message = FileManager.add_file_to_message(updated_message, content_data)

        return updated_message
|
|
|
|
def get_files_from_message(self, message: Dict[str, Any]) -> List[Dict[str, Any]]:
    """
    Collect metadata for every file-backed document attached to a message.

    Args:
        message: Message dict whose "documents" entries are scanned.

    Returns:
        A list of metadata dicts (id, name, type, content_type, size)
        for each document whose source type is "file".
    """
    sources = (doc.get("source", {}) for doc in message.get("documents", []))
    return [
        {
            "id": src.get("id", ""),
            "name": src.get("name", ""),
            "type": src.get("content_type", ""),
            "content_type": src.get("content_type", ""),
            "size": src.get("size", 0)
        }
        for src in sources
        if src.get("type") == "file"
    ]
def get_document_text_content(self, message: Dict[str, Any]) -> str:
    """
    Concatenate the text parts of every document in a message.

    Args:
        message: Message dict whose "documents" entries are scanned.

    Returns:
        All text contents joined together, each prefixed with a blank
        line; empty string when no text parts exist.
    """
    pieces = []
    for document in message.get("documents", []):
        for part in document.get("contents", []):
            if part.get("type") == "text":
                pieces.append("\n\n" + part.get("text", ""))
    return "".join(pieces)
async def extract_document_info(self,
                                workflow: Dict[str, Any],
                                message_id: str = None) -> Dict[str, Any]:
    """
    Gather file metadata and extracted text from a workflow's messages.

    Args:
        workflow: Workflow dict containing a "messages" list.
        message_id: When given, only the message with this ID is
            scanned; otherwise every message contributes.

    Returns:
        Dict with de-duplicated "documents", their "file_count", and
        the combined "extracted_text".
    """
    info = {
        "documents": [],
        "file_count": 0,
        "extracted_text": ""
    }
    messages = workflow.get("messages", [])

    if message_id:
        # Restrict processing to the single requested message.
        for msg in messages:
            if msg.get("id") != message_id:
                continue
            attached = self.get_files_from_message(msg)
            info["documents"].extend(attached)
            info["file_count"] = len(attached)
            info["extracted_text"] = self.get_document_text_content(msg)
            break
    else:
        # Aggregate over every message in the workflow.
        for msg in messages:
            info["documents"].extend(self.get_files_from_message(msg))
            info["extracted_text"] += self.get_document_text_content(msg)

    # Keep only the first occurrence of each file ID.
    seen = {}
    for entry in info["documents"]:
        fid = entry.get("id")
        if fid and fid not in seen:
            seen[fid] = entry

    info["documents"] = list(seen.values())
    info["file_count"] = len(info["documents"])

    return info
async def analyze_workflow_documents(self,
                                     workflow: Dict[str, Any],
                                     prompt: str,
                                     ai_service,
                                     message_id: str = None) -> Dict[str, Any]:
    """
    Run an AI analysis over the documents referenced by a workflow.

    Args:
        workflow: Workflow dict containing messages with document refs.
        prompt: Analysis prompt passed to the AI service.
        ai_service: Service used to perform the analysis.
        message_id: Optional message ID restricting analysis to one
            message.

    Returns:
        The analysis result, or a placeholder dict when no documents
        are attached.

    Raises:
        ValueError: If the LucyDOM interface or the AI service is
            missing.
    """
    # Both backends are mandatory; fail fast with a clear message.
    if not self.lucydom_interface:
        raise ValueError("LucyDOM interface not available")
    if not ai_service:
        raise ValueError("AI service not available")

    doc_info = await self.extract_document_info(workflow, message_id)

    # Nothing attached -> short-circuit with an informative placeholder.
    if doc_info["file_count"] == 0:
        return {
            "result": "No documents found for analysis",
            "files_analyzed": 0
        }

    ids = [entry.get("id") for entry in doc_info["documents"] if entry.get("id")]

    return await self.file_manager.analyze_multiple_files(
        ids,
        prompt,
        self.lucydom_interface,
        ai_service
    )
# Export the workflow file manager factory function
|
|
def get_workflow_file_manager(workflow_id: str = None, lucydom_interface = None):
    """
    Factory for WorkflowFileManager instances.

    Args:
        workflow_id: Optional workflow ID for the new manager.
        lucydom_interface: Optional LucyDOM interface for the new manager.

    Returns:
        A freshly constructed WorkflowFileManager.
    """
    manager = WorkflowFileManager(workflow_id, lucydom_interface)
    return manager