# gateway/gwserver/modules/agentservice_filehandling.py

"""
Zentrales Filehandling-Modul für den Agentservice.
Enthält alle Funktionen für das Verarbeiten von Dateien.
Angepasst, um mit LucyDOMInterface als zentrale Datei-Autorität zu arbeiten.
"""

import os
import logging
import base64
import json
import uuid
from datetime import datetime
from typing import Dict, Any, List, Optional, Tuple, Union, BinaryIO
from io import BytesIO  # Import BytesIO at the top level

# Bibliotheken für Dateiverarbeitung
try:
    import pandas as pd
except ImportError:
    pd = None

logger = logging.getLogger(__name__)

# Custom exception für das File-Handling
class FileProcessingError(Exception):
    """Base class for file-processing errors in the AgentService."""

class FileExtractionError(FileProcessingError):
    """Error during text extraction from files."""

class FileAnalysisError(FileProcessingError):
    """Error during the analysis of files."""

def encode_to_base64(content: bytes, mime_type: str = None) -> str:
    """
    Encode binary data as a Base64 string.

    Args:
        content: The binary data to encode
        mime_type: Optional MIME type; accepted by the signature but not used
            by the encoding itself

    Returns:
        The Base64 representation of ``content`` as text
    """
    # Base64 output is pure ASCII, so converting the bytes to str is lossless.
    return str(base64.b64encode(content), 'utf-8')

def prepare_file_contexts(files: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """
    Build one processing context per file from its metadata.

    Only metadata dictionaries are read here; no file is opened.

    Args:
        files: List of file metadata dicts (keys such as id, name, type,
            content_type)

    Returns:
        List of context dicts, one per input entry and in the same order
    """
    logger.info(f"Preparing file contexts for {len(files)} files")

    # Metadata keys copied into every context, in output order.
    copied_keys = (
        "id", "name", "type", "size", "content_type",
        "path", "upload_date", "hash", "mandate_id", "user_id",
    )

    prepared = []
    for metadata in files:
        # A missing "size" is reported with a placeholder; every other missing
        # key becomes None.
        entry = {
            key: metadata.get(key, "Unbekannt") if key == "size" else metadata.get(key)
            for key in copied_keys
        }

        logger.info(f"Created file context: {entry['name']} (ID: {entry['id']}, Type: {entry['type']})")
        prepared.append(entry)

    return prepared


# File extensions whose content is decoded directly as text.
_PLAIN_TEXT_EXTENSIONS = ('.txt', '.md', '.json', '.xml', '.html', '.htm', '.css', '.js', '.py')


def _decode_text_bytes(raw: bytes) -> str:
    """Decode bytes as UTF-8, falling back to Latin-1.

    Latin-1 assigns a character to every byte value, so the fallback cannot
    fail and no further encoding needs to be tried.
    """
    try:
        return raw.decode('utf-8')
    except UnicodeDecodeError:
        return raw.decode('latin1')


def _read_csv_with_fallback(raw: bytes):
    """Parse CSV bytes with pandas, trying UTF-8, then Latin-1, then cp1252."""
    try:
        return pd.read_csv(BytesIO(raw), encoding='utf-8')
    except UnicodeDecodeError:
        pass
    try:
        return pd.read_csv(BytesIO(raw), encoding='latin1')
    except Exception:
        # Last attempt; a failure here propagates to the caller.
        return pd.read_csv(BytesIO(raw), encoding='cp1252')


def _format_table_summary(kind: str, df) -> str:
    """Render a DataFrame as a short header plus its full text table."""
    # Column labels are not guaranteed to be strings (e.g. numeric headers),
    # so convert them before joining.
    column_names = ', '.join(str(label) for label in df.columns)
    return (
        f"{kind} file with {len(df)} rows and {len(df.columns)} columns.\n"
        f"Columns: {column_names}\n\n"
        f"{df.to_string(index=False)}"
    )


def _extract_pdf_text(raw: bytes) -> Optional[str]:
    """Extract text from PDF bytes with PyPDF2, or PyMuPDF if PyPDF2 is missing.

    Returns None when neither library can be imported.
    """
    try:
        from PyPDF2 import PdfReader
    except ImportError:
        PdfReader = None

    if PdfReader is not None:
        reader = PdfReader(BytesIO(raw))
        # "or ''" is defensive in case a page yields no text object.
        return "".join(f"{page.extract_text() or ''}\n\n" for page in reader.pages)

    try:
        import fitz  # PyMuPDF
    except ImportError:
        return None

    doc = fitz.open(stream=raw, filetype="pdf")
    try:
        return "".join(f"{page.get_text()}\n\n" for page in doc)
    finally:
        doc.close()


def extract_text_from_file_content(file_content: bytes, file_name: str, content_type: str = None) -> str:
    """
    Extract text from a file's binary content, choosing the strategy by file extension.

    The extension check is case-insensitive. Plain-text formats are decoded,
    Excel and CSV files are rendered through pandas, and PDFs are read with
    PyPDF2 or PyMuPDF. When a required library is not installed, or the format
    is not supported, a bracketed placeholder string is returned instead.

    Args:
        file_content: Binary content of the file
        file_name: File name; its extension selects the extraction strategy
        content_type: Optional MIME type of the file (currently not used)

    Returns:
        The extracted text, or a bracketed placeholder string

    Raises:
        FileExtractionError: If reading or parsing the content fails. A PDF
            error is raised once with its own message rather than being
            wrapped a second time.
    """
    try:
        lowered_name = file_name.lower()

        # Plain text files
        if lowered_name.endswith(_PLAIN_TEXT_EXTENSIONS):
            return _decode_text_bytes(file_content)

        # Excel files
        if lowered_name.endswith(('.xlsx', '.xls')):
            if pd is None:
                return f"[Excel-Datei: {file_name} - pandas nicht installiert]"
            return _format_table_summary("Excel", pd.read_excel(BytesIO(file_content)))

        # CSV files
        if lowered_name.endswith('.csv'):
            if pd is None:
                return f"[CSV-Datei: {file_name} - pandas nicht installiert]"
            return _format_table_summary("CSV", _read_csv_with_fallback(file_content))

        # PDF files
        if lowered_name.endswith('.pdf'):
            try:
                pdf_text = _extract_pdf_text(file_content)
            except Exception as e:
                raise FileExtractionError(
                    f"Fehler beim Lesen der PDF-Datei {file_name}: {str(e)}"
                ) from e
            if pdf_text is None:
                return f"[PDF: {file_name} - Keine PDF-Bibliothek installiert]"
            return pdf_text

        # Everything else
        return f"[Datei: {file_name} - Textextraktion nicht unterstützt]"

    except FileExtractionError as e:
        # Already carries a specific message; log it once and pass it through.
        logger.error(str(e))
        raise
    except Exception as e:
        logger.error(f"Fehler beim Extrahieren von Text aus {file_name}: {str(e)}")
        raise FileExtractionError(
            f"Fehler beim Extrahieren von Text aus {file_name}: {str(e)}"
        ) from e

def _pdf_image_size(base_image, image_bytes: bytes) -> str:
    """Return "<width>x<height>" for an extracted PDF image, or "unbekannt".

    Uses the width/height entries of the extraction result when both are
    present; otherwise the image bytes are opened in memory with Pillow.
    """
    try:
        if 'width' in base_image and 'height' in base_image:
            return f"{base_image['width']}x{base_image['height']}"

        from PIL import Image
        with Image.open(BytesIO(image_bytes)) as pil_image:
            width, height = pil_image.size
            return f"{width}x{height}"
    except Exception as e:
        logger.warning(f"Konnte Bildgröße nicht ermitteln: {str(e)}")
        return "unbekannt"


async def extract_and_analyze_pdf_images(
    pdf_content: bytes,
    prompt: str,
    ai_service
) -> List[Dict[str, Any]]:
    """
    Extract the images embedded in a PDF and analyze each one with the AI service.

    Works on binary data only: the PDF is opened from memory and the image
    bytes are passed straight to ``ai_service.analyze_image``; nothing is
    written to disk. The PDF document is closed when processing ends.

    Args:
        pdf_content: Binary data of the PDF file
        prompt: Prompt passed to the image analysis
        ai_service: Object providing an awaitable ``analyze_image(image_data,
            prompt, mime_type)``

    Returns:
        One dict per processed image with the keys ``page`` (1-based),
        ``image_index``, ``format``, ``image_size`` and ``response``. If the
        analysis of an image fails, ``response`` holds an error placeholder;
        if the extraction of an image fails, that image is skipped.

    Raises:
        FileExtractionError: If PyMuPDF is not installed or the PDF cannot be
            processed.
    """
    try:
        import fitz  # PyMuPDF
    except ImportError as e:
        logger.error("PyMuPDF (fitz) ist nicht installiert. Installiere es mit 'pip install pymupdf'")
        raise FileExtractionError("PyMuPDF (fitz) ist nicht installiert") from e

    image_responses = []
    doc = None

    try:
        # Open the PDF from memory
        doc = fitz.open(stream=pdf_content, filetype="pdf")
        logger.info(f"PDF geöffnet mit {len(doc)} Seiten")

        for page_num, page in enumerate(doc, 1):
            # Find all images on the page
            image_list = page.get_images(full=True)

            if image_list:
                logger.info(f"Seite {page_num}: {len(image_list)} Bilder gefunden")

            for img_index, image_entry in enumerate(image_list):
                try:
                    # The first element of each entry is the image's xref
                    xref = image_entry[0]

                    # Extract image data and metadata
                    base_image = doc.extract_image(xref)
                    image_bytes = base_image["image"]  # Raw image data
                    image_ext = base_image["ext"]      # File extension (jpg, png, etc.)

                    # A failed analysis must not drop the image from the result
                    try:
                        analysis_result = await ai_service.analyze_image(
                            image_data=image_bytes,
                            prompt=prompt,
                            mime_type=f"image/{image_ext}"
                        )
                        logger.debug(f"Bildanalyse für Bild {img_index} auf Seite {page_num} abgeschlossen")
                    except Exception as analyze_error:
                        logger.error(f"Fehler bei der Bildanalyse: {str(analyze_error)}")
                        analysis_result = f"[Fehler bei der Bildanalyse: {str(analyze_error)}]"

                    image_responses.append({
                        "page": page_num,
                        "image_index": img_index,
                        "format": image_ext,
                        "image_size": _pdf_image_size(base_image, image_bytes),
                        "response": analysis_result
                    })

                except Exception as e:
                    logger.warning(f"Fehler bei der Extraktion von Bild {img_index} auf Seite {page_num}: {str(e)}")
                    continue

        logger.info(f"Extrahiert und analysiert: {len(image_responses)} Bilder aus PDF")

    except Exception as e:
        logger.error(f"Fehler beim Extrahieren von PDF-Bildern: {str(e)}")
        raise FileExtractionError(f"Fehler beim Extrahieren von PDF-Bildern: {str(e)}") from e
    finally:
        # Release the document even when processing failed part-way
        if doc is not None:
            try:
                doc.close()
            except Exception as e:
                logger.warning(f"Konnte PDF-Dokument nicht schließen: {str(e)}")

    return image_responses


def add_file_to_message(message: Dict[str, Any], file_data: Dict[str, Any]) -> Dict[str, Any]:
    """
    Attach a file to a message as an entry in its ``documents`` list.

    The message is modified in place and also returned. A file whose id
    already appears as the source id of an existing document is not added
    again. When ``file_data`` carries no id, a fresh ``file_<uuid>`` id is
    generated and used for both the document and its source, so such a file
    is never mistaken for a duplicate.

    Args:
        message: The message to extend
        file_data: File metadata and content (keys read: id, name,
            content_type, size, upload_date, content)

    Returns:
        The same message object, with the file attached unless it was a duplicate
    """
    logger.info(f"Adding file to message: {file_data.get('name', 'unnamed_file')} (ID: {file_data.get('id', 'unknown')})")

    # Initialize the documents list if it is missing or unset
    if message.get("documents") is None:
        message["documents"] = []
        logger.debug("Initialized empty documents array in message")
    documents = message["documents"]

    # Resolve the id once so that the duplicate check and the stored document
    # agree on it
    file_id = file_data.get("id")
    if file_id is None:
        file_id = f"file_{uuid.uuid4()}"

    # Skip files that are already attached
    already_attached = any(
        (doc.get("source") or {}).get("id") == file_id
        for doc in documents
    )
    if already_attached:
        logger.info(f"File {file_data.get('name')} already exists in message, skipping")
        return message

    # Normalize the size: numeric strings become ints; if no size is given it
    # is estimated as the length of the content
    file_size = file_data.get("size")
    content = file_data.get("content")
    if isinstance(file_size, str) and file_size.isdigit():
        file_size = int(file_size)
    elif file_size is None and content:
        file_size = len(content)

    # Standard document structure
    document = {
        "id": file_id,
        "source": {
            "type": "file",
            "id": file_id,
            "name": file_data.get("name", "unnamed_file"),
            "content_type": file_data.get("content_type"),
            "size": file_size,
            "upload_date": file_data.get("upload_date", datetime.now().isoformat())
        },
        "contents": [
            {
                "type": "text",
                "text": file_data.get("content", "No content available")
            }
        ]
    }

    # default=str keeps this debug line from raising on values that are not
    # JSON-serializable (for example a datetime upload date)
    logger.debug(f"Created document structure: {json.dumps({k: v for k, v in document.items() if k != 'contents'}, default=str)}")

    documents.append(document)
    logger.info(f"File {file_data.get('name')} successfully added to message (total: {len(documents)} files)")

    return message


def extract_files_from_message(message: Dict[str, Any]) -> List[Dict[str, Any]]:
    """
    Collect the file entries attached to a message.

    Only documents whose source type is "file" are returned; all other
    documents are skipped.

    Args:
        message: The message whose documents are inspected

    Returns:
        List of dicts with id, name, content_type and size, plus ``content``
        when the document has a text content part (the first one is used)
    """
    extracted = []

    if "documents" not in message:
        logger.debug("No documents found in message")
        return extracted

    logger.debug(f"Extracting files from message with {len(message.get('documents', []))} documents")

    for document in message.get("documents", []):
        source = document.get("source", {})

        # Only file documents are of interest
        if source.get("type") != "file":
            logger.debug(f"Skipping non-file document of type: {source.get('type')}")
            continue

        entry = {
            "id": source.get("id", f"file_{uuid.uuid4()}"),
            "name": source.get("name", "unnamed_file"),
            "content_type": source.get("content_type"),
            "size": source.get("size")
        }

        # Take the text of the first text part, if there is one
        first_text_part = next(
            (part for part in document.get("contents", []) if part.get("type") == "text"),
            None,
        )
        if first_text_part is not None:
            entry["content"] = first_text_part.get("text", "")

        logger.debug(f"Extracted file: {entry.get('name')} (ID: {entry.get('id')})")
        extracted.append(entry)

    logger.info(f"Extracted {len(extracted)} files from message")
    return extracted


async def read_file_contents(
    file_contexts: List[Dict[str, Any]],
    lucydom_interface,
    workflow_id: str = None,
    add_log_func = None,
    ai_service = None  # AI service parameter for image analysis
) -> Dict[str, str]:
    """
    Read the content of every file and turn it into text for further processing.

    Content is fetched through ``lucydom_interface.read_file_content`` rather
    than from the file system. Images (type "image" or a known image
    extension) are described by ``ai_service.analyze_image`` when an AI
    service is given; files of type "document", or with an empty type, go
    through ``extract_text_from_file_content``; any other type only yields a
    placeholder. A failure on one file is logged and recorded as a placeholder
    string, and the remaining files are still processed.

    Args:
        file_contexts: List of file contexts; each must contain "id" and "name"
        lucydom_interface: Object providing an awaitable ``read_file_content(file_id)``
        workflow_id: Optional workflow id used for logging
        add_log_func: Optional callback that receives the log entries
        ai_service: Optional AI service used for image analysis

    Returns:
        Dictionary mapping file id to extracted text, analysis or placeholder
    """
    file_contents = {}

    logger.info(f"Reading contents of {len(file_contexts)} files for workflow {workflow_id}")

    for file in file_contexts:
        file_id = file["id"]
        file_name = file["name"]
        # A context without a "type" key is treated as "unknown", not as a document
        file_type = file.get("type", "unknown")

        try:
            # Fetch the file content through the LucyDOM interface
            file_data = await lucydom_interface.read_file_content(file_id)

            if not file_data:
                _log(add_log_func, workflow_id, f"Datei {file_name} nicht gefunden", "warning")
                file_contents[file_id] = "File content not available (File not found)"
                continue

            logger.info(f"Successfully read file: {file_name} (ID: {file_id}, Type: {file_type})")

            # Tolerate a missing name when checking the extension
            has_image_extension = (file_name or "").lower().endswith(
                ('.jpg', '.jpeg', '.png', '.gif', '.webp')
            )

            # Image files - perform image analysis if an AI service is available
            if file_type == "image" or has_image_extension:
                if ai_service:
                    try:
                        image_analysis = await ai_service.analyze_image(
                            image_data=file_data,
                            prompt="Describe this image in detail",
                            mime_type=file.get("content_type")
                        )

                        logger.debug(f"Image analysis successfully generated for {file_name}")

                        file_contents[file_id] = f"Image Analysis:\n{image_analysis}"
                        _log(add_log_func, workflow_id, f"Image {file_name} analyzed successfully", "info")
                    except Exception as e:
                        logger.error(f"Error analyzing image {file_name}: {str(e)}")
                        _log(add_log_func, workflow_id, f"Error analyzing image {file_name}: {str(e)}", "error")
                        file_contents[file_id] = f"Image file: {file_name} (Analysis failed: {str(e)})"
                else:
                    file_contents[file_id] = f"Image file: {file_name} (AI analysis not available)"

            # Document files
            elif file_type == "document" or not file_type:
                content = extract_text_from_file_content(file_data, file_name, file.get("content_type"))
                file_contents[file_id] = content
                _log(add_log_func, workflow_id, f"File {file_name} read successfully", "info")

            # Other file types - only a placeholder is stored
            else:
                file_contents[file_id] = f"File: {file_name} (Type: {file_type}, content not available)"
                _log(add_log_func, workflow_id, f"Unsupported file type: {file_type} for {file_name}", "warning")

        except Exception as e:
            logger.error(f"Error reading file {file_name}: {str(e)}")
            _log(add_log_func, workflow_id, f"Error reading file {file_name}: {str(e)}", "error")
            file_contents[file_id] = f"File content not available (Error: {str(e)})"

    return file_contents


def _log(add_log_func, workflow_id, message, log_type, agent_id=None, agent_name=None):
    """Hilfsfunktion zum Loggen mit unterschiedlichen Log-Funktionen"""
    # Log über die Logger-Instanz
    if log_type == "error":
        logger.error(message)
    elif log_type == "warning":
        logger.warning(message)
    else:
        logger.info(message)

    # Log über die bereitgestellte Log-Funktion (falls vorhanden)
    if add_log_func and workflow_id:
        add_log_func(workflow_id, message, log_type, agent_id, agent_name)