gateway/gwserver/modules/agentservice_part_filehandling.py
2025-03-26 13:02:18 +01:00

376 lines
No EOL
19 KiB
Python

import os
import logging
import pandas as pd
from typing import Dict, Any, List, Optional, Tuple
# Configure the module-level logger for this file
logger = logging.getLogger(__name__)
async def read_file_contents(
    file_contexts: List[Dict[str, Any]],
    upload_dir: str,
    workflow_id: Optional[str] = None,
    add_log_func=None,
    ai_service=None  # Optional AI service used for image / PDF-image analysis
) -> Dict[str, str]:
    """
    Read the contents of all files, with image and document analysis.

    Dispatches per file type to dedicated helpers: images are analyzed via
    the AI service (when available); txt/md/json/... files are read as text;
    Excel/CSV files are rendered via pandas; PDFs are extracted with PyPDF2
    (falling back to PyMuPDF); anything else gets a best-effort text read.

    Args:
        file_contexts: List of file contexts with metadata (id, name, type, path)
        upload_dir: Directory for uploads (used to resolve missing paths)
        workflow_id: Optional ID of the workflow for logging
        add_log_func: Optional function for adding logs
        ai_service: Optional AI service for image analysis

    Returns:
        Dictionary mapping file_id -> extracted content, or an explanatory
        placeholder string when the content could not be read.
    """
    file_contents: Dict[str, str] = {}
    for file in file_contexts:
        file_id = file["id"]
        file_name = file["name"]
        file_type = file.get("type", "unknown")
        file_path = file.get("path", "")

        # If path is not set, try to derive it from the upload directory.
        if not file_path and file_name:
            possible_path = os.path.join(upload_dir, file_name)
            if os.path.exists(possible_path):
                file_path = possible_path
                file["path"] = file_path  # Update the path in the context
                logger.debug(f"Found path for file {file_name}: {file_path}")

        # No usable path -> record a placeholder and continue with the next file.
        if not file_path or not os.path.exists(file_path):
            if file_path:
                _log(add_log_func, workflow_id, f"File {file_name} not found: {file_path}", "warning")
            else:
                _log(add_log_func, workflow_id, f"No path available for file {file_name}", "warning")
            file_contents[file_id] = "File content not available (File not found)"
            continue

        try:
            # Image files - always perform image analysis if the AI service is available.
            if file_type == "image" or file_name.lower().endswith(('.jpg', '.jpeg', '.png', '.gif', '.webp')):
                file_contents[file_id] = await _read_image_content(
                    file_path, file_name, workflow_id, add_log_func, ai_service)
            # Document files
            elif file_type == "document" or not file_type:
                if file_name.endswith(('.txt', '.md', '.json', '.xml', '.html', '.htm', '.css', '.js')):
                    file_contents[file_id] = _read_plain_text(file_path, file_name, workflow_id, add_log_func)
                elif file_name.endswith(('.xlsx', '.xls')):
                    file_contents[file_id] = _read_excel_content(file_path, file_name, workflow_id, add_log_func)
                elif file_name.endswith('.csv'):
                    file_contents[file_id] = _read_csv_content(file_path, file_name, workflow_id, add_log_func)
                elif file_name.endswith('.pdf'):
                    file_contents[file_id] = await _read_pdf_content(
                        file_path, file_name, workflow_id, add_log_func, ai_service)
                else:
                    file_contents[file_id] = _read_unknown_document(file_path, file_name, workflow_id, add_log_func)
            # Other file types - just store metadata.
            else:
                file_contents[file_id] = f"File: {file_name} (Type: {file_type}, content not available)"
                _log(add_log_func, workflow_id, f"Unsupported file type: {file_type} for {file_name}", "warning")
        except Exception as e:
            logger.error(f"Error reading file {file_name}: {str(e)}")
            _log(add_log_func, workflow_id, f"Error reading file {file_name}: {str(e)}", "error")
            file_contents[file_id] = f"File content not available (Error: {str(e)})"
    return file_contents


async def _read_image_content(file_path, file_name, workflow_id, add_log_func, ai_service) -> str:
    """Analyze an image with the AI service; return a placeholder when unavailable or failed."""
    if not ai_service:
        return f"Image file: {file_name} (AI analysis not available)"
    try:
        _log(add_log_func, workflow_id, f"Analyzing image {file_name}...", "info")
        image_analysis = await ai_service.analyze_image(file_path, "Describe this image in detail")
        _log(add_log_func, workflow_id, f"Image {file_name} analyzed successfully", "info")
        return f"Image Analysis:\n{image_analysis}"
    except Exception as e:
        logger.error(f"Error analyzing image {file_name}: {str(e)}")
        _log(add_log_func, workflow_id, f"Error analyzing image {file_name}: {str(e)}", "error")
        return f"Image file: {file_name} (Analysis failed: {str(e)})"


def _read_plain_text(file_path, file_name, workflow_id, add_log_func) -> str:
    """Read a simple text-based file as UTF-8 (invalid bytes are replaced)."""
    with open(file_path, 'r', encoding='utf-8', errors='replace') as f:
        content = f.read()
    _log(add_log_func, workflow_id, f"Text file {file_name} read successfully", "info")
    return content


def _read_excel_content(file_path, file_name, workflow_id, add_log_func) -> str:
    """Render an Excel sheet as a text summary plus the full table."""
    try:
        df = pd.read_excel(file_path)
        result = f"Excel file with {len(df)} rows and {len(df.columns)} columns.\n"
        result += f"Columns: {', '.join(df.columns.tolist())}\n\n"
        result += df.to_string(index=False)  # Full table
        _log(add_log_func, workflow_id, f"Excel file {file_name} read successfully", "info")
        return result
    except Exception as e:
        logger.error(f"Error reading Excel file {file_name}: {str(e)}")
        _log(add_log_func, workflow_id, f"Error reading Excel file {file_name}: {str(e)}", "error")
        return f"Excel file: {file_name} (Reading failed: {str(e)})"


def _read_csv_content(file_path, file_name, workflow_id, add_log_func) -> str:
    """Render a CSV file as a text summary plus the full table, trying several encodings."""
    try:
        # Try various encodings for robust CSV parsing.
        try:
            df = pd.read_csv(file_path, encoding='utf-8')
        except UnicodeDecodeError:
            try:
                df = pd.read_csv(file_path, encoding='latin1')
            except Exception:  # narrowed from a bare except; keep the cp1252 fallback
                df = pd.read_csv(file_path, encoding='cp1252')
        result = f"CSV file with {len(df)} rows and {len(df.columns)} columns.\n"
        result += f"Columns: {', '.join(df.columns.tolist())}\n\n"
        result += df.to_string(index=False)  # Full table
        _log(add_log_func, workflow_id, f"CSV file {file_name} read successfully", "info")
        return result
    except Exception as e:
        logger.error(f"Error reading CSV file {file_name}: {str(e)}")
        _log(add_log_func, workflow_id, f"Error reading CSV file {file_name}: {str(e)}", "error")
        return f"CSV file: {file_name} (Reading failed: {str(e)})"


async def _read_pdf_content(file_path, file_name, workflow_id, add_log_func, ai_service) -> str:
    """Extract PDF text with PyPDF2 (falling back to PyMuPDF) and optionally analyze embedded images."""
    try:
        try:
            from PyPDF2 import PdfReader
        except ImportError:
            # PyPDF2 not installed - try the PyMuPDF fallback instead.
            return _read_pdf_with_pymupdf(file_path, file_name, workflow_id, add_log_func)
        reader = PdfReader(file_path)
        num_pages = len(reader.pages)
        text = ""
        for page in reader.pages:
            text += page.extract_text() + "\n\n"
        # If the AI service is available, also analyze images embedded in the PDF.
        if ai_service:
            text += await _analyze_pdf_images(file_path, file_name, workflow_id, add_log_func, ai_service)
        _log(add_log_func, workflow_id, f"PDF file {file_name} read successfully", "info")
        return f"PDF with {num_pages} pages.\nContent:\n{text}"
    except Exception as e:
        logger.error(f"Error reading PDF file {file_name}: {str(e)}")
        _log(add_log_func, workflow_id, f"Error reading PDF file {file_name}: {str(e)}", "error")
        return f"PDF file: {file_name} (Reading failed: {str(e)})"


def _read_pdf_with_pymupdf(file_path, file_name, workflow_id, add_log_func) -> str:
    """Fallback PDF text extraction via PyMuPDF when PyPDF2 is not installed."""
    try:
        import fitz  # PyMuPDF
    except ImportError:
        _log(add_log_func, workflow_id,
             "No PDF library installed. Cannot extract PDF content.", "warning")
        return "PDF file (content not available, PDF libraries missing)"
    doc = fitz.open(file_path)
    text = ""
    for page in doc:
        text += page.get_text() + "\n\n"
    _log(add_log_func, workflow_id, f"PDF file {file_name} read with PyMuPDF", "info")
    return f"PDF with {len(doc)} pages.\nContent:\n{text}"


async def _analyze_pdf_images(file_path, file_name, workflow_id, add_log_func, ai_service) -> str:
    """AI-analyze images embedded in a PDF; returns '' when there are none or analysis fails."""
    _log(add_log_func, workflow_id, f"Analyzing PDF images in {file_name}...", "info")
    try:
        image_analysis_results = await ai_service.extract_and_analyze_pdf_images(
            file_path,
            "Describe this image in the context of the document"
        )
        if not image_analysis_results:
            return ""
        image_analysis_text = "\n\n=== PDF IMAGE ANALYSIS ===\n"
        for result in image_analysis_results:
            image_analysis_text += f"\nImage on page {result['page']}: {result['response']}\n"
        _log(add_log_func, workflow_id,
             f"Successfully analyzed {len(image_analysis_results)} images in PDF",
             "info")
        return image_analysis_text
    except Exception as img_error:
        # Image analysis is best-effort: log and continue with text-only content.
        logger.error(f"Error analyzing PDF images: {str(img_error)}")
        _log(add_log_func, workflow_id,
             f"Error analyzing PDF images: {str(img_error)}",
             "warning")
        return ""


def _read_unknown_document(file_path, file_name, workflow_id, add_log_func) -> str:
    """Best-effort read of an unknown document type as text; flag binary/unreadable files."""
    try:
        # Readability probe: an unreadable file surfaces here as an explicit
        # error message rather than the generic "binary" fallback below.
        with open(file_path, 'rb') as f:
            f.read(8)
        try:
            with open(file_path, 'r', encoding='utf-8', errors='replace') as f:
                content = f.read()
            _log(add_log_func, workflow_id, f"File {file_name} read as text", "info")
            return content
        except Exception:
            _log(add_log_func, workflow_id, f"File {file_name} appears to be binary or has unknown format", "warning")
            return "File content not available (Binary or unsupported format)"
    except Exception as e:
        logger.error(f"Error processing file {file_name}: {str(e)}")
        _log(add_log_func, workflow_id, f"Error processing file {file_name}: {str(e)}", "error")
        return f"File content not available (Error: {str(e)})"
def format_file_context_text(file_contexts: List[Dict[str, Any]], file_contents: Dict[str, str]) -> str:
    """
    Build a formatted text representation of all files and their contents.

    Args:
        file_contexts: List of file contexts with metadata (id, name, type, size)
        file_contents: Dictionary with file contents (file_id -> content)

    Returns:
        Formatted text containing the file list followed by one content
        section per file.
    """
    # Map id -> name once; the first occurrence of an id wins.
    names_by_id: Dict[str, str] = {}
    for ctx in file_contexts:
        names_by_id.setdefault(ctx['id'], ctx['name'])

    entries = [
        f"- {ctx['name']} ({ctx['type']}, {ctx['size']}, ID: {ctx['id']})"
        for ctx in file_contexts
    ]
    parts = ["Verfügbare Dateien:\n" + "\n".join(entries)]

    # Append each file's content (no length limit).
    for file_id, content in file_contents.items():
        file_name = names_by_id.get(file_id, "Unbekannte Datei")
        parts.append(f"\n\n==== DATEIINHALT: {file_name} (ID: {file_id}) ====\n{content}")
    return "".join(parts)
def prepare_file_contexts(files: List[Dict[str, Any]], upload_dir: str) -> List[Dict[str, Any]]:
    """
    Prepare the file contexts and resolve the full file paths.

    Args:
        files: List of files with metadata (dict with id, name, type, optional size/path)
        upload_dir: Directory for uploads

    Returns:
        List of file contexts with complete paths (empty path if unresolved).
    """
    contexts: List[Dict[str, Any]] = []
    for entry in files:
        name = entry["name"]
        path = entry.get("path", "")
        # If no path was given, try deriving it from the upload directory.
        if not path and name:
            candidate = os.path.join(upload_dir, name)
            if os.path.exists(candidate):
                path = candidate
                logger.debug(f"Pfad für Datei {name} gefunden: {path}")
        contexts.append({
            "id": entry["id"],
            "name": name,
            "type": entry["type"],
            "size": entry.get("size", "Unbekannt"),
            "path": path,
        })
    return contexts
async def prepare_message_for_ai(
    file_contexts: List[Dict[str, Any]],
    prompt_text: str,
    file_contents: Dict[str, str],
    service_aichat
) -> Dict[str, Any]:
    """
    Prepare a complete message with all file contents for the AI model.

    Delegates to the AI connector's parse_filedata and then verifies that
    every file is actually referenced in the resulting message, appending
    any content the connector missed. If the connector raises, a plain
    text fallback message is built manually.

    Args:
        file_contexts: List of file contexts with metadata (id, name, ...)
        prompt_text: The text prompt
        file_contents: Dictionary with file contents (file_id -> content)
        service_aichat: The AI service instance used to build the message

    Returns:
        A fully formatted message dict for the AI model.
    """
    try:
        message = await service_aichat.parse_filedata(file_contexts, prompt_text, file_contents)
        # Ensure file contents are correctly integrated into list-style content.
        if isinstance(message, dict) and message.get("content") and isinstance(message["content"], list):
            for file_context in file_contexts:
                file_id = file_context["id"]
                file_name = file_context["name"]
                # A file counts as "mentioned" if its name appears in any text part
                # (this includes the prompt text itself).
                file_mentioned = any(
                    isinstance(item, dict)
                    and item.get("type") == "text"
                    and file_name in item.get("text", "")
                    for item in message["content"]
                )
                # If the file is not mentioned but we have its content, add it.
                if not file_mentioned and file_id in file_contents:
                    content = file_contents[file_id]
                    message["content"].append({
                        "type": "text",
                        "text": f"--- FILE: {file_name} ---\n\n{content}"
                    })
                    logger.info(f"Added missing file content for {file_name} to message")
        return message
    except Exception as e:
        logger.error(f"Error preparing message for AI: {str(e)}")
        # Fallback: build a plain text message manually. The content here is
        # always a string, so no list-handling is needed (the previous
        # isinstance-list branch was unreachable).
        content = prompt_text + "\n\n"
        if file_contents:
            file_content_text = "\n\n=== FILE CONTENTS ===\n\n"
            for file_id, body in file_contents.items():
                # Find the file name from the contexts; fall back to the id.
                file_name = next((f["name"] for f in file_contexts if f["id"] == file_id), f"File {file_id}")
                file_content_text += f"--- FILE: {file_name} ---\n\n{body}\n\n"
            content += file_content_text
        return {"role": "user", "content": content}
def _log(add_log_func, workflow_id, message, log_type, agent_id=None, agent_name=None):
    """Helper for logging through the module logger and an optional callback.

    Args:
        add_log_func: Optional callback taking (workflow_id, message, log_type, agent_id, agent_name)
        workflow_id: Workflow the message belongs to; the callback is skipped if falsy
        message: The log message text
        log_type: "error", "warning", or anything else (treated as info)
        agent_id: Optional agent id forwarded to the callback
        agent_name: Optional agent name forwarded to the callback
    """
    # Route to the matching logger level; anything unrecognized is info.
    emit = {"error": logger.error, "warning": logger.warning}.get(log_type, logger.info)
    emit(message)
    # Forward to the provided log function, if both it and a workflow id exist.
    if add_log_func and workflow_id:
        add_log_func(workflow_id, message, log_type, agent_id, agent_name)
# The following functions are no longer needed because partial file loading
# has been removed. They are kept here commented out and could be reactivated later.
"""
def parse_file_access_commands(agent_text: str) -> List[Dict[str, Any]]:
    # Diese Funktion wird vorerst nicht benötigt
    return []

def load_additional_file_content(
    workflow_id: str,
    file_id: str,
    file_contents: Dict[str, str],
    file_contexts: List[Dict[str, Any]],
    add_log_func = None,
    read_complete: bool = False,
    start_pos: int = None,
    end_pos: int = None,
    page_numbers: List[int] = None
) -> Optional[str]:
    # Diese Funktion wird vorerst nicht benötigt
    return None
"""