gateway/gwserver/modules/agentservice_dataextraction.py

"""
Refactored helper function for intelligent data extraction (continued).
"""

import logging
import json
from typing import List, Dict, Any, Optional, Tuple
import asyncio
from datetime import datetime

logger = logging.getLogger(__name__)

async def data_extraction(
    prompt: str,
    files: List[Dict[str, Any]],
    messages: List[Dict[str, Any]],
    ai_service,
    lucydom_interface = None,
    workflow_id: str = None,
    add_log_func = None
) -> Dict[str, Any]:
    """
    Führt einen AI Call durch, um zu bestimmen, welche Inhalte aus welchen Dateiobjekten extrahiert werden sollen,
    und führt dann die notwendigen Extraktionen durch.

    Args:
        prompt: Spezifizierung, welche Daten extrahiert werden sollen
        files: Liste aller verfügbaren Dateien mit Metadaten
        messages: Liste aller Nachrichten im Workflow
        ai_service: Service für KI-Anfragen
        lucydom_interface: Interface für Datenbankzugriffe (optional)
        workflow_id: Optionale ID des Workflows für Logging
        add_log_func: Optionale Funktion für das Hinzufügen von Logs

    Returns:
        Strukturiertes Text-Objekt mit extrahierten Daten und Kontext-Informationen
    """
    try:
        # 1. AI Call zur Bestimmung der notwendigen Extraktionen
        extraction_plan = await _create_extraction_plan(prompt, files, messages, ai_service, workflow_id, add_log_func)

        # 2. Extraktionen durchführen
        extracted_data = await _execute_extractions(
            extraction_plan,
            files,
            messages,
            lucydom_interface,
            ai_service,
            workflow_id,
            add_log_func
        )

        # 3. Extrahierte Daten strukturieren
        structured_result = _structure_extracted_data(extracted_data, files, prompt)

        return structured_result

    except Exception as e:
        logger.error(f"Fehler bei der Datenextraktion: {str(e)}", exc_info=True)

        # Fehler-Log hinzufügen
        if add_log_func and workflow_id:
            add_log_func(workflow_id, f"Fehler bei der Datenextraktion: {str(e)}", "error")

        # Fehler-Ergebnis zurückgeben
        return {
            "error": str(e),
            "status": "error",
            "files_processed": len(files),
            "message": f"Die Datenextraktion konnte nicht durchgeführt werden: {str(e)}"
        }

async def _create_extraction_plan(
    prompt: str,
    files: List[Dict[str, Any]],
    messages: List[Dict[str, Any]],
    ai_service,
    workflow_id: str = None,
    add_log_func = None
) -> List[Dict[str, Any]]:
    """
    Erstellt einen Extraktionsplan mit AI-Unterstützung.

    Args:
        prompt: Spezifizierung, welche Daten extrahiert werden sollen
        files: Liste aller verfügbaren Dateien mit Metadaten
        messages: Liste aller Nachrichten im Workflow
        ai_service: Service für KI-Anfragen
        workflow_id: Optionale ID des Workflows für Logging
        add_log_func: Optionale Funktion für das Hinzufügen von Logs

    Returns:
        Extraktionsplan (Liste von Extraktionsanweisungen pro Datei)
    """
    # Erstelle Kontext-Informationen für den AI Call
    file_infos = []
    for file in files:
        # Basis-Metadaten
        file_info = {
            "id": file.get("id", ""),
            "name": file.get("name", ""),
            "type": file.get("type", ""),
            "content_type": file.get("content_type", ""),
            "size": file.get("size", "")
        }

        # Extraktionsstatus prüfen (falls vorhanden)
        doc_contents = _extract_document_contents_from_messages(file.get("id", ""), messages)

        if doc_contents:
            # Prüfen, ob mindestens ein Content mit is_extracted=True existiert
            already_extracted = any(
                content.get("is_extracted", False) for content in doc_contents
            )
            file_info["already_extracted"] = already_extracted

            # Eine kurze Vorschau des Inhalts hinzufügen (falls verfügbar)
            for content in doc_contents:
                if content.get("type") == "text" and content.get("text"):
                    preview_text = content.get("text", "")[:200] + "..." if len(content.get("text", "")) > 200 else content.get("text", "")
                    file_info["content_preview"] = preview_text
                    break
        else:
            file_info["already_extracted"] = False

        file_infos.append(file_info)

    # AI-Prompt erstellen
    extraction_prompt = f"""
    Du bist ein Datenextraktionsexperte, der mithilfe von KI-Analyse entscheidet, welche Dateien
    und Inhalte für eine bestimmte Aufgabe extrahiert werden müssen.

    AUFGABE:
    {prompt}

    VERFÜGBARE DATEIEN:
    {json.dumps(file_infos, indent=2)}

    Für jede Datei, die für die Aufgabe relevant ist, erstelle eine Extraktionsanweisung mit den folgenden Informationen:
    1. file_id: Die ID der zu extrahierenden Datei
    2. extract_needed: Boolean, ob eine Extraktion erforderlich ist (True, wenn die Datei noch nicht extrahiert wurde und für die Aufgabe benötigt wird)
    3. extraction_prompt: Ein spezifischer Prompt für die Extraktion der Datei (besonders wichtig für Bilder und nicht-textbasierte Dateien)
    4. importance: Priorität/Wichtigkeit für die Aufgabe (1-5, wobei 5 am wichtigsten ist)

    Format:
    [
      {{
        "file_id": 1234,
        "extract_needed": true,
        "extraction_prompt": "Extrahiere die Tabellendaten mit Fokus auf die Umsatzzahlen",
        "importance": 5
      }},
      ...
    ]

    Gib nur das JSON-Array zurück, ohne weitere Erklärungen.
    """

    # Log hinzufügen
    if add_log_func and workflow_id:
        add_log_func(workflow_id, "Extraktionsplan wird erstellt...", "info")

    try:
        # AI-Call durchführen
        extraction_plan_response = await ai_service.call_api([{"role": "user", "content": extraction_prompt}])

        # JSON aus der Antwort extrahieren
        import re
        json_match = re.search(r'\[.*\]', extraction_plan_response, re.DOTALL)

        if json_match:
            extraction_plan = json.loads(json_match.group(0))

            # Log hinzufügen
            if add_log_func and workflow_id:
                add_log_func(
                    workflow_id,
                    f"Extraktionsplan erstellt für {len(extraction_plan)} Dateien",
                    "info"
                )

            return extraction_plan
        else:
            # Fallback bei Parsing-Problemen
            if add_log_func and workflow_id:
                add_log_func(
                    workflow_id,
                    "Parsing-Fehler beim Extraktionsplan, erstelle Standard-Plan",
                    "warning"
                )

            # Standard-Plan: Alle nicht extrahierten Dateien extrahieren
            default_plan = []
            for file in files:
                doc_contents = _extract_document_contents_from_messages(file.get("id", ""), messages)
                already_extracted = any(
                    content.get("is_extracted", False) for content in doc_contents
                ) if doc_contents else False

                default_plan.append({
                    "file_id": file.get("id", 0),
                    "extract_needed": not already_extracted,
                    "extraction_prompt": f"Extrahiere alle relevanten Informationen aus {file.get('name', '')}",
                    "importance": 3
                })

            return default_plan

    except Exception as e:
        logger.error(f"Fehler bei der Erstellung des Extraktionsplans: {str(e)}", exc_info=True)

        if add_log_func and workflow_id:
            add_log_func(
                workflow_id,
                f"Fehler bei der Erstellung des Extraktionsplans: {str(e)}",
                "error"
            )

        # Leerer Plan bei Fehlern
        return []

async def _execute_extractions(
    extraction_plan: List[Dict[str, Any]],
    files: List[Dict[str, Any]],
    messages: List[Dict[str, Any]],
    lucydom_interface,
    ai_service,
    workflow_id: str = None,
    add_log_func = None,
    logging_utils = None
) -> List[Dict[str, Any]]:
    """
    Execute the planned extractions.

    Args:
        extraction_plan: List of extraction instructions
        files: List of all available files
        lucydom_interface: Interface for database access
        ai_service: Service for AI requests
        workflow_id: Optional workflow ID for logging
        add_log_func: Optional function for adding logs
        logging_utils: Optional logging utility

    Returns:
        List with extracted data per file
    """
    extracted_data = []

    # Sort by importance
    sorted_plan = sorted(extraction_plan, key=lambda x: x.get("importance", 0), reverse=True)

    for extraction_item in sorted_plan:
        file_id = extraction_item.get("file_id")
        extract_needed = extraction_item.get("extract_needed", False)
        extraction_prompt = extraction_item.get("extraction_prompt", "")

        # Find file metadata
        file_metadata = next((f for f in files if f.get("id") == file_id), None)

        if not file_metadata:
            logger.warning(f"File with ID {file_id} not found")
            continue

        file_name = file_metadata.get("name", "")
        file_type = file_metadata.get("type", "")
        content_type = file_metadata.get("content_type", "")

        # Add log
        if logging_utils:
            logging_utils.info(f"Processing file: {file_name} (Extraction needed: {extract_needed})", "extraction")
        elif add_log_func and workflow_id:
            add_log_func(
                workflow_id,
                f"Processing file: {file_name} (Extraction needed: {extract_needed})",
                "info"
            )

        # Only perform extraction if needed
        if extract_needed:
            # Get file content via LucyDOM interface
            if lucydom_interface:
                try:
                    file_content = await lucydom_interface.read_file_content(file_id)

                    if not file_content:
                        if logging_utils:
                            logging_utils.warning(f"File {file_name} not found", "extraction")
                        elif add_log_func and workflow_id:
                            add_log_func(workflow_id, f"File {file_name} not found", "warning")
                        continue

                    # Perform extraction based on file type
                    if file_type == "image" or file_name.lower().endswith(('.jpg', '.jpeg', '.png', '.gif', '.webp')):
                        # Image analysis with AI service
                        if ai_service and hasattr(ai_service, "analyze_image"):
                            try:
                                image_analysis = await ai_service.analyze_image(
                                    image_data=file_content,
                                    prompt=extraction_prompt,
                                    mime_type=content_type
                                )

                                extracted_data.append({
                                    "file_id": file_id,
                                    "name": file_name,
                                    "type": file_type,
                                    "content": image_analysis,
                                    "is_extracted": True,
                                    "extraction_method": "image_analysis"
                                })

                                if logging_utils:
                                    logging_utils.info(f"Image {file_name} successfully analyzed", "extraction")
                                elif add_log_func and workflow_id:
                                    add_log_func(workflow_id, f"Image {file_name} successfully analyzed", "info")
                            except Exception as e:
                                logger.error(f"Error analyzing image {file_name}: {str(e)}")
                                if logging_utils:
                                    logging_utils.error(f"Error analyzing image {file_name}: {str(e)}", "extraction")
                                elif add_log_func and workflow_id:
                                    add_log_func(workflow_id, f"Error analyzing image {file_name}: {str(e)}", "error")
                        else:
                            # Fallback if no image analysis available
                            extracted_data.append({
                                "file_id": file_id,
                                "name": file_name,
                                "type": file_type,
                                "content": f"Image: {file_name} (Analysis not available)",
                                "is_extracted": False,
                                "extraction_method": "none"
                            })
                    else:
                        # Text-based extraction for all other file types
                        try:
                            # Import directly here to avoid circular imports
                            from modules.agentservice_utils import extract_text_from_file_content

                            content, is_extracted = extract_text_from_file_content(
                                file_content, file_name, content_type
                            )

                            extracted_data.append({
                                "file_id": file_id,
                                "name": file_name,
                                "type": file_type,
                                "content": content,
                                "is_extracted": is_extracted,
                                "extraction_method": "text_extraction"
                            })

                            if logging_utils:
                                logging_utils.info(f"File {file_name} extracted (Status: {is_extracted})", "extraction")
                            elif add_log_func and workflow_id:
                                add_log_func(
                                    workflow_id,
                                    f"File {file_name} extracted (Status: {is_extracted})",
                                    "info"
                                )
                        except Exception as e:
                            logger.error(f"Error extracting text from {file_name}: {str(e)}")
                            if logging_utils:
                                logging_utils.error(f"Error extracting text from {file_name}: {str(e)}", "extraction")
                            elif add_log_func and workflow_id:
                                add_log_func(workflow_id, f"Error extracting text from {file_name}: {str(e)}", "error")
                except Exception as e:
                    logger.error(f"Error reading file {file_name}: {str(e)}")
                    if logging_utils:
                        logging_utils.error(f"Error reading file {file_name}: {str(e)}", "extraction")
                    elif add_log_func and workflow_id:
                        add_log_func(workflow_id, f"Error reading file {file_name}: {str(e)}", "error")
            else:
                logger.warning(f"No LucyDOM interface available for file {file_name}")
                if logging_utils:
                    logging_utils.warning(f"No LucyDOM interface available for file {file_name}", "extraction")
                elif add_log_func and workflow_id:
                    add_log_func(workflow_id, f"No LucyDOM interface available for file {file_name}", "warning")
        else:
            # No extraction needed, use existing content
            doc_contents = _extract_document_contents_from_messages(file_id, messages)

            if doc_contents:
                # Use first text content
                for content in doc_contents:
                    if content.get("type") == "text":
                        extracted_data.append({
                            "file_id": file_id,
                            "name": file_name,
                            "type": file_type,
                            "content": content.get("text", ""),
                            "is_extracted": content.get("is_extracted", False),
                            "extraction_method": "existing_content"
                        })
                        break
            else:
                # No existing content found
                extracted_data.append({
                    "file_id": file_id,
                    "name": file_name,
                    "type": file_type,
                    "content": f"No content available for {file_name}",
                    "is_extracted": False,
                    "extraction_method": "none"
                })

    return extracted_data

def _structure_extracted_data(
    extracted_data: List[Dict[str, Any]],
    files: List[Dict[str, Any]],
    prompt: str
) -> Dict[str, Any]:
    """
    Structure the extracted data into a formatted result.

    Args:
        extracted_data: List of extracted data per file
        files: List of all available files
        prompt: Original extraction prompt

    Returns:
        Structured result object
    """
    # Create base structure
    result = {
        "prompt": prompt,
        "files_processed": len(extracted_data),
        "total_files": len(files),
        "extraction_timestamp": datetime.now().isoformat(),
        "status": "success",
        "extracted_content": []
    }

    # Add extracted content
    for data_item in extracted_data:
        # Enrich with file metadata
        file_id = data_item.get("file_id", 0)
        file_metadata = next((f for f in files if f.get("id") == file_id), {})

        content_item = {
            "file_id": file_id,
            "name": data_item.get("name", file_metadata.get("name", "")),
            "type": data_item.get("type", file_metadata.get("type", "")),
            "content_type": file_metadata.get("content_type", ""),
            "size": file_metadata.get("size", ""),
            "is_extracted": data_item.get("is_extracted", False),
            "extraction_method": data_item.get("extraction_method", ""),
            "content": data_item.get("content", "")
        }

        result["extracted_content"].append(content_item)

    return result

def _extract_document_contents_from_messages(file_id: int, messages: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """
    Extract document contents for a specific file from workflow messages.

    Args:
        file_id: ID of the file
        messages: List of all messages in the workflow

    Returns:
        List of document contents for the specified file
    """
    contents = []

    for message in messages:
        # Search documents in the message
        for document in message.get("documents", []):
            source = document.get("source", {})

            # Check if file ID matches
            if source.get("id") == file_id or (source.get("type") == "file" and source.get("id") == file_id):
                # Add contents of the file
                doc_contents = document.get("contents", [])

                if doc_contents:
                    contents.extend(doc_contents)

    return contents

def _log(add_log_func, workflow_id, message, log_type, agent_id=None, agent_name=None):
    """Helper function for logging with different log functions"""
    # Log via logger instance
    if log_type == "error":
        logger.error(message)
    elif log_type == "warning":
        logger.warning(message)
    else:
        logger.info(message)

    # Log via provided log function (if available)
    if add_log_func and workflow_id:
        add_log_func(workflow_id, message, log_type, agent_id, agent_name)