gateway/gwserver/modules/agentservice_dataextraction.py
2025-04-14 20:05:33 +02:00

742 lines
No EOL
30 KiB
Python

"""
Refactored helper function for intelligent data extraction (continued).
"""
import logging
import json
from typing import List, Dict, Any, Optional, Tuple
import asyncio
from datetime import datetime
logger = logging.getLogger(__name__)
async def data_extraction(
    prompt: str,
    files: List[Dict[str, Any]],
    messages: List[Dict[str, Any]],
    ai_service,
    lucydom_interface = None,
    workflow_id: Optional[str] = None,
    add_log_func = None,
    document_handler = None  # Preferred extraction backend when provided
) -> Dict[str, Any]:
    """
    Perform AI-driven data extraction with support for the document handler.

    Flow: an extraction plan is first created via the AI service, the plan is
    then executed either through the document handler (if one is supplied) or
    through the legacy LucyDOM-based path, and the per-file results are folded
    into a single structured result object.

    Args:
        prompt: Specification of what data to extract
        files: List of all available files with metadata
        messages: List of all messages in the workflow
        ai_service: Service for AI requests
        lucydom_interface: Interface for database access (optional; only used
            when no document handler is given)
        workflow_id: Optional workflow ID for logging
        add_log_func: Optional function for adding logs
        document_handler: Optional document handler for structured document
            operations; takes precedence over lucydom_interface

    Returns:
        Structured result object with extracted data and context information,
        or an error payload (``"status": "error"``) when anything fails —
        callers always receive a dict, never an exception.
    """
    try:
        # Create extraction plan using AI
        extraction_plan = await _create_extraction_plan(prompt, files, messages, ai_service, workflow_id, add_log_func)
        # Execute extractions, preferring document handler if available
        if document_handler:
            extracted_data = await _execute_extractions_with_handler(
                extraction_plan,
                files,
                messages,
                document_handler,
                ai_service,
                workflow_id,
                add_log_func
            )
        else:
            # Fall back to original implementation
            extracted_data = await _execute_extractions(
                extraction_plan,
                files,
                messages,
                lucydom_interface,
                ai_service,
                workflow_id,
                add_log_func
            )
        # Structure extracted data
        structured_result = _structure_extracted_data(extracted_data, files, prompt)
        return structured_result
    except Exception as e:
        logger.error(f"Error in data extraction: {str(e)}", exc_info=True)
        # Add error log
        if add_log_func and workflow_id:
            add_log_func(workflow_id, f"Data extraction error: {str(e)}", "error")
        # Return error result (dict shape mirrors the success payload keys)
        return {
            "error": str(e),
            "status": "error",
            "files_processed": len(files),
            "message": f"Data extraction failed: {str(e)}"
        }
async def _execute_extractions_with_handler(
extraction_plan: List[Dict[str, Any]],
files: List[Dict[str, Any]],
messages: List[Dict[str, Any]],
document_handler,
ai_service,
workflow_id: str = None,
add_log_func = None
) -> List[Dict[str, Any]]:
"""
Execute extractions using the document handler.
Args:
extraction_plan: List of extraction instructions
files: List of all available files
messages: List of all messages
document_handler: Document handler for structured operations
ai_service: Service for AI requests
workflow_id: Optional workflow ID for logging
add_log_func: Optional function for adding logs
Returns:
List with extracted data per file
"""
extracted_data = []
# Sort by importance (highest first)
sorted_plan = sorted(extraction_plan, key=lambda x: x.get("importance", 0), reverse=True)
for extraction_item in sorted_plan:
file_id = extraction_item.get("file_id")
extract_needed = extraction_item.get("extract_needed", False)
extraction_prompt = extraction_item.get("extraction_prompt", "")
# Find file metadata
file_metadata = next((f for f in files if f.get("id") == file_id), None)
if not file_metadata:
logger.warning(f"File with ID {file_id} not found")
continue
file_name = file_metadata.get("name", "")
file_type = file_metadata.get("type", "")
content_type = file_metadata.get("content_type", "")
# Log
if add_log_func and workflow_id:
add_log_func(
workflow_id,
f"Processing file: {file_name} (Extraction needed: {extract_needed})",
"info"
)
# Only perform extraction if needed
if extract_needed:
# Find document in existing messages if available
existing_content = _find_document_in_messages(file_id, messages)
# Check if we should use document handler for contextual extraction
if existing_content:
# If document exists but needs contextual extraction
document_id = existing_content.get("document_id")
message_id = existing_content.get("message_id")
if document_id and message_id:
# Find the message containing the document
for message in messages:
if message.get("id") == message_id:
# Extract content with context
try:
# Find document reference
doc_reference = None
for doc in message.get("documents", []):
if doc.get("id") == document_id:
doc_reference = doc
break
if doc_reference:
# Use document handler to perform contextual extraction
extracted_text = await document_handler.extract_document_content(
document_id,
file_id,
extraction_prompt
)
extracted_data.append({
"file_id": file_id,
"name": file_name,
"type": file_type,
"content": extracted_text,
"is_extracted": True,
"extraction_method": "contextual_extraction"
})
if add_log_func and workflow_id:
add_log_func(
workflow_id,
f"Contextual extraction for {file_name}: {extraction_prompt}",
"info"
)
continue
except Exception as e:
logger.error(f"Error in contextual extraction for {file_name}: {str(e)}")
# If we reach here, we need to perform a new extraction
try:
file_content = await document_handler.add_file_to_message(
{}, # Empty message to extract just the document
file_id,
extraction_prompt
)
# Get the extracted content from the document
if "documents" in file_content and file_content["documents"]:
doc = file_content["documents"][0]
content_text = ""
is_extracted = False
for content in doc.get("contents", []):
if content.get("type") == "text":
content_text = content.get("text", "")
is_extracted = content.get("is_extracted", False)
break
extracted_data.append({
"file_id": file_id,
"name": file_name,
"type": file_type,
"content": content_text,
"is_extracted": is_extracted,
"extraction_method": "document_handler"
})
if add_log_func and workflow_id:
add_log_func(
workflow_id,
f"Extracted {file_name} using document handler",
"info"
)
else:
# Extraction failed
extracted_data.append({
"file_id": file_id,
"name": file_name,
"type": file_type,
"content": f"Failed to extract content from {file_name}",
"is_extracted": False,
"extraction_method": "failed"
})
except Exception as e:
logger.error(f"Error extracting {file_name}: {str(e)}")
extracted_data.append({
"file_id": file_id,
"name": file_name,
"type": file_type,
"content": f"Error extracting: {str(e)}",
"is_extracted": False,
"extraction_method": "error"
})
else:
# No extraction needed, use existing content
existing_content = _find_document_in_messages(file_id, messages)
if existing_content:
extracted_data.append({
"file_id": file_id,
"name": file_name,
"type": file_type,
"content": existing_content.get("content", ""),
"is_extracted": existing_content.get("is_extracted", False),
"extraction_method": "existing_content"
})
else:
# No existing content found
extracted_data.append({
"file_id": file_id,
"name": file_name,
"type": file_type,
"content": f"No content available for {file_name}",
"is_extracted": False,
"extraction_method": "none"
})
return extracted_data
def _find_document_in_messages(file_id: int, messages: List[Dict[str, Any]]) -> Dict[str, Any]:
"""
Find a document by file ID in workflow messages.
Args:
file_id: ID of the file to find
messages: List of messages to search
Returns:
Dictionary with document information or empty dict if not found
"""
for message in messages:
for doc_index, document in enumerate(message.get("documents", [])):
source = document.get("source", {})
# Check if file ID matches
if source.get("id") == str(file_id) or source.get("id") == file_id:
# Found the document
content_text = ""
is_extracted = False
# Look for text content
for content in document.get("contents", []):
if content.get("type") == "text":
content_text = content.get("text", "")
is_extracted = content.get("is_extracted", False)
break
return {
"document_id": document.get("id"),
"message_id": message.get("id"),
"content": content_text,
"is_extracted": is_extracted
}
return {}
async def _create_extraction_plan(
    prompt: str,
    files: List[Dict[str, Any]],
    messages: List[Dict[str, Any]],
    ai_service,
    workflow_id: Optional[str] = None,
    add_log_func = None
) -> List[Dict[str, Any]]:
    """
    Create an extraction plan with AI assistance.

    Args:
        prompt: Specification of which data should be extracted
        files: List of all available files with metadata
        messages: List of all messages in the workflow
        ai_service: Service for AI requests
        workflow_id: Optional workflow ID for logging
        add_log_func: Optional function for adding logs

    Returns:
        Extraction plan (list of extraction instructions per file). On
        parsing problems a default plan (extract every not-yet-extracted
        file) is returned; on unexpected errors an empty list.
    """
    # Build context information about each file for the AI call
    file_infos = []
    for file in files:
        # Base metadata
        file_info = {
            "id": file.get("id", ""),
            "name": file.get("name", ""),
            "type": file.get("type", ""),
            "content_type": file.get("content_type", ""),
            "size": file.get("size", "")
        }
        # Check extraction status (if document contents already exist)
        doc_contents = _extract_document_contents_from_messages(file.get("id", ""), messages)
        if doc_contents:
            # At least one content with is_extracted=True counts as extracted
            file_info["already_extracted"] = any(
                content.get("is_extracted", False) for content in doc_contents
            )
            # Add a short preview of the first text content (if available)
            for content in doc_contents:
                if content.get("type") == "text" and content.get("text"):
                    text = content.get("text", "")
                    file_info["content_preview"] = text[:200] + "..." if len(text) > 200 else text
                    break
        else:
            file_info["already_extracted"] = False
        file_infos.append(file_info)
    # Build the AI prompt (German on purpose — matches the deployed model setup)
    extraction_prompt = f"""
Du bist ein Datenextraktionsexperte, der mithilfe von KI-Analyse entscheidet, welche Dateien
und Inhalte für eine bestimmte Aufgabe extrahiert werden müssen.
AUFGABE:
{prompt}
VERFÜGBARE DATEIEN:
{json.dumps(file_infos, indent=2)}
Für jede Datei, die für die Aufgabe relevant ist, erstelle eine Extraktionsanweisung mit den folgenden Informationen:
1. file_id: Die ID der zu extrahierenden Datei
2. extract_needed: Boolean, ob eine Extraktion erforderlich ist (True, wenn die Datei noch nicht extrahiert wurde und für die Aufgabe benötigt wird)
3. extraction_prompt: Ein spezifischer Prompt für die Extraktion der Datei (besonders wichtig für Bilder und nicht-textbasierte Dateien)
4. importance: Priorität/Wichtigkeit für die Aufgabe (1-5, wobei 5 am wichtigsten ist)
Format:
[
{{
"file_id": 1234,
"extract_needed": true,
"extraction_prompt": "Extrahiere die Tabellendaten mit Fokus auf die Umsatzzahlen",
"importance": 5
}},
...
]
Gib nur das JSON-Array zurück, ohne weitere Erklärungen.
"""
    # Add log
    if add_log_func and workflow_id:
        add_log_func(workflow_id, "Extraktionsplan wird erstellt...", "info")
    try:
        # Perform the AI call
        extraction_plan_response = await ai_service.call_api([{"role": "user", "content": extraction_prompt}])
        # Extract the JSON array from the response
        import re
        extraction_plan = None
        json_match = re.search(r'\[.*\]', extraction_plan_response, re.DOTALL)
        if json_match:
            try:
                extraction_plan = json.loads(json_match.group(0))
            except json.JSONDecodeError:
                # BUGFIX: a malformed JSON answer used to escape to the outer
                # handler and return [] — fall through to the default plan,
                # which is the documented behavior for parsing problems.
                extraction_plan = None
        if extraction_plan is not None:
            # Add log
            if add_log_func and workflow_id:
                add_log_func(
                    workflow_id,
                    f"Extraktionsplan erstellt für {len(extraction_plan)} Dateien",
                    "info"
                )
            return extraction_plan
        # Fallback on parsing problems
        if add_log_func and workflow_id:
            add_log_func(
                workflow_id,
                "Parsing-Fehler beim Extraktionsplan, erstelle Standard-Plan",
                "warning"
            )
        # Default plan: extract every file that was not extracted yet
        default_plan = []
        for file in files:
            doc_contents = _extract_document_contents_from_messages(file.get("id", ""), messages)
            already_extracted = any(
                content.get("is_extracted", False) for content in doc_contents
            ) if doc_contents else False
            default_plan.append({
                "file_id": file.get("id", 0),
                "extract_needed": not already_extracted,
                "extraction_prompt": f"Extrahiere alle relevanten Informationen aus {file.get('name', '')}",
                "importance": 3
            })
        return default_plan
    except Exception as e:
        logger.error(f"Fehler bei der Erstellung des Extraktionsplans: {str(e)}", exc_info=True)
        if add_log_func and workflow_id:
            add_log_func(
                workflow_id,
                f"Fehler bei der Erstellung des Extraktionsplans: {str(e)}",
                "error"
            )
        # Empty plan on unexpected errors
        return []
async def _execute_extractions(
extraction_plan: List[Dict[str, Any]],
files: List[Dict[str, Any]],
messages: List[Dict[str, Any]],
lucydom_interface,
ai_service,
workflow_id: str = None,
add_log_func = None,
logging_utils = None
) -> List[Dict[str, Any]]:
"""
Execute the planned extractions.
Args:
extraction_plan: List of extraction instructions
files: List of all available files
lucydom_interface: Interface for database access
ai_service: Service for AI requests
workflow_id: Optional workflow ID for logging
add_log_func: Optional function for adding logs
logging_utils: Optional logging utility
Returns:
List with extracted data per file
"""
extracted_data = []
# Sort by importance
sorted_plan = sorted(extraction_plan, key=lambda x: x.get("importance", 0), reverse=True)
for extraction_item in sorted_plan:
file_id = extraction_item.get("file_id")
extract_needed = extraction_item.get("extract_needed", False)
extraction_prompt = extraction_item.get("extraction_prompt", "")
# Find file metadata
file_metadata = next((f for f in files if f.get("id") == file_id), None)
if not file_metadata:
logger.warning(f"File with ID {file_id} not found")
continue
file_name = file_metadata.get("name", "")
file_type = file_metadata.get("type", "")
content_type = file_metadata.get("content_type", "")
# Add log
if logging_utils:
logging_utils.info(f"Processing file: {file_name} (Extraction needed: {extract_needed})", "extraction")
elif add_log_func and workflow_id:
add_log_func(
workflow_id,
f"Processing file: {file_name} (Extraction needed: {extract_needed})",
"info"
)
# Only perform extraction if needed
if extract_needed:
# Get file content via LucyDOM interface
if lucydom_interface:
try:
file_content = await lucydom_interface.read_file_content(file_id)
if not file_content:
if logging_utils:
logging_utils.warning(f"File {file_name} not found", "extraction")
elif add_log_func and workflow_id:
add_log_func(workflow_id, f"File {file_name} not found", "warning")
continue
# Perform extraction based on file type
if file_type == "image" or file_name.lower().endswith(('.jpg', '.jpeg', '.png', '.gif', '.webp')):
# Image analysis with AI service
if ai_service and hasattr(ai_service, "analyze_image"):
try:
image_analysis = await ai_service.analyze_image(
image_data=file_content,
prompt=extraction_prompt,
mime_type=content_type
)
extracted_data.append({
"file_id": file_id,
"name": file_name,
"type": file_type,
"content": image_analysis,
"is_extracted": True,
"extraction_method": "image_analysis"
})
if logging_utils:
logging_utils.info(f"Image {file_name} successfully analyzed", "extraction")
elif add_log_func and workflow_id:
add_log_func(workflow_id, f"Image {file_name} successfully analyzed", "info")
except Exception as e:
logger.error(f"Error analyzing image {file_name}: {str(e)}")
if logging_utils:
logging_utils.error(f"Error analyzing image {file_name}: {str(e)}", "extraction")
elif add_log_func and workflow_id:
add_log_func(workflow_id, f"Error analyzing image {file_name}: {str(e)}", "error")
else:
# Fallback if no image analysis available
extracted_data.append({
"file_id": file_id,
"name": file_name,
"type": file_type,
"content": f"Image: {file_name} (Analysis not available)",
"is_extracted": False,
"extraction_method": "none"
})
else:
# Text-based extraction for all other file types
try:
# Import directly here to avoid circular imports
from modules.agentservice_utils import extract_text_from_file_content
content, is_extracted = extract_text_from_file_content(
file_content, file_name, content_type
)
extracted_data.append({
"file_id": file_id,
"name": file_name,
"type": file_type,
"content": content,
"is_extracted": is_extracted,
"extraction_method": "text_extraction"
})
if logging_utils:
logging_utils.info(f"File {file_name} extracted (Status: {is_extracted})", "extraction")
elif add_log_func and workflow_id:
add_log_func(
workflow_id,
f"File {file_name} extracted (Status: {is_extracted})",
"info"
)
except Exception as e:
logger.error(f"Error extracting text from {file_name}: {str(e)}")
if logging_utils:
logging_utils.error(f"Error extracting text from {file_name}: {str(e)}", "extraction")
elif add_log_func and workflow_id:
add_log_func(workflow_id, f"Error extracting text from {file_name}: {str(e)}", "error")
except Exception as e:
logger.error(f"Error reading file {file_name}: {str(e)}")
if logging_utils:
logging_utils.error(f"Error reading file {file_name}: {str(e)}", "extraction")
elif add_log_func and workflow_id:
add_log_func(workflow_id, f"Error reading file {file_name}: {str(e)}", "error")
else:
logger.warning(f"No LucyDOM interface available for file {file_name}")
if logging_utils:
logging_utils.warning(f"No LucyDOM interface available for file {file_name}", "extraction")
elif add_log_func and workflow_id:
add_log_func(workflow_id, f"No LucyDOM interface available for file {file_name}", "warning")
else:
# No extraction needed, use existing content
doc_contents = _extract_document_contents_from_messages(file_id, messages)
if doc_contents:
# Use first text content
for content in doc_contents:
if content.get("type") == "text":
extracted_data.append({
"file_id": file_id,
"name": file_name,
"type": file_type,
"content": content.get("text", ""),
"is_extracted": content.get("is_extracted", False),
"extraction_method": "existing_content"
})
break
else:
# No existing content found
extracted_data.append({
"file_id": file_id,
"name": file_name,
"type": file_type,
"content": f"No content available for {file_name}",
"is_extracted": False,
"extraction_method": "none"
})
return extracted_data
def _structure_extracted_data(
extracted_data: List[Dict[str, Any]],
files: List[Dict[str, Any]],
prompt: str
) -> Dict[str, Any]:
"""
Structure the extracted data into a formatted result.
Args:
extracted_data: List of extracted data per file
files: List of all available files
prompt: Original extraction prompt
Returns:
Structured result object
"""
# Create base structure
result = {
"prompt": prompt,
"files_processed": len(extracted_data),
"total_files": len(files),
"extraction_timestamp": datetime.now().isoformat(),
"status": "success",
"extracted_content": []
}
# Add extracted content
for data_item in extracted_data:
# Enrich with file metadata
file_id = data_item.get("file_id", 0)
file_metadata = next((f for f in files if f.get("id") == file_id), {})
content_item = {
"file_id": file_id,
"name": data_item.get("name", file_metadata.get("name", "")),
"type": data_item.get("type", file_metadata.get("type", "")),
"content_type": file_metadata.get("content_type", ""),
"size": file_metadata.get("size", ""),
"is_extracted": data_item.get("is_extracted", False),
"extraction_method": data_item.get("extraction_method", ""),
"content": data_item.get("content", "")
}
result["extracted_content"].append(content_item)
return result
def _extract_document_contents_from_messages(file_id: int, messages: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
"""
Extract document contents for a specific file from workflow messages.
Enhanced to handle the new document structure.
Args:
file_id: ID of the file
messages: List of all messages in the workflow
Returns:
List of document contents for the specified file
"""
contents = []
for message in messages:
# Search documents in the message
for document in message.get("documents", []):
source = document.get("source", {})
# Check if file ID matches (handle both string and int comparison)
if (source.get("id") == file_id or
(isinstance(source.get("id"), str) and source.get("id") == str(file_id)) or
(isinstance(file_id, str) and source.get("id") == int(file_id))):
# Add contents of the file
doc_contents = document.get("contents", [])
if doc_contents:
# Ensure each content has document reference
for content in doc_contents:
content_copy = content.copy()
content_copy["document_id"] = document.get("id")
content_copy["message_id"] = message.get("id")
contents.append(content_copy)
return contents
def _log(add_log_func, workflow_id, message, log_type, agent_id=None, agent_name=None):
    """Route a message to the module logger and, when possible, to the
    caller-supplied log function.

    Unknown log types fall back to info level. The callback is only invoked
    when both add_log_func and workflow_id are provided.
    """
    # Pick the matching logger method; anything unrecognized logs as info.
    emit = {"error": logger.error, "warning": logger.warning}.get(log_type, logger.info)
    emit(message)
    # Forward to the provided log function (if available)
    if add_log_func and workflow_id:
        add_log_func(workflow_id, message, log_type, agent_id, agent_name)