gateway/modules/agentservice_dataextraction.py

"""
Refactored helper function for intelligent data extraction (continued).
"""

import logging
import json
from typing import List, Dict, Any, Optional, Tuple
import asyncio
from datetime import datetime
import uuid

logger = logging.getLogger(__name__)

async def data_extraction(
    prompt: str,
    files: List[Dict[str, Any]],
    messages: List[Dict[str, Any]],
    ai_service,
    lucydom_interface = None,
    workflow_id: str = None,
    add_log_func = None,
    document_handler = None  # Add document handler parameter
) -> Dict[str, Any]:
    """
    Performs AI-driven data extraction with improved document and image handling.

    Args:
        prompt: Specification of what data to extract
        files: List of all available files with metadata
        messages: List of all messages in the workflow
        ai_service: Service for AI requests
        lucydom_interface: Interface for database access (optional)
        workflow_id: Optional workflow ID for logging
        add_log_func: Optional function for adding logs
        document_handler: Optional document handler for structured document operations

    Returns:
        Structured text object with extracted data and context information
    """
    try:
        # Log extraction start
        _log(add_log_func, workflow_id, f"Starting data extraction with {len(files)} files", "info")

        # Create enhanced extraction plan using AI
        _log(add_log_func, workflow_id, "Creating extraction plan", "info")
        extraction_plan = await _create_extraction_plan(prompt, files, messages, ai_service, workflow_id, add_log_func)

        # If we have extraction plan, log summary
        if extraction_plan:
            extract_needed_count = sum(1 for item in extraction_plan if item.get("extract_needed", False))
            _log(add_log_func, workflow_id,
                 f"Extraction plan created: {len(extraction_plan)} files, {extract_needed_count} need extraction", "info")

        # Execute extractions, preferring document handler if available
        if document_handler:
            _log(add_log_func, workflow_id, "Using document handler for extraction", "info")
            extracted_data = await _execute_extractions_with_handler(
                extraction_plan,
                files,
                messages,
                document_handler,
                ai_service,
                workflow_id,
                add_log_func
            )
        else:
            # Fall back to original implementation
            _log(add_log_func, workflow_id, "Using fallback extraction method", "info")
            extracted_data = await _execute_extractions(
                extraction_plan,
                files,
                messages,
                lucydom_interface,
                ai_service,
                workflow_id,
                add_log_func
            )

        # Structure extracted data
        _log(add_log_func, workflow_id, f"Structuring extracted data from {len(extracted_data)} files", "info")
        structured_result = _structure_extracted_data(extracted_data, files, prompt)

        # Enhance with contextual summaries using AI
        if ai_service and structured_result["extracted_content"]:
            _log(add_log_func, workflow_id, "Creating contextual summaries for extracted content", "info")

            try:
                # Create a prompt for contextual summary
                summary_prompt = f"""
                Create concise, contextual summaries of the following extracted content according to this requirement:

                REQUIREMENT: {prompt}

                EXTRACTED CONTENT:
                """

                for item in structured_result["extracted_content"]:
                    file_name = item.get("name", "Unnamed file")
                    content_preview = item.get("content", "")[:500] + "..." if len(item.get("content", "")) > 500 else item.get("content", "")
                    summary_prompt += f"\n--- {file_name} ---\n{content_preview}\n"

                # Call AI for contextual summaries
                summaries = await ai_service.call_api([{"role": "user", "content": summary_prompt}])
                structured_result["contextual_summary"] = summaries

                _log(add_log_func, workflow_id, "Added contextual summaries to extracted data", "info")
            except Exception as e:
                _log(add_log_func, workflow_id, f"Error creating contextual summaries: {str(e)}", "warning")

        # Handle image-specific content separately
        image_content = [item for item in structured_result["extracted_content"]
                        if "Image Analysis" in item.get("content", "") or item.get("type") == "image"]

        if image_content and len(image_content) > 0:
            _log(add_log_func, workflow_id, f"Processing {len(image_content)} image-related content items", "info")

            # Add image analysis summary if we have AI service
            if ai_service:
                try:
                    # Create a prompt for image analysis summary
                    image_summary_prompt = f"""
                    Summarize the key visual information from these image analyses according to this requirement:

                    REQUIREMENT: {prompt}

                    IMAGE ANALYSES:
                    """

                    for item in image_content:
                        file_name = item.get("name", "Unnamed image")
                        content = item.get("content", "")
                        image_summary_prompt += f"\n--- {file_name} ---\n{content}\n"

                    # Call AI for image analysis summary
                    image_summaries = await ai_service.call_api([{"role": "user", "content": image_summary_prompt}])
                    structured_result["image_analysis_summary"] = image_summaries

                    _log(add_log_func, workflow_id, "Added image analysis summary to extracted data", "info")
                except Exception as e:
                    _log(add_log_func, workflow_id, f"Error creating image analysis summary: {str(e)}", "warning")

        return structured_result

    except Exception as e:
        logger.error(f"Error in data extraction: {str(e)}", exc_info=True)

        # Add error log
        if add_log_func and workflow_id:
            add_log_func(workflow_id, f"Data extraction error: {str(e)}", "error")

        # Return error result
        return {
            "error": str(e),
            "status": "error",
            "files_processed": len(files),
            "message": f"Data extraction failed: {str(e)}"
        }


async def _execute_extractions_with_handler(
    extraction_plan: List[Dict[str, Any]],
    files: List[Dict[str, Any]],
    messages: List[Dict[str, Any]],
    document_handler,
    ai_service,
    workflow_id: str = None,
    add_log_func = None
) -> List[Dict[str, Any]]:
    """
    Execute extractions using the document handler with enhanced image processing.

    Args:
        extraction_plan: List of extraction instructions
        files: List of all available files
        messages: List of all messages
        document_handler: Document handler for structured operations
        ai_service: Service for AI requests
        workflow_id: Optional workflow ID for logging
        add_log_func: Optional function for adding logs

    Returns:
        List with extracted data per file
    """
    extracted_data = []

    # Sort by importance (highest first)
    sorted_plan = sorted(extraction_plan, key=lambda x: x.get("importance", 0), reverse=True)

    for extraction_item in sorted_plan:
        file_id = extraction_item.get("file_id")
        extract_needed = extraction_item.get("extract_needed", False)
        extraction_prompt = extraction_item.get("extraction_prompt", "")

        # Find file metadata
        file_metadata = next((f for f in files if f.get("id") == file_id), None)

        if not file_metadata:
            logger.warning(f"File with ID {file_id} not found")
            continue

        file_name = file_metadata.get("name", "")
        file_type = file_metadata.get("type", "")
        content_type = file_metadata.get("content_type", "")

        # Log extraction start
        _log(add_log_func, workflow_id,
            f"Processing file: {file_name} (Extraction needed: {extract_needed})", "info")

        # Only perform extraction if needed
        if extract_needed:
            # Check if file already exists in messages with content
            existing_content = _find_document_in_messages(file_id, messages)

            if existing_content and existing_content.get("content"):
                # Content already exists, check if we need more specialized extraction
                current_context = existing_content.get("extraction_context", "")

                # Check if new extraction prompt is different or more specific
                if extraction_prompt and extraction_prompt != current_context:
                    _log(add_log_func, workflow_id,
                        f"Re-extracting {file_name} with new prompt: {extraction_prompt}", "info")

                    # Create an empty message to extract into
                    empty_message = {}

                    # Use document handler to extract with new context
                    try:
                        result_message = await document_handler.add_file_to_message(
                            empty_message,
                            file_id,
                            extraction_prompt
                        )

                        # Get the document content from result
                        if "documents" in result_message and result_message["documents"]:
                            doc = result_message["documents"][0]

                            # Get text content
                            content_text = ""
                            is_extracted = False

                            for content in doc.get("contents", []):
                                if content.get("type") == "text":
                                    content_text = content.get("text", "")
                                    is_extracted = content.get("is_extracted", False)
                                    break

                            # Create extraction result
                            extracted_data.append({
                                "file_id": file_id,
                                "name": file_name,
                                "type": file_type,
                                "content": content_text,
                                "is_extracted": is_extracted,
                                "extraction_method": "document_handler_reextract",
                                "extraction_context": extraction_prompt
                            })

                            # Check for additional documents (e.g., extracted images)
                            for additional_doc in result_message.get("documents", [])[1:]:
                                source = additional_doc.get("source", {})

                                # Skip if not an extracted document
                                if source.get("type") != "extracted":
                                    continue

                                # Get content
                                add_content_text = ""
                                add_is_extracted = False

                                for content in additional_doc.get("contents", []):
                                    if content.get("type") == "text":
                                        add_content_text = content.get("text", "")
                                        add_is_extracted = content.get("is_extracted", False)
                                        break

                                # Add as separate extraction result
                                if add_content_text:
                                    extracted_data.append({
                                        "file_id": source.get("id", f"extracted_{uuid.uuid4()}"),
                                        "name": source.get("name", f"Extracted from {file_name}"),
                                        "type": source.get("content_type", "image"),
                                        "content": add_content_text,
                                        "is_extracted": add_is_extracted,
                                        "extraction_method": "document_handler_extracted_component",
                                        "extraction_context": content.get("extraction_context", extraction_prompt),
                                        "parent_file_id": file_id
                                    })

                                    _log(add_log_func, workflow_id,
                                        f"Extracted embedded content from {file_name}", "info")

                            _log(add_log_func, workflow_id,
                                f"Re-extracted {file_name} with new context", "info")

                            continue
                    except Exception as e:
                        logger.error(f"Error re-extracting {file_name}: {str(e)}")
                        _log(add_log_func, workflow_id,
                            f"Error re-extracting {file_name}: {str(e)}", "warning")

                # Use existing content
                extracted_data.append({
                    "file_id": file_id,
                    "name": file_name,
                    "type": file_type,
                    "content": existing_content.get("content", ""),
                    "is_extracted": existing_content.get("is_extracted", False),
                    "extraction_method": "existing_content",
                    "extraction_context": current_context
                })

                _log(add_log_func, workflow_id,
                    f"Using existing content for {file_name}", "info")

                continue

            # Need to extract content with document handler
            try:
                # Create an empty message to extract into
                empty_message = {}

                # Use document handler to add file and extract content
                result_message = await document_handler.add_file_to_message(
                    empty_message,
                    file_id,
                    extraction_prompt
                )

                # Get the document content from result
                if "documents" in result_message and result_message["documents"]:
                    # Process main document
                    doc = result_message["documents"][0]  # First document is the main file

                    # Get text content
                    content_text = ""
                    is_extracted = False

                    for content in doc.get("contents", []):
                        if content.get("type") == "text":
                            content_text = content.get("text", "")
                            is_extracted = content.get("is_extracted", False)
                            break

                    # Create extraction result for main document
                    extracted_data.append({
                        "file_id": file_id,
                        "name": file_name,
                        "type": file_type,
                        "content": content_text,
                        "is_extracted": is_extracted,
                        "extraction_method": "document_handler",
                        "extraction_context": extraction_prompt
                    })

                    _log(add_log_func, workflow_id,
                        f"Extracted {file_name} using document handler", "info")

                    # Process additional documents (e.g., extracted images)
                    for additional_doc in result_message.get("documents", [])[1:]:
                        source = additional_doc.get("source", {})

                        # Skip if not an extracted document
                        if source.get("type") != "extracted":
                            continue

                        # Get content
                        add_content_text = ""
                        add_is_extracted = False

                        for content in additional_doc.get("contents", []):
                            if content.get("type") == "text":
                                add_content_text = content.get("text", "")
                                add_is_extracted = content.get("is_extracted", False)
                                break

                        # Add as separate extraction result
                        if add_content_text:
                            extracted_data.append({
                                "file_id": source.get("id", f"extracted_{uuid.uuid4()}"),
                                "name": source.get("name", f"Extracted from {file_name}"),
                                "type": source.get("content_type", "image"),
                                "content": add_content_text,
                                "is_extracted": add_is_extracted,
                                "extraction_method": "document_handler_extracted_component",
                                "extraction_context": content.get("extraction_context", extraction_prompt),
                                "parent_file_id": file_id
                            })

                            _log(add_log_func, workflow_id,
                                f"Extracted embedded content from {file_name}", "info")
                else:
                    # Extraction failed
                    extracted_data.append({
                        "file_id": file_id,
                        "name": file_name,
                        "type": file_type,
                        "content": f"Failed to extract content from {file_name}",
                        "is_extracted": False,
                        "extraction_method": "failed"
                    })

                    _log(add_log_func, workflow_id,
                        f"Failed to extract content from {file_name}", "warning")
            except Exception as e:
                logger.error(f"Error extracting {file_name}: {str(e)}")

                _log(add_log_func, workflow_id,
                    f"Error extracting {file_name}: {str(e)}", "warning")

                extracted_data.append({
                    "file_id": file_id,
                    "name": file_name,
                    "type": file_type,
                    "content": f"Error extracting: {str(e)}",
                    "is_extracted": False,
                    "extraction_method": "error"
                })
        else:
            # No extraction needed, use existing content
            existing_content = _find_document_in_messages(file_id, messages)

            if existing_content:
                extracted_data.append({
                    "file_id": file_id,
                    "name": file_name,
                    "type": file_type,
                    "content": existing_content.get("content", ""),
                    "is_extracted": existing_content.get("is_extracted", False),
                    "extraction_method": "existing_content",
                    "extraction_context": existing_content.get("extraction_context", "")
                })

                _log(add_log_func, workflow_id,
                    f"Using existing content for {file_name}", "info")
            else:
                # No existing content found
                extracted_data.append({
                    "file_id": file_id,
                    "name": file_name,
                    "type": file_type,
                    "content": f"No content available for {file_name}",
                    "is_extracted": False,
                    "extraction_method": "none"
                })

                _log(add_log_func, workflow_id,
                    f"No content available for {file_name}", "warning")

    return extracted_data


def _find_document_in_messages(file_id: int, messages: List[Dict[str, Any]]) -> Dict[str, Any]:
    """
    Find a document by file ID in workflow messages.

    Args:
        file_id: ID of the file to find
        messages: List of messages to search

    Returns:
        Dictionary with document information or empty dict if not found
    """
    for message in messages:
        for doc_index, document in enumerate(message.get("documents", [])):
            source = document.get("source", {})

            # Check if file ID matches
            if source.get("id") == str(file_id) or source.get("id") == file_id:
                # Found the document
                content_text = ""
                is_extracted = False

                # Look for text content
                for content in document.get("contents", []):
                    if content.get("type") == "text":
                        content_text = content.get("text", "")
                        is_extracted = content.get("is_extracted", False)
                        break

                return {
                    "document_id": document.get("id"),
                    "message_id": message.get("id"),
                    "content": content_text,
                    "is_extracted": is_extracted
                }

    return {}


async def _create_extraction_plan(
    prompt: str,
    files: List[Dict[str, Any]],
    messages: List[Dict[str, Any]],
    ai_service,
    workflow_id: str = None,
    add_log_func = None
) -> List[Dict[str, Any]]:
    """
    Erstellt einen Extraktionsplan mit AI-Unterstützung.

    Args:
        prompt: Spezifizierung, welche Daten extrahiert werden sollen
        files: Liste aller verfügbaren Dateien mit Metadaten
        messages: Liste aller Nachrichten im Workflow
        ai_service: Service für KI-Anfragen
        workflow_id: Optionale ID des Workflows für Logging
        add_log_func: Optionale Funktion für das Hinzufügen von Logs

    Returns:
        Extraktionsplan (Liste von Extraktionsanweisungen pro Datei)
    """
    # Erstelle Kontext-Informationen für den AI Call
    file_infos = []
    for file in files:
        # Basis-Metadaten
        file_info = {
            "id": file.get("id", ""),
            "name": file.get("name", ""),
            "type": file.get("type", ""),
            "content_type": file.get("content_type", ""),
            "size": file.get("size", "")
        }

        # Extraktionsstatus prüfen (falls vorhanden)
        doc_contents = _extract_document_contents_from_messages(file.get("id", ""), messages)

        if doc_contents:
            # Prüfen, ob mindestens ein Content mit is_extracted=True existiert
            already_extracted = any(
                content.get("is_extracted", False) for content in doc_contents
            )
            file_info["already_extracted"] = already_extracted

            # Eine kurze Vorschau des Inhalts hinzufügen (falls verfügbar)
            for content in doc_contents:
                if content.get("type") == "text" and content.get("text"):
                    preview_text = content.get("text", "")[:200] + "..." if len(content.get("text", "")) > 200 else content.get("text", "")
                    file_info["content_preview"] = preview_text
                    break
        else:
            file_info["already_extracted"] = False

        file_infos.append(file_info)

    # AI-Prompt erstellen
    extraction_prompt = f"""
    Du bist ein Datenextraktionsexperte, der mithilfe von KI-Analyse entscheidet, welche Dateien
    und Inhalte für eine bestimmte Aufgabe extrahiert werden müssen.

    AUFGABE:
    {prompt}

    VERFÜGBARE DATEIEN:
    {json.dumps(file_infos, indent=2)}

    Für jede Datei, die für die Aufgabe relevant ist, erstelle eine Extraktionsanweisung mit den folgenden Informationen:
    1. file_id: Die ID der zu extrahierenden Datei
    2. extract_needed: Boolean, ob eine Extraktion erforderlich ist (True, wenn die Datei noch nicht extrahiert wurde und für die Aufgabe benötigt wird)
    3. extraction_prompt: Ein spezifischer Prompt für die Extraktion der Datei (besonders wichtig für Bilder und nicht-textbasierte Dateien)
    4. importance: Priorität/Wichtigkeit für die Aufgabe (1-5, wobei 5 am wichtigsten ist)

    Format:
    [
      {{
        "file_id": 1234,
        "extract_needed": true,
        "extraction_prompt": "Extrahiere die Tabellendaten mit Fokus auf die Umsatzzahlen",
        "importance": 5
      }},
      ...
    ]

    Gib nur das JSON-Array zurück, ohne weitere Erklärungen.
    """

    # Log hinzufügen
    if add_log_func and workflow_id:
        add_log_func(workflow_id, "Extraktionsplan wird erstellt...", "info")

    try:
        # AI-Call durchführen
        extraction_plan_response = await ai_service.call_api([{"role": "user", "content": extraction_prompt}])

        # JSON aus der Antwort extrahieren
        import re
        json_match = re.search(r'\[.*\]', extraction_plan_response, re.DOTALL)

        if json_match:
            extraction_plan = json.loads(json_match.group(0))

            # Log hinzufügen
            if add_log_func and workflow_id:
                add_log_func(
                    workflow_id,
                    f"Extraktionsplan erstellt für {len(extraction_plan)} Dateien",
                    "info"
                )

            return extraction_plan
        else:
            # Fallback bei Parsing-Problemen
            if add_log_func and workflow_id:
                add_log_func(
                    workflow_id,
                    "Parsing-Fehler beim Extraktionsplan, erstelle Standard-Plan",
                    "warning"
                )

            # Standard-Plan: Alle nicht extrahierten Dateien extrahieren
            default_plan = []
            for file in files:
                doc_contents = _extract_document_contents_from_messages(file.get("id", ""), messages)
                already_extracted = any(
                    content.get("is_extracted", False) for content in doc_contents
                ) if doc_contents else False

                default_plan.append({
                    "file_id": file.get("id", 0),
                    "extract_needed": not already_extracted,
                    "extraction_prompt": f"Extrahiere alle relevanten Informationen aus {file.get('name', '')}",
                    "importance": 3
                })

            return default_plan

    except Exception as e:
        logger.error(f"Fehler bei der Erstellung des Extraktionsplans: {str(e)}", exc_info=True)

        if add_log_func and workflow_id:
            add_log_func(
                workflow_id,
                f"Fehler bei der Erstellung des Extraktionsplans: {str(e)}",
                "error"
            )

        # Leerer Plan bei Fehlern
        return []

async def _execute_extractions(
    extraction_plan: List[Dict[str, Any]],
    files: List[Dict[str, Any]],
    messages: List[Dict[str, Any]],
    lucydom_interface,
    ai_service,
    workflow_id: str = None,
    add_log_func = None,
    logging_utils = None
) -> List[Dict[str, Any]]:
    """
    Execute the planned extractions.

    Args:
        extraction_plan: List of extraction instructions
        files: List of all available files
        lucydom_interface: Interface for database access
        ai_service: Service for AI requests
        workflow_id: Optional workflow ID for logging
        add_log_func: Optional function for adding logs
        logging_utils: Optional logging utility

    Returns:
        List with extracted data per file
    """
    extracted_data = []

    # Sort by importance
    sorted_plan = sorted(extraction_plan, key=lambda x: x.get("importance", 0), reverse=True)

    for extraction_item in sorted_plan:
        file_id = extraction_item.get("file_id")
        extract_needed = extraction_item.get("extract_needed", False)
        extraction_prompt = extraction_item.get("extraction_prompt", "")

        # Find file metadata
        file_metadata = next((f for f in files if f.get("id") == file_id), None)

        if not file_metadata:
            logger.warning(f"File with ID {file_id} not found")
            continue

        file_name = file_metadata.get("name", "")
        file_type = file_metadata.get("type", "")
        content_type = file_metadata.get("content_type", "")

        # Add log
        if logging_utils:
            logging_utils.info(f"Processing file: {file_name} (Extraction needed: {extract_needed})", "extraction")
        elif add_log_func and workflow_id:
            add_log_func(
                workflow_id,
                f"Processing file: {file_name} (Extraction needed: {extract_needed})",
                "info"
            )

        # Only perform extraction if needed
        if extract_needed:
            # Get file content via LucyDOM interface
            if lucydom_interface:
                try:
                    file_content = await lucydom_interface.read_file_content(file_id)

                    if not file_content:
                        if logging_utils:
                            logging_utils.warning(f"File {file_name} not found", "extraction")
                        elif add_log_func and workflow_id:
                            add_log_func(workflow_id, f"File {file_name} not found", "warning")
                        continue

                    # Perform extraction based on file type
                    if file_type == "image" or file_name.lower().endswith(('.jpg', '.jpeg', '.png', '.gif', '.webp')):
                        # Image analysis with AI service
                        if ai_service and hasattr(ai_service, "analyze_image"):
                            try:
                                image_analysis = await ai_service.analyze_image(
                                    image_data=file_content,
                                    prompt=extraction_prompt,
                                    mime_type=content_type
                                )

                                extracted_data.append({
                                    "file_id": file_id,
                                    "name": file_name,
                                    "type": file_type,
                                    "content": image_analysis,
                                    "is_extracted": True,
                                    "extraction_method": "image_analysis"
                                })

                                if logging_utils:
                                    logging_utils.info(f"Image {file_name} successfully analyzed", "extraction")
                                elif add_log_func and workflow_id:
                                    add_log_func(workflow_id, f"Image {file_name} successfully analyzed", "info")
                            except Exception as e:
                                logger.error(f"Error analyzing image {file_name}: {str(e)}")
                                if logging_utils:
                                    logging_utils.error(f"Error analyzing image {file_name}: {str(e)}", "extraction")
                                elif add_log_func and workflow_id:
                                    add_log_func(workflow_id, f"Error analyzing image {file_name}: {str(e)}", "error")
                        else:
                            # Fallback if no image analysis available
                            extracted_data.append({
                                "file_id": file_id,
                                "name": file_name,
                                "type": file_type,
                                "content": f"Image: {file_name} (Analysis not available)",
                                "is_extracted": False,
                                "extraction_method": "none"
                            })
                    else:
                        # Text-based extraction for all other file types
                        try:
                            # Import directly here to avoid circular imports
                            from modules.agentservice_utils import extract_text_from_file_content

                            content, is_extracted = extract_text_from_file_content(
                                file_content, file_name, content_type
                            )

                            extracted_data.append({
                                "file_id": file_id,
                                "name": file_name,
                                "type": file_type,
                                "content": content,
                                "is_extracted": is_extracted,
                                "extraction_method": "text_extraction"
                            })

                            if logging_utils:
                                logging_utils.info(f"File {file_name} extracted (Status: {is_extracted})", "extraction")
                            elif add_log_func and workflow_id:
                                add_log_func(
                                    workflow_id,
                                    f"File {file_name} extracted (Status: {is_extracted})",
                                    "info"
                                )
                        except Exception as e:
                            logger.error(f"Error extracting text from {file_name}: {str(e)}")
                            if logging_utils:
                                logging_utils.error(f"Error extracting text from {file_name}: {str(e)}", "extraction")
                            elif add_log_func and workflow_id:
                                add_log_func(workflow_id, f"Error extracting text from {file_name}: {str(e)}", "error")
                except Exception as e:
                    logger.error(f"Error reading file {file_name}: {str(e)}")
                    if logging_utils:
                        logging_utils.error(f"Error reading file {file_name}: {str(e)}", "extraction")
                    elif add_log_func and workflow_id:
                        add_log_func(workflow_id, f"Error reading file {file_name}: {str(e)}", "error")
            else:
                logger.warning(f"No LucyDOM interface available for file {file_name}")
                if logging_utils:
                    logging_utils.warning(f"No LucyDOM interface available for file {file_name}", "extraction")
                elif add_log_func and workflow_id:
                    add_log_func(workflow_id, f"No LucyDOM interface available for file {file_name}", "warning")
        else:
            # No extraction needed, use existing content
            doc_contents = _extract_document_contents_from_messages(file_id, messages)

            if doc_contents:
                # Use first text content
                for content in doc_contents:
                    if content.get("type") == "text":
                        extracted_data.append({
                            "file_id": file_id,
                            "name": file_name,
                            "type": file_type,
                            "content": content.get("text", ""),
                            "is_extracted": content.get("is_extracted", False),
                            "extraction_method": "existing_content"
                        })
                        break
            else:
                # No existing content found
                extracted_data.append({
                    "file_id": file_id,
                    "name": file_name,
                    "type": file_type,
                    "content": f"No content available for {file_name}",
                    "is_extracted": False,
                    "extraction_method": "none"
                })

    return extracted_data

def _structure_extracted_data(
    extracted_data: List[Dict[str, Any]],
    files: List[Dict[str, Any]],
    prompt: str
) -> Dict[str, Any]:
    """
    Structure the extracted data into a formatted result.

    Args:
        extracted_data: List of extracted data per file
        files: List of all available files
        prompt: Original extraction prompt

    Returns:
        Structured result object
    """
    # Create base structure
    result = {
        "prompt": prompt,
        "files_processed": len(extracted_data),
        "total_files": len(files),
        "extraction_timestamp": datetime.now().isoformat(),
        "status": "success",
        "extracted_content": []
    }

    # Add extracted content
    for data_item in extracted_data:
        # Enrich with file metadata
        file_id = data_item.get("file_id", 0)
        file_metadata = next((f for f in files if f.get("id") == file_id), {})

        content_item = {
            "file_id": file_id,
            "name": data_item.get("name", file_metadata.get("name", "")),
            "type": data_item.get("type", file_metadata.get("type", "")),
            "content_type": file_metadata.get("content_type", ""),
            "size": file_metadata.get("size", ""),
            "is_extracted": data_item.get("is_extracted", False),
            "extraction_method": data_item.get("extraction_method", ""),
            "content": data_item.get("content", "")
        }

        result["extracted_content"].append(content_item)

    return result

def _extract_document_contents_from_messages(file_id: int, messages: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """
    Extract document contents for a specific file from workflow messages.
    Enhanced to handle the new document structure.

    Args:
        file_id: ID of the file
        messages: List of all messages in the workflow

    Returns:
        List of document contents for the specified file
    """
    contents = []

    for message in messages:
        # Search documents in the message
        for document in message.get("documents", []):
            source = document.get("source", {})

            # Check if file ID matches (handle both string and int comparison)
            if (source.get("id") == file_id or
                (isinstance(source.get("id"), str) and source.get("id") == str(file_id)) or
                (isinstance(file_id, str) and source.get("id") == file_id)):

                # Add contents of the file
                doc_contents = document.get("contents", [])

                if doc_contents:
                    # Ensure each content has document reference
                    for content in doc_contents:
                        content_copy = content.copy()
                        content_copy["document_id"] = document.get("id")
                        content_copy["message_id"] = message.get("id")
                        contents.append(content_copy)

    return contents

def _log(add_log_func, workflow_id, message, log_type, agent_id=None, agent_name=None):
    """Helper function for logging with different log functions"""
    # Log via logger instance
    if log_type == "error":
        logger.error(message)
    elif log_type == "warning":
        logger.warning(message)
    else:
        logger.info(message)

    # Log via provided log function (if available)
    if add_log_func and workflow_id:
        add_log_func(workflow_id, message, log_type, agent_id, agent_name)