gateway/modules/workflows/methods/methodContext/actions/extractContent.py

# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
"""context.extractContent — extracts content without AI.
Returns a unified handover compatible with AiResult-style downstream wiring:
- ``documents[0]``: structured JSON (``context.extractContent.handover.v1``); image ``parts``
keep metadata but omit pixel data; each dropped image references
``handoverMediaDocumentName`` matching a sibling blob document.
- ``documents[1:]``: each extracted image as its own binary ``ActionDocument`` (like
``ai.process`` artefact outputs).
- ``ActionResult.data["response"]`` plus normalized executor field ``response``: concatenated
plain text from all text parts — safe default for ``file.create`` / primaryTextRef."""
import base64 as _b64
import binascii as _binascii
import copy
import logging
import re
import time
from typing import Any, Dict, List, Tuple
from modules.datamodels.datamodelChat import ActionResult, ActionDocument
from modules.datamodels.datamodelDocref import coerceDocumentReferenceList
from modules.datamodels.datamodelExtraction import ContentExtracted, ExtractionOptions
logger = logging.getLogger(__name__)
_UNSAFE_FILE_KEY = re.compile(r"[^\w\-.\(\)\[\]%@+]")
HANDOVER_KIND = "context.extractContent.handover.v1"


def _default_extraction_options() -> ExtractionOptions:
"""No merge — keep all parts for downstream JSON selection."""
return ExtractionOptions(
prompt="Extract all content from the document",
mergeStrategy=None,
processDocumentsIndividually=True,
outputFormat="parts",
outputDetail="full",
)


def _file_json_key(display_name: str, index: int, key_counts: Dict[str, int]) -> str:
stem = (display_name or "").strip() or f"document_{index + 1}"
slug = stem.replace("/", "_").replace("\\", "_").replace(" ", "_")
slug = _UNSAFE_FILE_KEY.sub("_", slug).strip("_") or f"document_{index + 1}"
base = f"file_{index + 1}_{slug}"
n = key_counts.get(base, 0)
key_counts[base] = n + 1
return base if n == 0 else f"{base}__{n}"
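
# A minimal sketch of how _file_json_key behaves (values are illustrative, not
# from a real run); the shared counts dict deduplicates repeated display names:
#   counts: Dict[str, int] = {}
#   _file_json_key("Q3 Report.pdf", 0, counts)  -> "file_1_Q3_Report.pdf"
#   _file_json_key("Q3 Report.pdf", 0, counts)  -> "file_1_Q3_Report.pdf__1"
#   _file_json_key("", 2, counts)               -> "file_3_document_3"
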
def _serialize_parts(parts: Any) -> List[Dict[str, Any]]:
out: List[Dict[str, Any]] = []
for p in parts or []:
if hasattr(p, "model_dump"):
out.append(p.model_dump(mode="json"))
elif isinstance(p, dict):
out.append(dict(p))
return out


def _rebuild_by_type_group(parts_ser: List[Dict[str, Any]]) -> Dict[str, List[Dict[str, Any]]]:
by_type: Dict[str, List[Dict[str, Any]]] = {}
for entry in parts_ser:
if not isinstance(entry, dict):
continue
tg = (entry.get("typeGroup") or "").strip() or "_other"
by_type.setdefault(tg, []).append(entry)
return by_type
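
# Sketch of the regrouping above (part dicts abbreviated; shapes assumed from
# _serialize_parts output): entries without a typeGroup land under "_other".
#   _rebuild_by_type_group([
#       {"typeGroup": "text", "data": "hi"},
#       {"typeGroup": "image", "mimeType": "image/png", "data": "..."},
#       {"data": "untyped"},
#   ])
#   -> {"text": [{...}], "image": [{...}], "_other": [{...}]}
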
def _joined_text_from_handover_payload(payload: Dict[str, Any]) -> str:
"""Concatenate text parts across fileOrder for AiResult-compatible ``response``."""
files_section = payload.get("files") or {}
ordered = payload.get("fileOrder")
keys: List[str] = ordered if isinstance(ordered, list) and ordered else list(files_section.keys())
chunks: List[str] = []
for fk in keys:
bucket = files_section.get(fk)
if not isinstance(bucket, dict):
continue
for p in bucket.get("parts") or []:
if not isinstance(p, dict):
continue
if (p.get("typeGroup") or "").strip() != "text":
continue
raw = p.get("data")
if raw is None:
continue
s = str(raw).strip()
if s:
chunks.append(s)
return "\n\n".join(chunks)
def _mime_to_file_extension(mime: str) -> str:
m = (mime or "").split(";")[0].strip().lower()
mapping = {
"image/jpeg": "jpg",
"image/jpg": "jpg",
"image/png": "png",
"image/gif": "gif",
"image/webp": "webp",
"image/bmp": "bmp",
"image/tiff": "tiff",
}
return mapping.get(m, m.rsplit("/", 1)[-1] if "/" in m else "bin")
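
# Spot checks for the mapping above (illustrative):
#   _mime_to_file_extension("image/jpeg")                -> "jpg"
#   _mime_to_file_extension("image/png; charset=binary") -> "png"      (parameters dropped)
#   _mime_to_file_extension("image/svg+xml")             -> "svg+xml"  (generic subtype fallback)
#   _mime_to_file_extension("")                          -> "bin"
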
def _split_images_to_sidecar_documents(
payload: Dict[str, Any],
*,
document_name_stem: str,
) -> Tuple[Dict[str, Any], List[ActionDocument]]:
"""
Deep-copy handover JSON, clear image pixel data from ``parts``, attach
``handoverMediaDocumentName`` on each image part, emit binary ActionDocuments.
"""
    bundle = copy.deepcopy(payload)
files_section = bundle.get("files") or {}
ordered = bundle.get("fileOrder")
key_order: List[str] = ordered if isinstance(ordered, list) and ordered else list(files_section.keys())
media_docs: List[ActionDocument] = []
kind = bundle.get("kind") or HANDOVER_KIND
stem = re.sub(r"[^\w\-]+", "_", document_name_stem).strip("_") or "extract"
for fk in key_order:
bucket = files_section.get(fk)
if not isinstance(bucket, dict):
continue
parts = bucket.get("parts")
if not isinstance(parts, list):
continue
new_parts: List[Dict[str, Any]] = []
for p in parts:
if not isinstance(p, dict):
new_parts.append(p)
continue
pcopy = dict(p)
tg = (pcopy.get("typeGroup") or "").strip()
mime = (pcopy.get("mimeType") or "").strip()
raw_data = pcopy.get("data")
if tg == "image" and mime.lower().startswith("image/") and raw_data:
raw_s = raw_data.strip() if isinstance(raw_data, str) else ""
try:
blob = _b64.b64decode(raw_s, validate=True) if raw_s else b""
except (_binascii.Error, TypeError, ValueError) as e:
logger.warning(
"extractContent: could not decode image part %s (keep inline): %s",
pcopy.get("id"),
e,
)
new_parts.append(pcopy)
continue
if not blob:
new_parts.append(pcopy)
continue
part_id = str(pcopy.get("id") or "part")
                # Keep the full part id (a UUID fits comfortably); truncating to a short
                # prefix could collide names and break part-to-blob linking. The 200-char
                # cap only guards against pathological ids while staying filesystem-safe.
                safe_id = re.sub(r"[^\w\-.]+", "_", part_id).strip("_") or "media"
                if len(safe_id) > 200:
                    safe_id = safe_id[:200]
ext = _mime_to_file_extension(mime)
media_name = f"extract_media_{stem}_{safe_id}.{ext}"
pcopy["data"] = ""
pcopy["handoverMediaDocumentName"] = media_name
media_docs.append(
ActionDocument(
documentName=media_name,
documentData=blob,
mimeType=mime,
validationMetadata={
"actionType": "context.extractContent",
"handoverRole": "extractedMedia",
"sourcePartId": part_id,
"handoverSchema": kind,
"containerFileKey": fk,
},
)
)
new_parts.append(pcopy)
else:
new_parts.append(pcopy)
bucket["parts"] = new_parts
bucket["byTypeGroup"] = _rebuild_by_type_group(new_parts)
files_section[fk] = bucket
return bundle, media_docs
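
# Sketch of the split for one image part (ids and base64 are made up):
#   before: {"id": "9f2c0d1e", "typeGroup": "image", "mimeType": "image/png",
#            "data": "iVBORw0KGgo="}
#   after:  {"id": "9f2c0d1e", "typeGroup": "image", "mimeType": "image/png",
#            "data": "", "handoverMediaDocumentName": "extract_media_<stem>_9f2c0d1e.png"}
# plus one binary ActionDocument named "extract_media_<stem>_9f2c0d1e.png" carrying
# the decoded bytes; non-image parts and undecodable data pass through unchanged.
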
def _one_file_bucket(ec: ContentExtracted, source_file_name: str) -> Dict[str, Any]:
parts_ser = _serialize_parts(ec.parts)
ud = getattr(ec, "udm", None)
if hasattr(ud, "model_dump"):
ud = ud.model_dump(mode="json")
summary = getattr(ec, "summary", None)
if hasattr(summary, "model_dump"):
summary = summary.model_dump(mode="json")
elif isinstance(summary, dict):
summary = dict(summary)
elif summary is None:
summary = {}
return {
"sourceFileName": source_file_name,
"extractedId": getattr(ec, "id", ""),
"summary": summary,
"udm": ud,
"parts": parts_ser,
"byTypeGroup": _rebuild_by_type_group(parts_ser),
}


def build_extract_content_handover(
*,
extracted_results: List[ContentExtracted],
chat_file_names: List[str],
operation_ref: str,
) -> Dict[str, Any]:
key_counts: Dict[str, int] = {}
files: Dict[str, Any] = {}
ordered: List[str] = []
for i, ec in enumerate(extracted_results):
name = chat_file_names[i] if i < len(chat_file_names) else ""
fk = _file_json_key(str(name), i, key_counts)
files[fk] = _one_file_bucket(ec, str(name))
ordered.append(fk)
return {
"schemaVersion": 1,
"kind": HANDOVER_KIND,
"operationRef": operation_ref,
"fileOrder": ordered,
"files": files,
}
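
# Illustrative top-level handover shape (file names and the operationRef are made up):
#   {
#       "schemaVersion": 1,
#       "kind": "context.extractContent.handover.v1",
#       "operationRef": "context_extract_wf1_1700000000",
#       "fileOrder": ["file_1_report.pdf", "file_2_notes.txt"],
#       "files": {"file_1_report.pdf": {...}, "file_2_notes.txt": {...}},
#   }
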
async def extractContent(self, parameters: Dict[str, Any]) -> ActionResult:
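    """Extract content from ``parameters["documentList"]`` without AI.

    Returns an ``ActionResult`` whose ``documents[0]`` is the structured JSON
    handover, ``documents[1:]`` are the extracted image blobs, and
    ``data["response"]`` is the concatenated plain text (see module docstring).
    """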
operation_id = None
try:
wf = self.services.workflow.id if self.services.workflow else f"no-workflow-{int(time.time())}"
operation_id = f"context_extract_{wf}_{int(time.time())}"
document_list_param = parameters.get("documentList")
if not document_list_param:
return ActionResult.isFailure(error="documentList is required")
dl = coerceDocumentReferenceList(document_list_param)
if not dl.references:
return ActionResult.isFailure(
error=(
f"documentList could not be parsed (type={type(document_list_param).__name__}); "
"expected DocumentReferenceList, list of strings/dicts, or "
"a wrapper dict like {'documents': [...]}"
),
)
parent_operation_id = parameters.get("parentOperationId")
self.services.chat.progressLogStart(
operation_id,
"Extracting content from documents",
"Content Extraction",
f"Documents: {len(dl.references)}",
parentOperationId=parent_operation_id,
)
self.services.chat.progressLogUpdate(operation_id, 0.2, "Loading documents")
chat_documents = self.services.chat.getChatDocumentsFromDocumentList(dl)
if not chat_documents:
self.services.chat.progressLogFinish(operation_id, False)
return ActionResult.isFailure(error="No documents found in documentList")
logger.info(f"Extracting JSON handover from {len(chat_documents)} documents")
self.services.chat.progressLogUpdate(operation_id, 0.3, "Preparing extraction options")
eo_param = parameters.get("extractionOptions")
extraction_options: ExtractionOptions
if isinstance(eo_param, dict) and eo_param:
eo = dict(eo_param)
eo.setdefault("prompt", "Extract all content from the document")
if "mergeStrategy" not in eo:
eo["mergeStrategy"] = None
try:
extraction_options = ExtractionOptions(**eo)
except Exception as e:
logger.warning(f"Invalid extractionOptions, using defaults: {e}")
extraction_options = _default_extraction_options()
elif isinstance(eo_param, ExtractionOptions):
extraction_options = eo_param
else:
extraction_options = _default_extraction_options()
        self.services.chat.progressLogUpdate(operation_id, 0.4, f"Extracting {len(chat_documents)} document(s)")
        extracted_results = self.services.extraction.extractContent(
            chat_documents, extraction_options, operationId=operation_id
        )
file_names = [getattr(cd, "fileName", "") or "" for cd in chat_documents]
payload = build_extract_content_handover(
extracted_results=extracted_results,
chat_file_names=file_names,
operation_ref=operation_id,
)
        self.services.chat.progressLogUpdate(operation_id, 0.9, "Packaging JSON handover")
stem = f"{wf}_{int(time.time())}"
stripped_payload, media_docs = _split_images_to_sidecar_documents(
payload,
document_name_stem=stem,
)
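        # Join text from the pre-split payload; text parts are identical in
        # stripped_payload, so either copy yields the same response string.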
joined_text = _joined_text_from_handover_payload(payload)
json_meta = {
"actionType": "context.extractContent",
"documentCountInput": len(chat_documents),
"documentCountRoots": len(extracted_results),
"handoverSchema": stripped_payload.get("kind"),
"handoverRole": "structuredHandover",
"mediaDocumentCount": len(media_docs),
}
json_doc = ActionDocument(
documentName=f"extracted_content_{stem}.json",
documentData=stripped_payload,
mimeType="application/json",
validationMetadata=json_meta,
)
handover_data = {
"response": joined_text,
"contentType": "text",
"handoverKind": stripped_payload.get("kind"),
"structuredDocumentIndex": 0,
"mediaDocumentCount": len(media_docs),
}
self.services.chat.progressLogFinish(operation_id, True)
return ActionResult.isSuccess(documents=[json_doc] + media_docs, data=handover_data)
except Exception as e:
logger.error(f"Error in content extraction: {str(e)}")
try:
if operation_id:
self.services.chat.progressLogFinish(operation_id, False)
except Exception:
pass
return ActionResult.isFailure(error=str(e))
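

# Sketch of downstream consumption (hedged: the executor wiring lives outside this
# module, and `ctx` below is a stand-in for whatever object these actions bind to):
#   result = await ctx.extractContent({"documentList": ["chatdoc-1", "chatdoc-2"]})
#   result.documents[0]       # structured JSON handover (structuredDocumentIndex == 0)
#   result.documents[1:]      # one binary ActionDocument per extracted image
#   result.data["response"]   # concatenated plain text, safe for file.create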