# gateway/tests/unit/workflow/test_extract_content_handover.py

# Unit tests: unified extractContent handover (text vs image sidecars).

import base64

from modules.workflows.methods.methodContext.actions.extractContent import (
    HANDOVER_KIND,
    _apply_content_filter,
    _joined_text_from_handover_payload,
    _split_images_to_sidecar_documents,
)


def test_joined_text_orders_text_table_and_skips_container():
    """Joined text keeps part order and ignores the empty container part.

    The expected value also pins that surrounding whitespace of each text
    part is trimmed and that parts are separated by a blank line.
    """
    file_parts = [
        {"typeGroup": "text", "data": "  A\n", "id": "x"},
        {"typeGroup": "container", "data": "", "id": "c"},
        {"typeGroup": "text", "data": "B", "id": "y"},
    ]
    handover = {
        "kind": HANDOVER_KIND,
        "fileOrder": ["f1"],
        "files": {"f1": {"parts": file_parts}},
    }

    joined = _joined_text_from_handover_payload(handover)

    assert joined == "A\n\nB"


def test_joined_text_includes_csv_table_parts():
    """A CSV table part is emitted verbatim in the joined text."""
    csv_part = {
        "typeGroup": "table",
        "mimeType": "text/csv",
        "data": "a,b\n1,2",
        "id": "t",
    }
    handover = {
        "fileOrder": ["f1"],
        "files": {"f1": {"parts": [csv_part]}},
    }

    joined = _joined_text_from_handover_payload(handover)

    assert joined == "a,b\n1,2"


def test_split_images_moves_pixels_to_blob_docs():
    """Image bytes move into a sidecar document; the part keeps a reference.

    Checks the single produced sidecar (mime type, decoded bytes, name
    prefix/suffix, handover role) and that the image part left in the
    payload has empty data plus the sidecar's document name.
    """
    pixel_bytes = b"fake-binary-image"
    encoded_pixels = base64.b64encode(pixel_bytes).decode("ascii")
    text_part = {"typeGroup": "text", "data": "x", "id": "t1"}
    image_part = {
        "typeGroup": "image",
        "mimeType": "image/png",
        "data": encoded_pixels,
        "id": "p1-img",
        "metadata": {},
    }
    handover = {
        "kind": HANDOVER_KIND,
        "schemaVersion": 1,
        "fileOrder": ["f1"],
        "files": {"f1": {"parts": [text_part, image_part]}},
    }

    stripped, sidecars = _split_images_to_sidecar_documents(
        handover, document_name_stem="abc"
    )

    # Exactly one sidecar carrying the decoded image bytes.
    assert len(sidecars) == 1
    sidecar = sidecars[0]
    assert sidecar.mimeType == "image/png"
    assert sidecar.documentData == pixel_bytes
    assert sidecar.documentName.endswith(".png")
    assert sidecar.documentName.startswith("extract_media_")
    validation = sidecar.validationMetadata or {}
    assert validation.get("handoverRole") == "extractedMedia"

    # The image part stays in the payload, but only as a pointer to the sidecar.
    remaining_images = []
    for part in stripped["files"]["f1"]["parts"]:
        if not isinstance(part, dict):
            continue
        if (part.get("typeGroup") or "") == "image":
            remaining_images.append(part)
    assert len(remaining_images) == 1
    placeholder = remaining_images[0]
    assert placeholder["data"] == ""
    assert placeholder["handoverMediaDocumentName"] == sidecar.documentName
    assert "image" in stripped["files"]["f1"]["byTypeGroup"]


def _mixed_payload():
    """Return a new single-file handover payload with four part kinds.

    The file "f1" holds one text, one CSV table, one PNG image and one HTML
    structure part, in that order. A fresh dict is built on every call.
    """
    mixed_parts = [
        {"typeGroup": "text", "data": "hello", "id": "t1"},
        {"typeGroup": "table", "mimeType": "text/csv", "data": "a,b", "id": "tb1"},
        {"typeGroup": "image", "mimeType": "image/png", "data": "abc=", "id": "i1"},
        {"typeGroup": "structure", "mimeType": "text/html", "data": "<p/>", "id": "s1"},
    ]
    single_file = {"parts": mixed_parts}
    return {
        "kind": HANDOVER_KIND,
        "schemaVersion": 1,
        "fileOrder": ["f1"],
        "files": {"f1": single_file},
    }


def test_content_filter_all_is_noop():
    """The "all" filter hands back the very same payload object."""
    original = _mixed_payload()
    # Identity, not equality: the filter must not copy the payload.
    assert _apply_content_filter(original, "all") is original


def test_content_filter_text_only_keeps_text_table_structure():
    """The "textOnly" filter keeps text, table and structure parts only."""
    filtered = _apply_content_filter(_mixed_payload(), "textOnly")
    kept_groups = set()
    for part in filtered["files"]["f1"]["parts"]:
        kept_groups.add(part["typeGroup"])
    assert kept_groups == {"text", "table", "structure"}
    assert "image" not in kept_groups


def test_content_filter_images_only():
    """The "imagesOnly" filter leaves exactly the single image part."""
    filtered = _apply_content_filter(_mixed_payload(), "imagesOnly")
    remaining = filtered["files"]["f1"]["parts"]
    for part in remaining:
        assert part["typeGroup"] == "image"
    assert len(remaining) == 1


def test_content_filter_no_images_removes_only_images():
    """The "noImages" filter drops the image part and keeps the rest."""
    filtered = _apply_content_filter(_mixed_payload(), "noImages")
    kept_groups = set()
    for part in filtered["files"]["f1"]["parts"]:
        kept_groups.add(part["typeGroup"])
    assert "image" not in kept_groups
    # Every non-image type group from the mixed payload must survive.
    assert {"text", "table", "structure"} == kept_groups


def test_content_filter_text_only_joined_text_has_no_image_data():
    """Joined text of a "textOnly" payload has the text but no image data."""
    filtered = _apply_content_filter(_mixed_payload(), "textOnly")
    joined = _joined_text_from_handover_payload(filtered)
    assert "hello" in joined
    # "abc=" is the image part's base64 data in the mixed payload.
    assert "abc=" not in joined


def test_content_filter_text_only_no_sidecars():
    """textOnly: with the image part filtered out, splitting yields no sidecars."""
    text_only = _apply_content_filter(_mixed_payload(), "textOnly")
    # Only the sidecar list matters here; the stripped payload is not inspected.
    _unused_payload, sidecars = _split_images_to_sidecar_documents(
        text_only, document_name_stem="test"
    )
    assert sidecars == []