gateway/tests/unit/services/test_ingestion_hash_stability.py
Ida dff3d41845 fix(rag): stable ingestion idempotency across re-extractions (AC4)
Re-indexing the same file always triggered a full embedding run —
ingestion.skipped.duplicate never fired. Two independent causes:

1. _computeIngestionHash included contentObjectId in its payload, but
   extractors generate fresh uuid4() per run, making the hash a
   per-run nonce. Now hashed over (contentType, data) in extractor
   order — stable across re-extractions, sensitive to content,
   ordering, and type changes.
2. _autoIndexFile upserted the fresh pre-scan FileContentIndex before
   requestIngestion's duplicate check, wiping structure._ingestion
   and status=indexed from the prior run. The pre-upsert now merges
   the existing _ingestion metadata and preserves the indexed status.

Verified end-to-end: a second PATCH /scope on an already-indexed file
logs ingestion.skipped.duplicate and returns in ~2s
with zero embedding API calls.

Adds test_ingestion_hash_stability.py (5 cases).
2026-04-29 14:39:40 +02:00

#!/usr/bin/env python3
# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
"""Test that _computeIngestionHash is stable across re-extractions of the same source.
Extractors generate fresh contentObjectIds (uuid.uuid4()) per run. The ingestion
hash MUST therefore be derived from content (contentType + data + order) only —
otherwise idempotency (AC4) silently fails: every re-extraction looks "new" and
triggers full re-embedding.
"""
import os
import sys
import uuid
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "../../.."))
from modules.serviceCenter.services.serviceKnowledge.mainServiceKnowledge import (
_computeIngestionHash,
)


def _makeObjects(seed: str = "alpha"):
    """Build a synthetic contentObjects list as routeDataFiles._autoIndexFile would."""
    return [
        {
            "contentObjectId": str(uuid.uuid4()),
            "contentType": "text",
            "data": f"Page 1 of {seed}",
        },
        {
            "contentObjectId": str(uuid.uuid4()),
            "contentType": "text",
            "data": f"Page 2 of {seed}",
        },
        {
            "contentObjectId": str(uuid.uuid4()),
            "contentType": "binary",
            "data": "<image-bytes-as-b64>",
        },
    ]


def test_hash_stable_across_uuid_regeneration():
    """Same content + different contentObjectIds → same hash."""
    a = _makeObjects("alpha")
    b = _makeObjects("alpha")  # identical data, fresh UUIDs
    assert [o["contentObjectId"] for o in a] != [o["contentObjectId"] for o in b]
    assert _computeIngestionHash(a) == _computeIngestionHash(b)


def test_hash_changes_when_data_changes():
    a = _makeObjects("alpha")
    b = _makeObjects("beta")
    assert _computeIngestionHash(a) != _computeIngestionHash(b)


def test_hash_is_order_sensitive():
    """Reordered pages produce a different hash (different document)."""
    a = _makeObjects("alpha")
    b = list(reversed(a))
    assert _computeIngestionHash(a) != _computeIngestionHash(b)


def test_hash_distinguishes_text_vs_binary_with_same_payload():
    a = [{"contentObjectId": "x", "contentType": "text", "data": "hello"}]
    b = [{"contentObjectId": "x", "contentType": "binary", "data": "hello"}]
    assert _computeIngestionHash(a) != _computeIngestionHash(b)


def test_hash_handles_empty_input():
    assert _computeIngestionHash([]) == _computeIngestionHash([])


if __name__ == "__main__":
    test_hash_stable_across_uuid_regeneration()
    test_hash_changes_when_data_changes()
    test_hash_is_order_sensitive()
    test_hash_distinguishes_text_vs_binary_with_same_payload()
    test_hash_handles_empty_input()
    print("OK — all 5 ingestion-hash stability tests passed")