#!/usr/bin/env python3
# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
"""Stability tests for _computeIngestionHash across repeated extractions.

Each extraction run assigns brand-new contentObjectIds (uuid.uuid4()), so the
ingestion hash has to depend on content alone (contentType + data + order).
If the ids leaked into the hash, idempotency (AC4) would break silently:
every re-extraction would look "new" and trigger a full re-embedding.
"""

import os
import sys
import uuid

# Put the directory three levels above this file at the front of sys.path so
# the `modules...` import below can resolve (presumably the project root).
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "../../.."))

from modules.serviceCenter.services.serviceKnowledge.mainServiceKnowledge import (
    _computeIngestionHash,
)


def _makeObjects(seed: str = "alpha"):
    """Return three synthetic content objects, each with a fresh random id.

    The list mimics what routeDataFiles._autoIndexFile would hand over: two
    text pages whose data embeds *seed*, followed by one binary object with
    empty data. Only the contentObjectId values differ between two calls
    made with the same seed.
    """
    payloads = (
        ("text", f"Page 1 of {seed}"),
        ("text", f"Page 2 of {seed}"),
        ("binary", ""),
    )
    # Key order (id, type, data) is kept the same as in the literal dicts.
    return [
        {
            "contentObjectId": str(uuid.uuid4()),
            "contentType": kind,
            "data": body,
        }
        for kind, body in payloads
    ]


def test_hash_stable_across_uuid_regeneration():
    """Identical content with different contentObjectIds hashes the same."""
    firstRun = _makeObjects("alpha")
    secondRun = _makeObjects("alpha")  # same data, newly generated UUIDs

    firstIds = [obj["contentObjectId"] for obj in firstRun]
    secondIds = [obj["contentObjectId"] for obj in secondRun]
    # Sanity check: the two runs really do carry different ids.
    assert firstIds != secondIds

    firstHash = _computeIngestionHash(firstRun)
    secondHash = _computeIngestionHash(secondRun)
    assert firstHash == secondHash


def test_hash_changes_when_data_changes():
    """Different data payloads must not collide on the same hash."""
    alphaObjects = _makeObjects("alpha")
    betaObjects = _makeObjects("beta")

    alphaHash = _computeIngestionHash(alphaObjects)
    betaHash = _computeIngestionHash(betaObjects)
    assert alphaHash != betaHash


def test_hash_is_order_sensitive():
    """The same objects in reverse order count as a different document."""
    original = _makeObjects("alpha")
    flipped = original[::-1]

    originalHash = _computeIngestionHash(original)
    flippedHash = _computeIngestionHash(flipped)
    assert originalHash != flippedHash


def test_hash_distinguishes_text_vs_binary_with_same_payload():
    """Equal id and data but a different contentType gives a different hash."""
    asText = [{"contentObjectId": "x", "contentType": "text", "data": "hello"}]
    asBinary = [{"contentObjectId": "x", "contentType": "binary", "data": "hello"}]

    textHash = _computeIngestionHash(asText)
    binaryHash = _computeIngestionHash(asBinary)
    assert textHash != binaryHash


def test_hash_handles_empty_input():
    """An empty object list is accepted and hashes consistently."""
    firstHash = _computeIngestionHash([])
    secondHash = _computeIngestionHash([])
    assert firstHash == secondHash


if __name__ == "__main__":
    # Stand-alone runner: execute every test in declaration order; the first
    # failing assertion aborts the run before the success line is printed.
    for _case in (
        test_hash_stable_across_uuid_regeneration,
        test_hash_changes_when_data_changes,
        test_hash_is_order_sensitive,
        test_hash_distinguishes_text_vs_binary_with_same_payload,
        test_hash_handles_empty_input,
    ):
        _case()
    print("OK — all 5 ingestion-hash stability tests passed")