Re-indexing the same file always triggered a full embedding run — ingestion.skipped.duplicate never fired. Two independent causes: 1. _computeIngestionHash included contentObjectId in its payload, but extractors generate fresh uuid4() per run, making the hash a per-run nonce. Now hashed over (contentType, data) in extractor order — stable across re-extractions, sensitive to content, ordering, and type changes. 2. _autoIndexFile upserted the fresh pre-scan FileContentIndex before requestIngestion's duplicate check, wiping structure._ingestion and status=indexed from the prior run. The pre-upsert now merges the existing _ingestion metadata and preserves the indexed status. Verified end-to-end: second PATCH /scope on an already-indexed file logs and returns in ~2s with zero embedding API calls. Adds test_ingestion_hash_stability.py (5 cases).
81 lines · 2.7 KiB · Python
#!/usr/bin/env python3
# Copyright (c) 2025 Patrick Motsch
# All rights reserved.

"""Test that _computeIngestionHash is stable across re-extractions of the same source.

Extractors generate fresh contentObjectIds (uuid.uuid4()) per run. The ingestion
hash MUST therefore be derived from content (contentType + data + order) only —
otherwise idempotency (AC4) silently fails: every re-extraction looks "new" and
triggers full re-embedding.
"""

import os
import sys
import uuid

sys.path.insert(0, os.path.join(os.path.dirname(__file__), "../../.."))

from modules.serviceCenter.services.serviceKnowledge.mainServiceKnowledge import (
    _computeIngestionHash,
)


|
def _makeObjects(seed: str = "alpha"):
|
|
"""Build a synthetic contentObjects list as routeDataFiles._autoIndexFile would."""
|
|
return [
|
|
{
|
|
"contentObjectId": str(uuid.uuid4()),
|
|
"contentType": "text",
|
|
"data": f"Page 1 of {seed}",
|
|
},
|
|
{
|
|
"contentObjectId": str(uuid.uuid4()),
|
|
"contentType": "text",
|
|
"data": f"Page 2 of {seed}",
|
|
},
|
|
{
|
|
"contentObjectId": str(uuid.uuid4()),
|
|
"contentType": "binary",
|
|
"data": "<image-bytes-as-b64>",
|
|
},
|
|
]
|
|
|
|
|
|
def test_hash_stable_across_uuid_regeneration():
    """Identical content with regenerated contentObjectIds must hash identically."""
    first = _makeObjects("alpha")
    second = _makeObjects("alpha")  # same payloads, brand-new UUIDs
    idsFirst = [obj["contentObjectId"] for obj in first]
    idsSecond = [obj["contentObjectId"] for obj in second]
    assert idsFirst != idsSecond
    assert _computeIngestionHash(first) == _computeIngestionHash(second)


|
def test_hash_changes_when_data_changes():
    """Different payload text must produce a different ingestion hash."""
    hashAlpha = _computeIngestionHash(_makeObjects("alpha"))
    hashBeta = _computeIngestionHash(_makeObjects("beta"))
    assert hashAlpha != hashBeta


|
def test_hash_is_order_sensitive():
    """Reordered pages form a different document and must change the hash."""
    forward = _makeObjects("alpha")
    backward = forward[::-1]  # same objects, reversed page order
    assert _computeIngestionHash(forward) != _computeIngestionHash(backward)


|
def test_hash_distinguishes_text_vs_binary_with_same_payload():
    """The same data bytes under a different contentType must not collide."""
    asText = [{"contentObjectId": "x", "contentType": "text", "data": "hello"}]
    asBinary = [{"contentObjectId": "x", "contentType": "binary", "data": "hello"}]
    assert _computeIngestionHash(asText) != _computeIngestionHash(asBinary)


|
def test_hash_handles_empty_input():
    """Empty contentObjects must hash deterministically without raising.

    The original assertion only compared two identical calls — any
    deterministic function passes that trivially. Additionally require
    that an empty document does not collide with a non-empty one.
    """
    emptyHash = _computeIngestionHash([])
    assert emptyHash == _computeIngestionHash([])
    assert emptyHash != _computeIngestionHash(_makeObjects("alpha"))


|
if __name__ == "__main__":
    # Plain-python fallback runner (the suite is normally driven by pytest).
    _allTests = (
        test_hash_stable_across_uuid_regeneration,
        test_hash_changes_when_data_changes,
        test_hash_is_order_sensitive,
        test_hash_distinguishes_text_vs_binary_with_same_payload,
        test_hash_handles_empty_input,
    )
    for _test in _allTests:
        _test()
    print("OK — all 5 ingestion-hash stability tests passed")