# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
"""Unit tests for the P0 ingestion facade on KnowledgeService.
|
|
|
|
Covers acceptance criteria AC4 (idempotent ingestion for unchanged content)
|
|
and hash stability. The knowledge DB interface and AI embedding service are
|
|
stubbed so the test runs without any external dependency.
|
|
"""
|
|
|
|
from unittest.mock import MagicMock, patch
|
|
|
|
import pytest
|
|
|
|
from modules.serviceCenter.services.serviceKnowledge.mainServiceKnowledge import (
|
|
IngestionJob,
|
|
KnowledgeService,
|
|
_computeIngestionHash,
|
|
)
|
|
|
|
|
|
class _StubKnowledgeDb:
    """Minimal in-memory stand-in for interfaceDbKnowledge.

    Holds at most one file content index (as a plain dict) and counts how
    often the upsert methods are called, so tests can assert on the amount
    of write work that was performed.
    """

    def __init__(self):
        # Most recently upserted index as a dict; None until the first upsert.
        self.index = None
        # Call counters inspected by the tests.
        self.upsertIndexCalls = 0
        self.upsertChunkCalls = 0

    def upsertFileContentIndex(self, index):
        """Store ``index`` as a plain dict and count the call.

        Objects exposing ``model_dump`` (presumably pydantic-style models)
        are converted through it; anything else is passed to ``dict()``,
        which yields a shallow copy for mapping inputs.
        """
        self.index = index.model_dump() if hasattr(index, "model_dump") else dict(index)
        self.upsertIndexCalls += 1

    def upsertContentChunk(self, chunk):
        """Count the call; the chunk itself is not stored."""
        self.upsertChunkCalls += 1

    def updateFileStatus(self, fileId, status):
        """Set ``status`` on the stored index, if any.

        ``fileId`` is ignored because the stub only ever holds one index.
        """
        if self.index is not None:
            self.index["status"] = status

    def getFileContentIndex(self, fileId):
        """Return the stored index dict (or None), regardless of ``fileId``."""
        return self.index
|
|
|
|
|
|
def _makeService():
    """Create a KnowledgeService with stubbed db and ai dependencies.

    Returns:
        A ``(service, stubDb)`` tuple: the service under test and the
        in-memory ``_StubKnowledgeDb`` that was handed to it in place of
        the real knowledge interface.
    """
    stubDb = _StubKnowledgeDb()

    aiService = MagicMock()

    async def _callEmbedding(texts):
        # One fixed 4-dimensional zero vector per input text, no errors.
        return MagicMock(
            errorCount=0,
            content="",
            metadata={"embeddings": [[0.0] * 4 for _ in texts]},
        )

    aiService.callEmbedding = _callEmbedding

    def getService(name):
        # Only the "ai" service exists here; any other lookup fails loudly.
        if name == "ai":
            return aiService
        raise KeyError(name)

    context = MagicMock()
    context.user = MagicMock()
    # Return a one-element recordset holding an empty dict so the FileItem
    # lookup branch in _indexFileInternal resolves without touching a real DB.
    context.interfaceDbComponent = MagicMock()
    context.interfaceDbComponent.getRecordset = MagicMock(return_value=[{}])

    # getKnowledgeInterface is patched only while the service is constructed.
    with patch(
        "modules.serviceCenter.services.serviceKnowledge.mainServiceKnowledge.getKnowledgeInterface",
        return_value=stubDb,
    ):
        service = KnowledgeService(context, getService)

    return service, stubDb
|
|
|
|
|
|
@pytest.mark.asyncio
async def test_duplicate_skipped():
    """Submitting the same job twice indexes once and then reports a duplicate."""
    service, db = _makeService()
    job = IngestionJob(
        sourceKind="file",
        sourceId="file-123",
        fileName="a.txt",
        mimeType="text/plain",
        userId="u1",
        contentObjects=[
            {"contentObjectId": "c1", "contentType": "text", "data": "hello world"}
        ],
    )

    first = await service.requestIngestion(job)
    assert first.status == "indexed"
    chunksAfterFirst = db.upsertChunkCalls
    # Guard against a vacuous test: the first run must have written chunks.
    assert chunksAfterFirst >= 1

    second = await service.requestIngestion(job)
    assert second.status == "duplicate"
    assert second.contentHash == first.contentHash
    # No additional chunk writes happened for the unchanged content.
    assert db.upsertChunkCalls == chunksAfterFirst
|
|
|
|
|
|
@pytest.mark.asyncio
async def test_reindex_on_content_change():
    """Changing the content of the same source triggers a fresh indexing run."""
    service, db = _makeService()
    base = IngestionJob(
        sourceKind="file",
        sourceId="file-123",
        fileName="a.txt",
        mimeType="text/plain",
        userId="u1",
        contentObjects=[
            {"contentObjectId": "c1", "contentType": "text", "data": "hello world"}
        ],
    )
    first = await service.requestIngestion(base)
    assert first.status == "indexed"
    chunksAfterFirst = db.upsertChunkCalls

    # Same source identity as `base`; only the text payload differs.
    changed = IngestionJob(
        sourceKind="file",
        sourceId="file-123",
        fileName="a.txt",
        mimeType="text/plain",
        userId="u1",
        contentObjects=[
            {"contentObjectId": "c1", "contentType": "text", "data": "hello universe"}
        ],
    )
    second = await service.requestIngestion(changed)
    assert second.status == "indexed"
    assert second.contentHash != first.contentHash
    # The second run must have written at least one more chunk.
    assert db.upsertChunkCalls > chunksAfterFirst
|
|
|
|
|
|
def test_hash_stable_under_reordering():
    """The ingestion hash is the same for reversed content-object order."""
    a = [
        {"contentObjectId": "c1", "contentType": "text", "data": "alpha"},
        {"contentObjectId": "c2", "contentType": "text", "data": "beta"},
    ]
    # Same two objects, opposite order.
    b = list(reversed(a))
    assert _computeIngestionHash(a) == _computeIngestionHash(b)
|
|
|
|
|
|
def test_hash_changes_on_data_edit():
    """Editing the ``data`` of a content object changes the ingestion hash."""
    a = [{"contentObjectId": "c1", "contentType": "text", "data": "alpha"}]
    # Identical id and type; only the data differs, by one character.
    b = [{"contentObjectId": "c1", "contentType": "text", "data": "alpha!"}]
    assert _computeIngestionHash(a) != _computeIngestionHash(b)
|
|
|
|
|
|
@pytest.mark.asyncio
async def test_get_ingestion_status_after_index():
    """getIngestionStatus reflects a freshly indexed non-file source.

    After ingesting a ``coaching_session`` job, the returned status mapping
    must report ``"indexed"``, echo the job's source kind, and carry the
    same content hash as the ingestion handle.
    """
    service, _db = _makeService()
    job = IngestionJob(
        sourceKind="coaching_session",
        sourceId="coaching-session:abc",
        fileName="session",
        mimeType="application/x-coaching-session",
        userId="u1",
        contentObjects=[
            {"contentObjectId": "m0", "contentType": "text", "data": "User: hi"}
        ],
        provenance={"lane": "feature", "feature": "commcoach"},
    )
    handle = await service.requestIngestion(job)
    status = service.getIngestionStatus(handle)
    assert status["status"] == "indexed"
    assert status["sourceKind"] == "coaching_session"
    assert status["contentHash"] == handle.contentHash
|