# gateway/tests/unit/serviceKnowledge/test_requestIngestion.py
# 2026-04-29 14:39:40 +02:00
#
# 172 lines
# 5.2 KiB
# Python

# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
"""Unit tests for the P0 ingestion facade on KnowledgeService.

Covers acceptance criterion AC4 (idempotent ingestion for unchanged content)
and hash stability. The knowledge DB interface and the AI embedding service
are stubbed so the tests run without any external dependency.
"""
from unittest.mock import MagicMock, patch
import pytest
from modules.serviceCenter.services.serviceKnowledge.mainServiceKnowledge import (
IngestionJob,
KnowledgeService,
_computeIngestionHash,
)
class _StubKnowledgeDb:
"""Minimal in-memory stand-in for interfaceDbKnowledge."""
def __init__(self):
self.index = None
self.upsertIndexCalls = 0
self.upsertChunkCalls = 0
def upsertFileContentIndex(self, index):
self.index = index.model_dump() if hasattr(index, "model_dump") else dict(index)
self.upsertIndexCalls += 1
def upsertContentChunk(self, chunk):
self.upsertChunkCalls += 1
def updateFileStatus(self, fileId, status):
if self.index is not None:
self.index["status"] = status
def getFileContentIndex(self, fileId):
return self.index
def _makeService():
    """Create a KnowledgeService with stubbed db and ai dependencies.

    Returns:
        A ``(service, stubDb)`` tuple: the service under test and the
        ``_StubKnowledgeDb`` instance handed to it, so tests can inspect the
        recorded calls.
    """
    stubDb = _StubKnowledgeDb()
    aiService = MagicMock()

    async def _callEmbedding(texts):
        # Successful response carrying one 4-element zero vector per text.
        return MagicMock(
            errorCount=0,
            content="",
            metadata={"embeddings": [[0.0] * 4 for _ in texts]},
        )

    # Assign a real coroutine function so the call can be awaited.
    aiService.callEmbedding = _callEmbedding

    def getService(name):
        # Only the "ai" service is available; any other lookup fails loudly.
        if name == "ai":
            return aiService
        raise KeyError(name)

    context = MagicMock()
    context.user = MagicMock()
    # Return a one-element recordset holding an empty dict so the FileItem
    # lookup branch in _indexFileInternal resolves without touching a real DB.
    context.interfaceDbComponent = MagicMock()
    context.interfaceDbComponent.getRecordset = MagicMock(return_value=[{}])

    # The patch is only active while the service is being constructed.
    with patch(
        "modules.serviceCenter.services.serviceKnowledge.mainServiceKnowledge.getKnowledgeInterface",
        return_value=stubDb,
    ):
        service = KnowledgeService(context, getService)
    return service, stubDb
@pytest.mark.asyncio
async def test_duplicate_skipped():
    """Re-submitting an unchanged job reports "duplicate" and writes no chunks."""
    service, db = _makeService()
    job = IngestionJob(
        sourceKind="file",
        sourceId="file-123",
        fileName="a.txt",
        mimeType="text/plain",
        userId="u1",
        contentObjects=[
            {"contentObjectId": "c1", "contentType": "text", "data": "hello world"}
        ],
    )

    first = await service.requestIngestion(job)
    assert first.status == "indexed"
    chunksAfterFirst = db.upsertChunkCalls
    assert chunksAfterFirst >= 1

    second = await service.requestIngestion(job)
    assert second.status == "duplicate"
    assert second.contentHash == first.contentHash
    # The duplicate request must not upsert any further chunks.
    assert db.upsertChunkCalls == chunksAfterFirst
@pytest.mark.asyncio
async def test_reindex_on_content_change():
    """Changed content for the same source is indexed again with a new hash."""
    service, db = _makeService()

    def _jobWithText(text):
        # Same source identity every time; only the content payload varies.
        return IngestionJob(
            sourceKind="file",
            sourceId="file-123",
            fileName="a.txt",
            mimeType="text/plain",
            userId="u1",
            contentObjects=[
                {"contentObjectId": "c1", "contentType": "text", "data": text}
            ],
        )

    first = await service.requestIngestion(_jobWithText("hello world"))
    assert first.status == "indexed"
    chunksAfterFirst = db.upsertChunkCalls

    second = await service.requestIngestion(_jobWithText("hello universe"))
    assert second.status == "indexed"
    assert second.contentHash != first.contentHash
    # Re-indexing must have written at least one more chunk.
    assert db.upsertChunkCalls > chunksAfterFirst
def test_hash_stable_under_reordering():
    """The ingestion hash does not depend on the order of content objects."""
    a = [
        {"contentObjectId": "c1", "contentType": "text", "data": "alpha"},
        {"contentObjectId": "c2", "contentType": "text", "data": "beta"},
    ]
    b = list(reversed(a))
    assert _computeIngestionHash(a) == _computeIngestionHash(b)
def test_hash_changes_on_data_edit():
    """Editing a content object's data yields a different ingestion hash."""
    a = [{"contentObjectId": "c1", "contentType": "text", "data": "alpha"}]
    b = [{"contentObjectId": "c1", "contentType": "text", "data": "alpha!"}]
    assert _computeIngestionHash(a) != _computeIngestionHash(b)
@pytest.mark.asyncio
async def test_get_ingestion_status_after_index():
    """getIngestionStatus reflects the job that was just ingested."""
    service, _db = _makeService()
    job = IngestionJob(
        sourceKind="coaching_session",
        sourceId="coaching-session:abc",
        fileName="session",
        mimeType="application/x-coaching-session",
        userId="u1",
        contentObjects=[
            {"contentObjectId": "m0", "contentType": "text", "data": "User: hi"}
        ],
        provenance={"lane": "feature", "feature": "commcoach"},
    )

    handle = await service.requestIngestion(job)
    status = service.getIngestionStatus(handle)

    assert status["status"] == "indexed"
    assert status["sourceKind"] == "coaching_session"
    assert status["contentHash"] == handle.contentHash