# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
"""Unit tests for the P0 ingestion facade on KnowledgeService.

Covers acceptance criteria AC4 (idempotent ingestion for unchanged content)
and hash stability. The knowledge DB interface and AI embedding service are
stubbed so the test runs without any external dependency.
"""

from unittest.mock import MagicMock, patch

import pytest

from modules.serviceCenter.services.serviceKnowledge.mainServiceKnowledge import (
    IngestionJob,
    KnowledgeService,
    _computeIngestionHash,
)


class _StubKnowledgeDb:
    """In-memory replacement for the knowledge DB interface.

    Keeps a single index record (as a plain dict) and counts how often the
    index and chunk upsert methods were invoked, so tests can assert on the
    amount of write activity.
    """

    def __init__(self):
        self.index = None
        self.upsertIndexCalls = 0
        self.upsertChunkCalls = 0

    def upsertFileContentIndex(self, index):
        """Store ``index`` as a dict and count the call."""
        # Pydantic-style models expose model_dump(); anything else is
        # converted with dict().
        if hasattr(index, "model_dump"):
            snapshot = index.model_dump()
        else:
            snapshot = dict(index)
        self.index = snapshot
        self.upsertIndexCalls += 1

    def upsertContentChunk(self, chunk):
        """Count the chunk upsert; the chunk itself is discarded."""
        self.upsertChunkCalls += 1

    def updateFileStatus(self, fileId, status):
        """Set ``status`` on the stored index record, if there is one."""
        if self.index is None:
            return
        self.index["status"] = status

    def getFileContentIndex(self, fileId):
        """Return the stored index record (``None`` until the first upsert)."""
        return self.index


def _makeService():
    """Build a KnowledgeService wired to stubbed db and ai dependencies.

    Returns:
        A ``(service, stubDb)`` tuple; ``stubDb`` is the ``_StubKnowledgeDb``
        instance that ``getKnowledgeInterface`` was patched to return while
        the service was constructed.
    """
    stubDb = _StubKnowledgeDb()
    aiService = MagicMock()

    async def _callEmbedding(texts):
        # One 4-dimensional zero vector per input text.
        vectors = [[0.0] * 4 for _ in texts]
        return MagicMock(
            errorCount=0,
            content="",
            metadata={"embeddings": vectors},
        )

    aiService.callEmbedding = _callEmbedding

    def getService(name):
        # Only the "ai" service is available in this test setup.
        if name != "ai":
            raise KeyError(name)
        return aiService

    # Return a non-empty but empty-dict record so the FileItem lookup branch
    # in _indexFileInternal resolves without touching a real DB.
    dbComponent = MagicMock()
    dbComponent.getRecordset = MagicMock(return_value=[{}])

    context = MagicMock()
    context.user = MagicMock()
    context.interfaceDbComponent = dbComponent

    patchTarget = "modules.serviceCenter.services.serviceKnowledge.mainServiceKnowledge.getKnowledgeInterface"
    with patch(patchTarget, return_value=stubDb):
        service = KnowledgeService(context, getService)
    return service, stubDb


def _textObject(contentObjectId, data):
    """Return a text content-object dict with the given id and payload."""
    return {"contentObjectId": contentObjectId, "contentType": "text", "data": data}


def _textFileJob(data):
    """Return the file IngestionJob used by the idempotency tests.

    Everything except the text payload ``data`` is fixed.
    """
    return IngestionJob(
        sourceKind="file",
        sourceId="file-123",
        fileName="a.txt",
        mimeType="text/plain",
        userId="u1",
        contentObjects=[_textObject("c1", data)],
    )


@pytest.mark.asyncio
async def test_duplicate_skipped():
    """Submitting the identical job twice reports "duplicate" the second time."""
    service, db = _makeService()
    job = _textFileJob("hello world")

    firstHandle = await service.requestIngestion(job)
    assert firstHandle.status == "indexed"
    chunkWritesAfterFirst = db.upsertChunkCalls
    assert chunkWritesAfterFirst >= 1

    secondHandle = await service.requestIngestion(job)
    assert secondHandle.status == "duplicate"
    assert secondHandle.contentHash == firstHandle.contentHash
    # The duplicate path must not write any further chunks.
    assert db.upsertChunkCalls == chunkWritesAfterFirst


@pytest.mark.asyncio
async def test_reindex_on_content_change():
    """Changed content for the same source is indexed again with a new hash."""
    service, db = _makeService()

    firstHandle = await service.requestIngestion(_textFileJob("hello world"))
    assert firstHandle.status == "indexed"
    chunkWritesAfterFirst = db.upsertChunkCalls

    secondHandle = await service.requestIngestion(_textFileJob("hello universe"))
    assert secondHandle.status == "indexed"
    assert secondHandle.contentHash != firstHandle.contentHash
    assert db.upsertChunkCalls > chunkWritesAfterFirst


def test_hash_stable_under_reordering():
    """The ingestion hash does not depend on content-object order."""
    forward = [_textObject("c1", "alpha"), _textObject("c2", "beta")]
    backward = forward[::-1]
    assert _computeIngestionHash(forward) == _computeIngestionHash(backward)


def test_hash_changes_on_data_edit():
    """Editing a content object's data changes the ingestion hash."""
    before = [_textObject("c1", "alpha")]
    after = [_textObject("c1", "alpha!")]
    assert _computeIngestionHash(before) != _computeIngestionHash(after)


@pytest.mark.asyncio
async def test_get_ingestion_status_after_index():
    """getIngestionStatus reflects the state of a freshly indexed job."""
    service, _db = _makeService()
    coachingJob = IngestionJob(
        sourceKind="coaching_session",
        sourceId="coaching-session:abc",
        fileName="session",
        mimeType="application/x-coaching-session",
        userId="u1",
        contentObjects=[_textObject("m0", "User: hi")],
        provenance={"lane": "feature", "feature": "commcoach"},
    )

    handle = await service.requestIngestion(coachingJob)
    report = service.getIngestionStatus(handle)

    assert report["status"] == "indexed"
    assert report["sourceKind"] == "coaching_session"
    assert report["contentHash"] == handle.contentHash