gateway/tests/unit/services/test_knowledge_ingest_consumer.py
Ida 6a5ff1ff7c feat(rag): P1 user-connection hooks + retrieval threshold fix
- connection.established/revoked callbacks from OAuth routes and
  connection management endpoints
- KnowledgeIngestionConsumer dispatches bootstrap job (established)
  and synchronous purge (revoked)
- FileContentIndex: add connectionId + sourceKind columns
- SharePoint bootstrap with @odata.nextLink pagination and eTag-based
  idempotency
- Outlook bootstrap treats messages as virtual documents with
  cleanEmailBody for HTML/quote/signature stripping
- fix(rag): lower buildAgentContext minScore thresholds from
  0.55/0.65/0.70 to 0.35 — previous values blocked all real matches
  from text-embedding-3-small
- 24 new unit tests covering purge, consumer dispatch, email cleaning
  and both bootstrap paths
2026-04-29 14:39:40 +02:00

172 lines
5.5 KiB
Python

#!/usr/bin/env python3
# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
"""Unit tests for KnowledgeIngestionConsumer event dispatch.
- `connection.established` → enqueue a `connection.bootstrap` job.
- `connection.revoked` → synchronous purge via KnowledgeObjects.
"""
import asyncio
import os
import sys
import types
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "../../.."))
from modules.serviceCenter.services.serviceKnowledge import subConnectorIngestConsumer as consumer
def _resetRegistration(monkeypatch):
    """Set the consumer module's ``_registered`` flag back to ``False``.

    The flag is patched through ``monkeypatch`` rather than assigned directly,
    so a test that calls this helper starts from an unregistered consumer.
    """
    guardAttr = "_registered"
    monkeypatch.setattr(consumer, guardAttr, False)
def test_onConnectionEstablished_enqueues_bootstrap(monkeypatch):
    """``_onConnectionEstablished`` must enqueue exactly one bootstrap job.

    ``consumer.startJob`` is replaced by a recording coroutine. After the
    callback returns, the test checks the job type, the ``connectionId`` and
    ``authority`` payload fields, and the ``triggeredBy`` keyword argument.
    """
    startedJobs = []

    async def _fakeStartJob(jobType, payload, **kwargs):
        # Record every call so the assertions below can inspect it.
        startedJobs.append({"jobType": jobType, "payload": payload, "kwargs": kwargs})
        return "job-1"

    monkeypatch.setattr(consumer, "startJob", _fakeStartJob)

    consumer._onConnectionEstablished(
        connectionId="c1", authority="msft", userId="u1"
    )

    # No event-loop handling is done here on purpose: the assertions only look
    # at calls recorded by the time the callback has returned. Creating a
    # throwaway loop, installing it with asyncio.set_event_loop() and closing
    # it would not run any pending work, and it would leave a *closed* loop as
    # the thread's current loop for whatever test runs next.
    # NOTE(review): this relies on the consumer driving startJob to completion
    # before returning — confirm against the consumer implementation.
    assert len(startedJobs) == 1

    job = startedJobs[0]
    assert job["jobType"] == consumer.BOOTSTRAP_JOB_TYPE
    assert job["payload"]["connectionId"] == "c1"
    assert job["payload"]["authority"] == "msft"
    assert job["kwargs"]["triggeredBy"] == "u1"
def test_onConnectionEstablished_ignores_missing_id(monkeypatch):
    """An empty ``connectionId`` must never reach ``startJob``."""
    invocations = []

    async def _spyStartJob(*args, **kwargs):
        # Any call at all is a failure; the marker value itself is irrelevant.
        invocations.append(1)
        return "x"

    monkeypatch.setattr(consumer, "startJob", _spyStartJob)

    consumer._onConnectionEstablished(connectionId="", authority="msft")

    assert invocations == []
def test_onConnectionRevoked_runs_sync_purge(monkeypatch):
    """``_onConnectionRevoked`` must purge the index for the revoked connection.

    ``consumer.getKnowledgeInterface`` is patched to hand out a stub whose
    ``deleteFileContentIndexByConnectionId`` records the ids it is called with.
    The test expects exactly one call, carrying ``"c1"``.
    """
    purgedIds = []

    class _FakeKnowledge:
        def deleteFileContentIndexByConnectionId(self, cid):
            purgedIds.append(cid)
            return {"indexRows": 2, "chunks": 5}

    # A single shared instance is returned for every lookup.
    knowledgeStub = _FakeKnowledge()
    monkeypatch.setattr(
        consumer, "getKnowledgeInterface", lambda _user=None: knowledgeStub
    )

    consumer._onConnectionRevoked(
        connectionId="c1", authority="msft", userId="u1", reason="disconnected"
    )

    assert purgedIds == ["c1"]
def test_onConnectionRevoked_ignores_missing_id(monkeypatch):
    """An empty ``connectionId`` must not trigger any purge call."""
    purgedIds = []

    class _RecordingKnowledge:
        def deleteFileContentIndexByConnectionId(self, cid):
            purgedIds.append(cid)
            return {"indexRows": 0, "chunks": 0}

    def _fakeGetInterface(_user=None):
        # A fresh stub per lookup; all of them record into the same list.
        return _RecordingKnowledge()

    monkeypatch.setattr(consumer, "getKnowledgeInterface", _fakeGetInterface)

    consumer._onConnectionRevoked(connectionId="")

    assert purgedIds == []
def test_bootstrap_job_skips_non_pilot_authority(monkeypatch):
    """A ``google`` authority must make the bootstrap handler report a skip.

    The handler result is expected to carry ``skipped is True`` and to echo
    the authority back.
    """
    job = {"payload": {"connectionId": "c1", "authority": "google"}}

    def _ignoreProgress(*_):
        return None

    async def _invokeHandler():
        # Await inside a coroutine so the handler runs under a live loop.
        return await consumer._bootstrapJobHandler(job, _ignoreProgress)

    outcome = asyncio.run(_invokeHandler())

    assert outcome["skipped"] is True
    assert outcome["authority"] == "google"
def test_bootstrap_job_dispatches_msft_parts(monkeypatch):
    """An ``msft`` bootstrap job must run the SharePoint and Outlook parts once.

    Both bootstrap entry points are replaced by counting coroutines. Their
    return values must appear in the handler result under ``"sharepoint"`` and
    ``"outlook"``.
    """
    invocations = {"sp": 0, "ol": 0}

    async def _stubSharepoint(connectionId, progressCb=None):
        invocations["sp"] += 1
        return {"indexed": 1}

    async def _stubOutlook(connectionId, progressCb=None):
        invocations["ol"] += 1
        return {"indexed": 2}

    def _moduleWith(moduleName, attrName, entryPoint):
        fake = types.ModuleType(moduleName)
        setattr(fake, attrName, entryPoint)
        return fake

    # The original test notes state that the subConnectorSync* modules are
    # imported lazily inside the handler, so the fakes are placed in
    # sys.modules before the handler is invoked.
    fakeModules = (
        (
            "modules.serviceCenter.services.serviceKnowledge.subConnectorSyncSharepoint",
            _moduleWith("subConnectorSyncSharepoint", "bootstrapSharepoint", _stubSharepoint),
        ),
        (
            "modules.serviceCenter.services.serviceKnowledge.subConnectorSyncOutlook",
            _moduleWith("subConnectorSyncOutlook", "bootstrapOutlook", _stubOutlook),
        ),
    )
    for qualifiedName, fakeModule in fakeModules:
        monkeypatch.setitem(sys.modules, qualifiedName, fakeModule)

    job = {"payload": {"connectionId": "c1", "authority": "msft"}}

    def _ignoreProgress(*_):
        return None

    async def _invokeHandler():
        return await consumer._bootstrapJobHandler(job, _ignoreProgress)

    outcome = asyncio.run(_invokeHandler())

    assert invocations == {"sp": 1, "ol": 1}
    assert outcome["sharepoint"] == {"indexed": 1}
    assert outcome["outlook"] == {"indexed": 2}
if __name__ == "__main__":
    # Direct execution only prints a hint to run the suite through pytest.
    # The _MP class below is a minimal monkeypatch look-alike; nothing in this
    # block instantiates it.
    class _MP:
        def __init__(self):
            # (container, key-or-attribute, previous value) for each patch.
            self.undos = []

        def setattr(self, target, name_or_value, value=None):
            # Only the three-argument form is supported; a missing (or None)
            # value aborts with a pointer to the real pytest fixture.
            if value is None:
                raise SystemExit("use pytest monkeypatch in CLI")
            previous = getattr(target, name_or_value)
            self.undos.append((target, name_or_value, previous))
            setattr(target, name_or_value, value)

        def setitem(self, mapping, key, value):
            previous = mapping.get(key)
            self.undos.append((mapping, key, previous))
            mapping[key] = value

    print("Run via pytest: pytest tests/unit/services/test_knowledge_ingest_consumer.py")