gateway/tests/unit/services/test_bootstrap_sharepoint.py
Ida 6a5ff1ff7c feat(rag): P1 user-connection hooks + retrieval threshold fix
- connection.established/revoked callbacks from OAuth routes and
  connection management endpoints
- KnowledgeIngestionConsumer dispatches bootstrap job (established)
  and synchronous purge (revoked)
- FileContentIndex: add connectionId + sourceKind columns
- SharePoint bootstrap with @odata.nextLink pagination and eTag-based
  idempotency
- Outlook bootstrap treats messages as virtual documents with
  cleanEmailBody for HTML/quote/signature stripping
- fix(rag): lower buildAgentContext minScore thresholds from
  0.55/0.65/0.70 to 0.35 — previous values blocked all real matches
  from text-embedding-3-small
- 24 new unit tests covering purge, consumer dispatch, email cleaning
  and both bootstrap paths
2026-04-29 14:39:40 +02:00

209 lines
6.3 KiB
Python

#!/usr/bin/env python3
# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
"""Bootstrap SharePoint tests with a fake adapter + knowledge service.
Verifies:
- Every discovered file triggers `requestIngestion`.
- Duplicate runs (same eTag revisions) report `skippedDuplicate`.
- Synthetic fileIds are stable across runs so idempotency works end-to-end.
"""
import asyncio
import os
import sys
from dataclasses import dataclass
from types import SimpleNamespace
from typing import Any, Dict, List, Optional
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "../../.."))
from modules.serviceCenter.services.serviceKnowledge.subConnectorSyncSharepoint import (
bootstrapSharepoint,
_syntheticFileId,
)
@dataclass
class _ExtEntry:
    """Stand-in for one entry of an adapter ``browse`` listing (file or folder).

    Only the attributes that the fake adapter in this module populates are
    modelled here.
    """

    name: str  # display name, e.g. "f1.txt"
    path: str  # adapter path, e.g. "/sites/site-1/f1.txt"
    isFolder: bool = False  # True for site and folder entries
    size: Optional[int] = None
    mimeType: Optional[str] = None
    # Free-form metadata; the fixtures in this module set "id" and, for files,
    # "revision". Annotated Optional because the default is None, not a dict.
    metadata: Optional[Dict[str, Any]] = None
class _FakeSpAdapter:
    """Minimal SharepointAdapter stand-in.

    Layout:
        "/"                  → 1 site
        "/sites/site-1"      → 2 files (f1, f2) + 1 folder (sub)
        "/sites/site-1/sub"  → 1 file (f3)
    """

    def __init__(self):
        # Paths handed to download(), in call order, so tests can assert on
        # which files were fetched and in which sequence.
        self.downloaded: List[str] = []

    async def browse(self, path: str, filter=None, limit=None):
        """Return the scripted listing for *path*.

        Unknown paths yield an empty list. ``filter`` and ``limit`` are
        accepted for signature compatibility but ignored by this fake.
        """
        if path == "/":
            return [
                _ExtEntry(
                    name="Site 1",
                    path="/sites/site-1",
                    isFolder=True,
                    metadata={"id": "site-1"},
                ),
            ]
        if path == "/sites/site-1":
            return [
                _ExtEntry(
                    name="f1.txt", path="/sites/site-1/f1.txt",
                    mimeType="text/plain", size=20,
                    metadata={"id": "f1", "revision": "etag-f1"},
                ),
                _ExtEntry(
                    name="f2.txt", path="/sites/site-1/f2.txt",
                    mimeType="text/plain", size=20,
                    metadata={"id": "f2", "revision": "etag-f2"},
                ),
                _ExtEntry(
                    name="sub", path="/sites/site-1/sub",
                    isFolder=True, metadata={"id": "sub"},
                ),
            ]
        if path == "/sites/site-1/sub":
            return [
                _ExtEntry(
                    name="f3.txt", path="/sites/site-1/sub/f3.txt",
                    mimeType="text/plain", size=20,
                    metadata={"id": "f3", "revision": "etag-f3"},
                ),
            ]
        return []

    async def download(self, path: str) -> bytes:
        """Record *path* and return its UTF-8 encoding as the fake file content."""
        self.downloaded.append(path)
        return path.encode("utf-8")
class _FakeKnowledgeService:
    """Records requestIngestion calls and returns scripted handles.

    A job whose ``sourceId`` is in ``duplicateIds`` is answered with status
    ``"duplicate"``; every other job is answered with ``"indexed"``.
    """

    def __init__(self, duplicateIds=None):
        # Every job passed to requestIngestion(), in call order.
        self.calls: List[SimpleNamespace] = []
        # Defensive copy: any iterable is accepted, and later changes to the
        # caller's collection cannot alter the scripted answers.
        self._duplicateIds = set(duplicateIds) if duplicateIds else set()

    async def requestIngestion(self, job):
        """Record *job* and return a handle built from its source fields."""
        self.calls.append(job)
        status = "duplicate" if job.sourceId in self._duplicateIds else "indexed"
        return SimpleNamespace(
            jobId=f"{job.sourceKind}:{job.sourceId}",
            status=status,
            contentHash="h",
            fileId=job.sourceId,
            index=None,
            error=None,
        )
def _fakeRunExtraction(data, name, mime, options):
    """Produce a single synthetic text part so `_toContentObjects` returns one.

    Bytes-like payloads (``bytes``, ``bytearray``, ``memoryview``) are decoded
    as UTF-8; anything else is converted with ``str``. ``name``, ``mime`` and
    ``options`` are accepted for signature compatibility and ignored.
    """
    if isinstance(data, (bytes, bytearray, memoryview)):
        # bytes() normalises bytearray/memoryview so they decode the same way
        # as plain bytes instead of being rendered as their repr.
        text = bytes(data).decode("utf-8")
    else:
        text = str(data)
    return SimpleNamespace(
        parts=[
            SimpleNamespace(
                id="p1",
                data=text,
                typeGroup="text",
                label="page:1",
                metadata={"pageIndex": 0},
            )
        ]
    )
def test_bootstrap_walks_sites_and_subfolders():
    """Every file under the site, including the sub-folder, is downloaded and ingested."""
    adapter = _FakeSpAdapter()
    knowledge = _FakeKnowledgeService()
    connection = SimpleNamespace(mandateId="m1", userId="u1")

    async def _run():
        return await bootstrapSharepoint(
            connectionId="c1",
            adapter=adapter,
            connection=connection,
            knowledgeService=knowledge,
            runExtractionFn=_fakeRunExtraction,
        )

    result = asyncio.run(_run())

    # One ingestion request per file in the fake layout (f1, f2, f3).
    assert len(knowledge.calls) == 3
    sourceIds = {c.sourceId for c in knowledge.calls}
    assert sourceIds == {
        _syntheticFileId("c1", "f1"),
        _syntheticFileId("c1", "f2"),
        _syntheticFileId("c1", "f3"),
    }
    assert result["indexed"] == 3
    assert result["skippedDuplicate"] == 0
    # Order matters here: both site-level files come before the sub-folder file.
    assert adapter.downloaded == [
        "/sites/site-1/f1.txt",
        "/sites/site-1/f2.txt",
        "/sites/site-1/sub/f3.txt",
    ]
def test_bootstrap_reports_duplicates_on_second_run():
    """When the knowledge service answers "duplicate" for every file, none count as indexed."""
    adapter = _FakeSpAdapter()
    # Script the fake service to treat all three synthetic ids as already known.
    duplicateIds = {
        _syntheticFileId("c1", "f1"),
        _syntheticFileId("c1", "f2"),
        _syntheticFileId("c1", "f3"),
    }
    knowledge = _FakeKnowledgeService(duplicateIds=duplicateIds)
    connection = SimpleNamespace(mandateId="m1", userId="u1")

    async def _run():
        return await bootstrapSharepoint(
            connectionId="c1",
            adapter=adapter,
            connection=connection,
            knowledgeService=knowledge,
            runExtractionFn=_fakeRunExtraction,
        )

    result = asyncio.run(_run())

    assert result["indexed"] == 0
    assert result["skippedDuplicate"] == 3
def test_bootstrap_passes_connection_provenance():
    """Each submitted job carries the connection's mandate, provenance and an eTag version."""
    adapter = _FakeSpAdapter()
    knowledge = _FakeKnowledgeService()
    connection = SimpleNamespace(mandateId="m1", userId="u1")

    async def _run():
        return await bootstrapSharepoint(
            connectionId="c1",
            adapter=adapter,
            connection=connection,
            knowledgeService=knowledge,
            runExtractionFn=_fakeRunExtraction,
        )

    asyncio.run(_run())

    # Guard against a vacuous pass: the loop below checks nothing if no job
    # was submitted. The fake layout contains exactly three files.
    assert len(knowledge.calls) == 3
    for job in knowledge.calls:
        assert job.sourceKind == "sharepoint_item"
        assert job.mandateId == "m1"
        assert job.provenance["connectionId"] == "c1"
        assert job.provenance["authority"] == "msft"
        assert job.provenance["service"] == "sharepoint"
        assert job.contentVersion and job.contentVersion.startswith("etag-")
# Allow running this module directly, without pytest: execute each test in
# turn and print a confirmation once all of them have passed.
if __name__ == "__main__":
    test_bootstrap_walks_sites_and_subfolders()
    test_bootstrap_reports_duplicates_on_second_run()
    test_bootstrap_passes_connection_provenance()
    print("OK — bootstrapSharepoint tests passed")