- connection.established/revoked callbacks from OAuth routes and connection management endpoints - KnowledgeIngestionConsumer dispatches bootstrap job (established) and synchronous purge (revoked) - FileContentIndex: add connectionId + sourceKind columns - SharePoint bootstrap with @odata.nextLink pagination and eTag-based idempotency - Outlook bootstrap treats messages as virtual documents with cleanEmailBody for HTML/quote/signature stripping - fix(rag): lower buildAgentContext minScore thresholds from 0.55/0.65/0.70 to 0.35 — previous values blocked all real matches from text-embedding-3-small - 24 new unit tests covering purge, consumer dispatch, email cleaning and both bootstrap paths
209 lines
6.3 KiB
Python
#!/usr/bin/env python3
|
|
# Copyright (c) 2025 Patrick Motsch
|
|
# All rights reserved.
|
|
"""Bootstrap SharePoint tests with a fake adapter + knowledge service.
|
|
|
|
Verifies:
|
|
- Every discovered file triggers `requestIngestion`.
|
|
- Duplicate runs (same eTag revisions) report `skippedDuplicate`.
|
|
- Synthetic fileIds are stable across runs so idempotency works end-to-end.
|
|
"""
|
|
|
|
import asyncio
|
|
import os
|
|
import sys
|
|
from dataclasses import dataclass
|
|
from types import SimpleNamespace
|
|
from typing import Any, Dict, List, Optional
|
|
|
|
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "../../.."))
|
|
|
|
from modules.serviceCenter.services.serviceKnowledge.subConnectorSyncSharepoint import (
|
|
bootstrapSharepoint,
|
|
_syntheticFileId,
|
|
)
|
|
|
|
|
|
@dataclass
|
|
class _ExtEntry:
|
|
name: str
|
|
path: str
|
|
isFolder: bool = False
|
|
size: Optional[int] = None
|
|
mimeType: Optional[str] = None
|
|
metadata: Dict[str, Any] = None
|
|
|
|
|
|
class _FakeSpAdapter:
    """Minimal SharepointAdapter stand-in.

    Layout:
        "/"                 → 1 site
        "/sites/site-1"     → 2 files (f1, f2) + 1 folder (sub)
        "/sites/site-1/sub" → 1 file (f3)
    """

    def __init__(self):
        # Records every path handed to `download`, in call order.
        self.downloaded: List[str] = []

    async def browse(self, path: str, filter=None, limit=None):
        # Table-driven listing: unknown paths fall through to an empty list.
        listings = {
            "/": [
                _ExtEntry(
                    name="Site 1",
                    path="/sites/site-1",
                    isFolder=True,
                    metadata={"id": "site-1"},
                ),
            ],
            "/sites/site-1": [
                _ExtEntry(
                    name="f1.txt", path="/sites/site-1/f1.txt",
                    mimeType="text/plain", size=20,
                    metadata={"id": "f1", "revision": "etag-f1"},
                ),
                _ExtEntry(
                    name="f2.txt", path="/sites/site-1/f2.txt",
                    mimeType="text/plain", size=20,
                    metadata={"id": "f2", "revision": "etag-f2"},
                ),
                _ExtEntry(
                    name="sub", path="/sites/site-1/sub",
                    isFolder=True, metadata={"id": "sub"},
                ),
            ],
            "/sites/site-1/sub": [
                _ExtEntry(
                    name="f3.txt", path="/sites/site-1/sub/f3.txt",
                    mimeType="text/plain", size=20,
                    metadata={"id": "f3", "revision": "etag-f3"},
                ),
            ],
        }
        return listings.get(path, [])

    async def download(self, path: str) -> bytes:
        # Echo the path back as the file body so each download is unique
        # and verifiable by the tests.
        self.downloaded.append(path)
        return path.encode("utf-8")
|
|
|
|
|
|
class _FakeKnowledgeService:
|
|
"""Records requestIngestion calls and returns the scripted handles."""
|
|
|
|
def __init__(self, duplicateIds=None):
|
|
self.calls: List[SimpleNamespace] = []
|
|
self._duplicateIds = duplicateIds or set()
|
|
|
|
async def requestIngestion(self, job):
|
|
self.calls.append(job)
|
|
status = "duplicate" if job.sourceId in self._duplicateIds else "indexed"
|
|
return SimpleNamespace(
|
|
jobId=f"{job.sourceKind}:{job.sourceId}",
|
|
status=status,
|
|
contentHash="h",
|
|
fileId=job.sourceId,
|
|
index=None,
|
|
error=None,
|
|
)
|
|
|
|
|
|
def _fakeRunExtraction(data, name, mime, options):
|
|
"""Produce a single synthetic text part so `_toContentObjects` returns one."""
|
|
return SimpleNamespace(
|
|
parts=[
|
|
SimpleNamespace(
|
|
id="p1",
|
|
data=data.decode("utf-8") if isinstance(data, bytes) else str(data),
|
|
typeGroup="text",
|
|
label="page:1",
|
|
metadata={"pageIndex": 0},
|
|
)
|
|
]
|
|
)
|
|
|
|
|
|
def test_bootstrap_walks_sites_and_subfolders():
    """Full crawl of the fake layout indexes every file exactly once."""
    adapter = _FakeSpAdapter()
    knowledge = _FakeKnowledgeService()
    connection = SimpleNamespace(mandateId="m1", userId="u1")

    # bootstrapSharepoint is a coroutine function — run it to completion.
    result = asyncio.run(
        bootstrapSharepoint(
            connectionId="c1",
            adapter=adapter,
            connection=connection,
            knowledgeService=knowledge,
            runExtractionFn=_fakeRunExtraction,
        )
    )

    assert len(knowledge.calls) == 3
    # Synthetic ids must be derived from (connectionId, itemId) for all files.
    expected = {_syntheticFileId("c1", itemId) for itemId in ("f1", "f2", "f3")}
    assert {call.sourceId for call in knowledge.calls} == expected
    assert result["indexed"] == 3
    assert result["skippedDuplicate"] == 0
    # Downloads happen in traversal order: site root first, then the subfolder.
    assert adapter.downloaded == [
        "/sites/site-1/f1.txt",
        "/sites/site-1/f2.txt",
        "/sites/site-1/sub/f3.txt",
    ]
|
|
|
|
|
|
def test_bootstrap_reports_duplicates_on_second_run():
    """When every synthetic id is already known, nothing is re-indexed."""
    adapter = _FakeSpAdapter()
    # Pre-mark all three files as duplicates, simulating a prior run with
    # identical eTag revisions.
    duplicateIds = {_syntheticFileId("c1", itemId) for itemId in ("f1", "f2", "f3")}
    knowledge = _FakeKnowledgeService(duplicateIds=duplicateIds)
    connection = SimpleNamespace(mandateId="m1", userId="u1")

    result = asyncio.run(
        bootstrapSharepoint(
            connectionId="c1",
            adapter=adapter,
            connection=connection,
            knowledgeService=knowledge,
            runExtractionFn=_fakeRunExtraction,
        )
    )

    assert result["indexed"] == 0
    assert result["skippedDuplicate"] == 3
|
|
|
|
|
|
def test_bootstrap_passes_connection_provenance():
    """Every ingestion job carries the connection's provenance fields."""
    adapter = _FakeSpAdapter()
    knowledge = _FakeKnowledgeService()
    connection = SimpleNamespace(mandateId="m1", userId="u1")

    asyncio.run(
        bootstrapSharepoint(
            connectionId="c1",
            adapter=adapter,
            connection=connection,
            knowledgeService=knowledge,
            runExtractionFn=_fakeRunExtraction,
        )
    )

    for job in knowledge.calls:
        assert job.sourceKind == "sharepoint_item"
        assert job.mandateId == "m1"
        assert job.provenance["connectionId"] == "c1"
        assert job.provenance["authority"] == "msft"
        assert job.provenance["service"] == "sharepoint"
        # contentVersion is the item's eTag revision from the fake adapter.
        assert job.contentVersion and job.contentVersion.startswith("etag-")
|
|
|
|
|
|
if __name__ == "__main__":
    # Allow running this module directly, without a test runner.
    for _test in (
        test_bootstrap_walks_sites_and_subfolders,
        test_bootstrap_reports_duplicates_on_second_run,
        test_bootstrap_passes_connection_provenance,
    ):
        _test()
    print("OK — bootstrapSharepoint tests passed")
|