#!/usr/bin/env python3
# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
"""Bootstrap SharePoint tests with a fake adapter + knowledge service.

Verifies:
- Every discovered file triggers `requestIngestion`.
- Duplicate runs (same eTag revisions) report `skippedDuplicate`.
- Synthetic fileIds are stable across runs so idempotency works end-to-end.
"""

import asyncio
import os
import sys
from dataclasses import dataclass
from types import SimpleNamespace
from typing import Any, Dict, List, Optional

# Must run before the project import below so `modules` resolves when this
# file is executed directly as a script.
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "../../.."))

from modules.serviceCenter.services.serviceKnowledge.subConnectorSyncSharepoint import (
    bootstrapSharepoint,
    _syntheticFileId,
)


@dataclass
class _ExtEntry:
    """Directory entry returned by `_FakeSpAdapter.browse`."""

    name: str
    path: str
    isFolder: bool = False
    size: Optional[int] = None
    mimeType: Optional[str] = None
    # Defaults to None, so the annotation is Optional like `size`/`mimeType`.
    metadata: Optional[Dict[str, Any]] = None


class _FakeSpAdapter:
    """Minimal SharepointAdapter stand-in.

    Layout:
        "/"                 → 1 site
        "/sites/site-1"     → 2 files (f1, f2) + 1 folder (sub)
        "/sites/site-1/sub" → 1 file (f3)
    """

    def __init__(self):
        # Paths passed to `download`, in call order.
        self.downloaded: List[str] = []

    async def browse(self, path: str, filter=None, limit=None):
        """Return the scripted entries for `path`; unknown paths yield `[]`.

        `filter` and `limit` are accepted but ignored by this fake.
        """
        if path == "/":
            return [
                _ExtEntry(
                    name="Site 1",
                    path="/sites/site-1",
                    isFolder=True,
                    metadata={"id": "site-1"},
                ),
            ]
        if path == "/sites/site-1":
            return [
                _ExtEntry(
                    name="f1.txt",
                    path="/sites/site-1/f1.txt",
                    mimeType="text/plain",
                    size=20,
                    metadata={"id": "f1", "revision": "etag-f1"},
                ),
                _ExtEntry(
                    name="f2.txt",
                    path="/sites/site-1/f2.txt",
                    mimeType="text/plain",
                    size=20,
                    metadata={"id": "f2", "revision": "etag-f2"},
                ),
                _ExtEntry(
                    name="sub",
                    path="/sites/site-1/sub",
                    isFolder=True,
                    metadata={"id": "sub"},
                ),
            ]
        if path == "/sites/site-1/sub":
            return [
                _ExtEntry(
                    name="f3.txt",
                    path="/sites/site-1/sub/f3.txt",
                    mimeType="text/plain",
                    size=20,
                    metadata={"id": "f3", "revision": "etag-f3"},
                ),
            ]
        return []

    async def download(self, path: str) -> bytes:
        """Record `path` and return it UTF-8 encoded as the file content."""
        self.downloaded.append(path)
        return path.encode("utf-8")


class _FakeKnowledgeService:
    """Records requestIngestion calls and returns the scripted handles."""

    def __init__(self, duplicateIds=None):
        # Every job passed to `requestIngestion`, in call order.
        self.calls: List[SimpleNamespace] = []
        # sourceIds that should be answered with status "duplicate".
        self._duplicateIds = duplicateIds or set()

    async def requestIngestion(self, job):
        """Record `job` and return a handle with status "duplicate" or "indexed"."""
        self.calls.append(job)
        status = "duplicate" if job.sourceId in self._duplicateIds else "indexed"
        return SimpleNamespace(
            jobId=f"{job.sourceKind}:{job.sourceId}",
            status=status,
            contentHash="h",
            fileId=job.sourceId,
            index=None,
            error=None,
        )


def _fakeRunExtraction(data, name, mime, options):
    """Produce a single synthetic text part so `_toContentObjects` returns one."""
    return SimpleNamespace(
        parts=[
            SimpleNamespace(
                id="p1",
                data=data.decode("utf-8") if isinstance(data, bytes) else str(data),
                typeGroup="text",
                label="page:1",
                metadata={"pageIndex": 0},
            )
        ]
    )


def _runBootstrap(adapter, knowledge):
    """Run `bootstrapSharepoint` once for connection "c1" and return its result.

    Shared driver for the tests below: uses mandate "m1" / user "u1" and the
    fake extraction function.
    """
    connection = SimpleNamespace(mandateId="m1", userId="u1")

    async def _run():
        return await bootstrapSharepoint(
            connectionId="c1",
            adapter=adapter,
            connection=connection,
            knowledgeService=knowledge,
            runExtractionFn=_fakeRunExtraction,
        )

    return asyncio.run(_run())


def test_bootstrap_walks_sites_and_subfolders():
    """All three files, including the one in the subfolder, are ingested."""
    adapter = _FakeSpAdapter()
    knowledge = _FakeKnowledgeService()

    result = _runBootstrap(adapter, knowledge)

    assert len(knowledge.calls) == 3
    sourceIds = {c.sourceId for c in knowledge.calls}
    assert sourceIds == {
        _syntheticFileId("c1", "f1"),
        _syntheticFileId("c1", "f2"),
        _syntheticFileId("c1", "f3"),
    }
    assert result["indexed"] == 3
    assert result["skippedDuplicate"] == 0
    assert adapter.downloaded == [
        "/sites/site-1/f1.txt",
        "/sites/site-1/f2.txt",
        "/sites/site-1/sub/f3.txt",
    ]


def test_bootstrap_reports_duplicates_on_second_run():
    """Files the knowledge service flags as duplicates count as skipped."""
    adapter = _FakeSpAdapter()
    # Pre-seed the fake so every synthetic id is answered with "duplicate".
    duplicateIds = {
        _syntheticFileId("c1", "f1"),
        _syntheticFileId("c1", "f2"),
        _syntheticFileId("c1", "f3"),
    }
    knowledge = _FakeKnowledgeService(duplicateIds=duplicateIds)

    result = _runBootstrap(adapter, knowledge)

    assert result["indexed"] == 0
    assert result["skippedDuplicate"] == 3


def test_bootstrap_passes_connection_provenance():
    """Every ingestion job carries the connection's mandate and provenance."""
    adapter = _FakeSpAdapter()
    knowledge = _FakeKnowledgeService()

    _runBootstrap(adapter, knowledge)

    for job in knowledge.calls:
        assert job.sourceKind == "sharepoint_item"
        assert job.mandateId == "m1"
        assert job.provenance["connectionId"] == "c1"
        assert job.provenance["authority"] == "msft"
        assert job.provenance["service"] == "sharepoint"
        # Every fake file's `revision` metadata starts with "etag-".
        assert job.contentVersion and job.contentVersion.startswith("etag-")


if __name__ == "__main__":
    test_bootstrap_walks_sites_and_subfolders()
    test_bootstrap_reports_duplicates_on_second_run()
    test_bootstrap_passes_connection_provenance()
    print("OK — bootstrapSharepoint tests passed")