# Copyright (c) 2025 Patrick Motsch # All rights reserved. """SharePoint bootstrap for the unified knowledge ingestion lane. Walks the SharePoint drive(s) reachable via a UserConnection, downloads each file-like item, runs the standard content extraction pipeline and hands the result to `KnowledgeService.requestIngestion`. Idempotency is provided by the ingestion façade itself; repeat bootstraps therefore produce `ingestion.skipped.duplicate` for every unchanged item because we pass the Graph `eTag` as `contentVersion`. """ from __future__ import annotations import asyncio import hashlib import logging import time from dataclasses import dataclass, field from typing import Any, Callable, Dict, List, Optional from modules.datamodels.datamodelExtraction import ExtractionOptions logger = logging.getLogger(__name__) MAX_ITEMS_DEFAULT = 500 MAX_BYTES_DEFAULT = 200 * 1024 * 1024 MAX_FILE_SIZE_DEFAULT = 25 * 1024 * 1024 SKIP_MIME_PREFIXES_DEFAULT = ("video/", "audio/") MAX_DEPTH_DEFAULT = 4 MAX_SITES_DEFAULT = 3 @dataclass class SharepointBootstrapLimits: maxItems: int = MAX_ITEMS_DEFAULT maxBytes: int = MAX_BYTES_DEFAULT maxFileSize: int = MAX_FILE_SIZE_DEFAULT skipMimePrefixes: tuple = SKIP_MIME_PREFIXES_DEFAULT maxDepth: int = MAX_DEPTH_DEFAULT maxSites: int = MAX_SITES_DEFAULT # Pass-through to IngestionJob.neutralize neutralize: bool = False @dataclass class SharepointBootstrapResult: connectionId: str indexed: int = 0 skippedDuplicate: int = 0 skippedPolicy: int = 0 failed: int = 0 bytesProcessed: int = 0 errors: List[str] = field(default_factory=list) def _syntheticFileId(connectionId: str, externalItemId: str) -> str: """Deterministic synthetic FileContentIndex id for a SharePoint item. Stable across bootstraps → idempotency works; independent of file name so moves/renames don't duplicate chunks. """ token = hashlib.sha256(f"{connectionId}:{externalItemId}".encode("utf-8")).hexdigest()[:16] return f"sp:{connectionId[:8]}:{token}" def _toContentObjects(extracted, fileName: str) -> List[Dict[str, Any]]: """Translate ExtractionResult → content objects accepted by requestIngestion.""" parts = getattr(extracted, "parts", None) or [] out: List[Dict[str, Any]] = [] for part in parts: data = getattr(part, "data", None) or "" if not data or not str(data).strip(): continue typeGroup = getattr(part, "typeGroup", "text") or "text" contentType = "text" if typeGroup == "image": contentType = "image" elif typeGroup in ("binary", "container"): contentType = "other" out.append({ "contentObjectId": getattr(part, "id", ""), "contentType": contentType, "data": data, "contextRef": { "containerPath": fileName, "location": getattr(part, "label", None) or "file", **(getattr(part, "metadata", None) or {}), }, }) return out async def bootstrapSharepoint( connectionId: str, *, progressCb: Optional[Callable[[int, Optional[str]], None]] = None, adapter: Any = None, connection: Any = None, knowledgeService: Any = None, limits: Optional[SharepointBootstrapLimits] = None, runExtractionFn: Optional[Callable[..., Any]] = None, ) -> Dict[str, Any]: """Enumerate SharePoint drives and ingest every reachable file via the façade. Parameters allow injection for tests; production callers pass only `connectionId` (and optionally a progressCb) and everything else is resolved against the registered services. """ from modules.serviceCenter.services.serviceKnowledge.subConnectorPrefs import loadConnectionPrefs prefs = loadConnectionPrefs(connectionId) if not limits: limits = SharepointBootstrapLimits(neutralize=prefs.neutralizeBeforeEmbed) startMs = time.time() result = SharepointBootstrapResult(connectionId=connectionId) logger.info( "ingestion.connection.bootstrap.started part=sharepoint connectionId=%s", connectionId, extra={ "event": "ingestion.connection.bootstrap.started", "part": "sharepoint", "connectionId": connectionId, }, ) if adapter is None or knowledgeService is None or connection is None: adapter, connection, knowledgeService = await _resolveDependencies(connectionId) if runExtractionFn is None: from modules.serviceCenter.services.serviceExtraction.subPipeline import runExtraction from modules.serviceCenter.services.serviceExtraction.subRegistry import ( ExtractorRegistry, ChunkerRegistry, ) extractorRegistry = ExtractorRegistry() chunkerRegistry = ChunkerRegistry() def runExtractionFn(bytesData, name, mime, options): # type: ignore[no-redef] return runExtraction(extractorRegistry, chunkerRegistry, bytesData, name, mime, options) mandateId = str(getattr(connection, "mandateId", "") or "") if connection is not None else "" userId = str(getattr(connection, "userId", "") or "") if connection is not None else "" try: sites = await adapter.browse("/", limit=limits.maxSites) except Exception as exc: logger.error("sharepoint site discovery failed for %s: %s", connectionId, exc, exc_info=True) result.errors.append(f"site_discovery: {exc}") return _finalizeResult(connectionId, result, startMs) for site in sites[: limits.maxSites]: if result.indexed + result.skippedDuplicate >= limits.maxItems: break sitePath = getattr(site, "path", "") or "" try: await _walkFolder( adapter=adapter, knowledgeService=knowledgeService, runExtractionFn=runExtractionFn, connectionId=connectionId, mandateId=mandateId, userId=userId, folderPath=sitePath, depth=0, limits=limits, result=result, progressCb=progressCb, ) except Exception as exc: logger.error("sharepoint walk failed for site %s: %s", sitePath, exc, exc_info=True) result.errors.append(f"walk({sitePath}): {exc}") return _finalizeResult(connectionId, result, startMs) async def _resolveDependencies(connectionId: str): """Load connection, instantiate SharepointAdapter, and build a KnowledgeService. Runs with root privileges: bootstrap is a system operation triggered by an authenticated user via callback; it must not be gated by a per-user service-center context. """ from modules.interfaces.interfaceDbApp import getRootInterface from modules.auth import TokenManager from modules.connectors.providerMsft.connectorMsft import MsftConnector from modules.serviceCenter import getService from modules.serviceCenter.context import ServiceCenterContext from modules.security.rootAccess import getRootUser rootInterface = getRootInterface() connection = rootInterface.getUserConnectionById(connectionId) if connection is None: raise ValueError(f"UserConnection not found: {connectionId}") token = TokenManager().getFreshToken(connectionId) if not token or not token.tokenAccess: raise ValueError(f"No valid token for connection {connectionId}") provider = MsftConnector(connection, token.tokenAccess) adapter = provider.getServiceAdapter("sharepoint") rootUser = getRootUser() ctx = ServiceCenterContext( user=rootUser, mandate_id=str(getattr(connection, "mandateId", "") or ""), ) knowledgeService = getService("knowledge", ctx) return adapter, connection, knowledgeService async def _walkFolder( *, adapter, knowledgeService, runExtractionFn, connectionId: str, mandateId: str, userId: str, folderPath: str, depth: int, limits: SharepointBootstrapLimits, result: SharepointBootstrapResult, progressCb: Optional[Callable[[int, Optional[str]], None]], ) -> None: if depth > limits.maxDepth: return try: entries = await adapter.browse(folderPath) except Exception as exc: logger.warning("sharepoint browse %s failed: %s", folderPath, exc) result.errors.append(f"browse({folderPath}): {exc}") return for entry in entries: if result.indexed + result.skippedDuplicate >= limits.maxItems: return if result.bytesProcessed >= limits.maxBytes: return entryPath = getattr(entry, "path", "") or "" if getattr(entry, "isFolder", False): await _walkFolder( adapter=adapter, knowledgeService=knowledgeService, runExtractionFn=runExtractionFn, connectionId=connectionId, mandateId=mandateId, userId=userId, folderPath=entryPath, depth=depth + 1, limits=limits, result=result, progressCb=progressCb, ) continue mimeType = getattr(entry, "mimeType", None) or "application/octet-stream" if any(mimeType.startswith(prefix) for prefix in limits.skipMimePrefixes): result.skippedPolicy += 1 continue size = int(getattr(entry, "size", 0) or 0) if size and size > limits.maxFileSize: result.skippedPolicy += 1 continue metadata = getattr(entry, "metadata", {}) or {} externalItemId = metadata.get("id") or entryPath revision = metadata.get("revision") or metadata.get("lastModifiedDateTime") await _ingestOne( adapter=adapter, knowledgeService=knowledgeService, runExtractionFn=runExtractionFn, connectionId=connectionId, mandateId=mandateId, userId=userId, entry=entry, entryPath=entryPath, mimeType=mimeType, externalItemId=externalItemId, revision=revision, limits=limits, result=result, progressCb=progressCb, ) async def _ingestOne( *, adapter, knowledgeService, runExtractionFn, connectionId: str, mandateId: str, userId: str, entry, entryPath: str, mimeType: str, externalItemId: str, revision: Optional[str], limits: SharepointBootstrapLimits, result: SharepointBootstrapResult, progressCb: Optional[Callable[[int, Optional[str]], None]], ) -> None: from modules.serviceCenter.services.serviceKnowledge.mainServiceKnowledge import IngestionJob syntheticFileId = _syntheticFileId(connectionId, externalItemId) fileName = getattr(entry, "name", "") or externalItemId try: fileBytes = await adapter.download(entryPath) except Exception as exc: logger.warning("sharepoint download %s failed: %s", entryPath, exc) result.failed += 1 result.errors.append(f"download({entryPath}): {exc}") return if not fileBytes: result.failed += 1 return result.bytesProcessed += len(fileBytes) try: extracted = runExtractionFn( fileBytes, fileName, mimeType, ExtractionOptions(mergeStrategy=None), ) except Exception as exc: logger.warning("sharepoint extraction %s failed: %s", entryPath, exc) result.failed += 1 result.errors.append(f"extract({entryPath}): {exc}") return contentObjects = _toContentObjects(extracted, fileName) if not contentObjects: result.skippedPolicy += 1 return provenance: Dict[str, Any] = { "connectionId": connectionId, "authority": "msft", "service": "sharepoint", "externalItemId": externalItemId, "externalPath": entryPath, "revision": revision, } try: handle = await knowledgeService.requestIngestion( IngestionJob( sourceKind="sharepoint_item", sourceId=syntheticFileId, fileName=fileName, mimeType=mimeType, userId=userId, mandateId=mandateId, contentObjects=contentObjects, contentVersion=revision, neutralize=limits.neutralize, provenance=provenance, ) ) except Exception as exc: logger.error("sharepoint ingestion %s failed: %s", entryPath, exc, exc_info=True) result.failed += 1 result.errors.append(f"ingest({entryPath}): {exc}") return if handle.status == "duplicate": result.skippedDuplicate += 1 elif handle.status == "indexed": result.indexed += 1 else: result.failed += 1 if handle.error: result.errors.append(f"ingest({entryPath}): {handle.error}") if progressCb is not None and (result.indexed + result.skippedDuplicate) % 50 == 0: processed = result.indexed + result.skippedDuplicate try: progressCb( min(90, 10 + int(80 * processed / max(1, limits.maxItems))), f"sharepoint processed={processed}", ) except Exception: pass logger.info( "ingestion.connection.bootstrap.progress part=sharepoint processed=%d skippedDup=%d failed=%d", processed, result.skippedDuplicate, result.failed, extra={ "event": "ingestion.connection.bootstrap.progress", "part": "sharepoint", "connectionId": connectionId, "processed": processed, "skippedDup": result.skippedDuplicate, "failed": result.failed, }, ) # Yield so the event loop can interleave other tasks (download/extract are # CPU-ish and extraction uses sync libs; cooperative scheduling prevents # starving other workers). await asyncio.sleep(0) def _finalizeResult(connectionId: str, result: SharepointBootstrapResult, startMs: float) -> Dict[str, Any]: durationMs = int((time.time() - startMs) * 1000) logger.info( "ingestion.connection.bootstrap.done part=sharepoint connectionId=%s indexed=%d skippedDup=%d skippedPolicy=%d failed=%d durationMs=%d", connectionId, result.indexed, result.skippedDuplicate, result.skippedPolicy, result.failed, durationMs, extra={ "event": "ingestion.connection.bootstrap.done", "part": "sharepoint", "connectionId": connectionId, "indexed": result.indexed, "skippedDup": result.skippedDuplicate, "skippedPolicy": result.skippedPolicy, "failed": result.failed, "durationMs": durationMs, }, ) return { "connectionId": result.connectionId, "indexed": result.indexed, "skippedDuplicate": result.skippedDuplicate, "skippedPolicy": result.skippedPolicy, "failed": result.failed, "bytesProcessed": result.bytesProcessed, "durationMs": durationMs, "errors": result.errors[:20], }