gateway/modules/serviceCenter/services/serviceKnowledge/subConnectorSyncSharepoint.py

433 lines
15 KiB
Python

# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
"""SharePoint bootstrap for the unified knowledge ingestion lane.
Walks the SharePoint drive(s) reachable via a UserConnection, downloads each
file-like item, runs the standard content extraction pipeline and hands the
result to `KnowledgeService.requestIngestion`. Idempotency is provided by the
ingestion façade itself; repeat bootstraps therefore produce
`ingestion.skipped.duplicate` for every unchanged item because we pass the
Graph `eTag` as `contentVersion`.
"""
from __future__ import annotations
import asyncio
import hashlib
import logging
import time
from dataclasses import dataclass, field
from typing import Any, Callable, Dict, List, Optional
from modules.datamodels.datamodelExtraction import ExtractionOptions
logger = logging.getLogger(__name__)
MAX_ITEMS_DEFAULT = 500
MAX_BYTES_DEFAULT = 200 * 1024 * 1024
MAX_FILE_SIZE_DEFAULT = 25 * 1024 * 1024
SKIP_MIME_PREFIXES_DEFAULT = ("video/", "audio/")
MAX_DEPTH_DEFAULT = 4
MAX_SITES_DEFAULT = 3
@dataclass
class SharepointBootstrapLimits:
maxItems: int = MAX_ITEMS_DEFAULT
maxBytes: int = MAX_BYTES_DEFAULT
maxFileSize: int = MAX_FILE_SIZE_DEFAULT
skipMimePrefixes: tuple = SKIP_MIME_PREFIXES_DEFAULT
maxDepth: int = MAX_DEPTH_DEFAULT
maxSites: int = MAX_SITES_DEFAULT
# Pass-through to IngestionJob.neutralize
neutralize: bool = False
@dataclass
class SharepointBootstrapResult:
connectionId: str
indexed: int = 0
skippedDuplicate: int = 0
skippedPolicy: int = 0
failed: int = 0
bytesProcessed: int = 0
errors: List[str] = field(default_factory=list)
def _syntheticFileId(connectionId: str, externalItemId: str) -> str:
"""Deterministic synthetic FileContentIndex id for a SharePoint item.
Stable across bootstraps → idempotency works; independent of file name so
moves/renames don't duplicate chunks.
"""
token = hashlib.sha256(f"{connectionId}:{externalItemId}".encode("utf-8")).hexdigest()[:16]
return f"sp:{connectionId[:8]}:{token}"
def _toContentObjects(extracted, fileName: str) -> List[Dict[str, Any]]:
"""Translate ExtractionResult → content objects accepted by requestIngestion."""
parts = getattr(extracted, "parts", None) or []
out: List[Dict[str, Any]] = []
for part in parts:
data = getattr(part, "data", None) or ""
if not data or not str(data).strip():
continue
typeGroup = getattr(part, "typeGroup", "text") or "text"
contentType = "text"
if typeGroup == "image":
contentType = "image"
elif typeGroup in ("binary", "container"):
contentType = "other"
out.append({
"contentObjectId": getattr(part, "id", ""),
"contentType": contentType,
"data": data,
"contextRef": {
"containerPath": fileName,
"location": getattr(part, "label", None) or "file",
**(getattr(part, "metadata", None) or {}),
},
})
return out
async def bootstrapSharepoint(
connectionId: str,
*,
progressCb: Optional[Callable[[int, Optional[str]], None]] = None,
adapter: Any = None,
connection: Any = None,
knowledgeService: Any = None,
limits: Optional[SharepointBootstrapLimits] = None,
runExtractionFn: Optional[Callable[..., Any]] = None,
) -> Dict[str, Any]:
"""Enumerate SharePoint drives and ingest every reachable file via the façade.
Parameters allow injection for tests; production callers pass only
`connectionId` (and optionally a progressCb) and everything else is
resolved against the registered services.
"""
from modules.serviceCenter.services.serviceKnowledge.subConnectorPrefs import loadConnectionPrefs
prefs = loadConnectionPrefs(connectionId)
if not limits:
limits = SharepointBootstrapLimits(neutralize=prefs.neutralizeBeforeEmbed)
startMs = time.time()
result = SharepointBootstrapResult(connectionId=connectionId)
logger.info(
"ingestion.connection.bootstrap.started part=sharepoint connectionId=%s",
connectionId,
extra={
"event": "ingestion.connection.bootstrap.started",
"part": "sharepoint",
"connectionId": connectionId,
},
)
if adapter is None or knowledgeService is None or connection is None:
adapter, connection, knowledgeService = await _resolveDependencies(connectionId)
if runExtractionFn is None:
from modules.serviceCenter.services.serviceExtraction.subPipeline import runExtraction
from modules.serviceCenter.services.serviceExtraction.subRegistry import (
ExtractorRegistry, ChunkerRegistry,
)
extractorRegistry = ExtractorRegistry()
chunkerRegistry = ChunkerRegistry()
def runExtractionFn(bytesData, name, mime, options): # type: ignore[no-redef]
return runExtraction(extractorRegistry, chunkerRegistry, bytesData, name, mime, options)
mandateId = str(getattr(connection, "mandateId", "") or "") if connection is not None else ""
userId = str(getattr(connection, "userId", "") or "") if connection is not None else ""
try:
sites = await adapter.browse("/", limit=limits.maxSites)
except Exception as exc:
logger.error("sharepoint site discovery failed for %s: %s", connectionId, exc, exc_info=True)
result.errors.append(f"site_discovery: {exc}")
return _finalizeResult(connectionId, result, startMs)
for site in sites[: limits.maxSites]:
if result.indexed + result.skippedDuplicate >= limits.maxItems:
break
sitePath = getattr(site, "path", "") or ""
try:
await _walkFolder(
adapter=adapter,
knowledgeService=knowledgeService,
runExtractionFn=runExtractionFn,
connectionId=connectionId,
mandateId=mandateId,
userId=userId,
folderPath=sitePath,
depth=0,
limits=limits,
result=result,
progressCb=progressCb,
)
except Exception as exc:
logger.error("sharepoint walk failed for site %s: %s", sitePath, exc, exc_info=True)
result.errors.append(f"walk({sitePath}): {exc}")
return _finalizeResult(connectionId, result, startMs)
async def _resolveDependencies(connectionId: str):
"""Load connection, instantiate SharepointAdapter, and build a KnowledgeService.
Runs with root privileges: bootstrap is a system operation triggered by an
authenticated user via callback; it must not be gated by a per-user
service-center context.
"""
from modules.interfaces.interfaceDbApp import getRootInterface
from modules.auth import TokenManager
from modules.connectors.providerMsft.connectorMsft import MsftConnector
from modules.serviceCenter import getService
from modules.serviceCenter.context import ServiceCenterContext
from modules.security.rootAccess import getRootUser
rootInterface = getRootInterface()
connection = rootInterface.getUserConnectionById(connectionId)
if connection is None:
raise ValueError(f"UserConnection not found: {connectionId}")
token = TokenManager().getFreshToken(connectionId)
if not token or not token.tokenAccess:
raise ValueError(f"No valid token for connection {connectionId}")
provider = MsftConnector(connection, token.tokenAccess)
adapter = provider.getServiceAdapter("sharepoint")
rootUser = getRootUser()
ctx = ServiceCenterContext(
user=rootUser,
mandate_id=str(getattr(connection, "mandateId", "") or ""),
)
knowledgeService = getService("knowledge", ctx)
return adapter, connection, knowledgeService
async def _walkFolder(
*,
adapter,
knowledgeService,
runExtractionFn,
connectionId: str,
mandateId: str,
userId: str,
folderPath: str,
depth: int,
limits: SharepointBootstrapLimits,
result: SharepointBootstrapResult,
progressCb: Optional[Callable[[int, Optional[str]], None]],
) -> None:
if depth > limits.maxDepth:
return
try:
entries = await adapter.browse(folderPath)
except Exception as exc:
logger.warning("sharepoint browse %s failed: %s", folderPath, exc)
result.errors.append(f"browse({folderPath}): {exc}")
return
for entry in entries:
if result.indexed + result.skippedDuplicate >= limits.maxItems:
return
if result.bytesProcessed >= limits.maxBytes:
return
entryPath = getattr(entry, "path", "") or ""
if getattr(entry, "isFolder", False):
await _walkFolder(
adapter=adapter,
knowledgeService=knowledgeService,
runExtractionFn=runExtractionFn,
connectionId=connectionId,
mandateId=mandateId,
userId=userId,
folderPath=entryPath,
depth=depth + 1,
limits=limits,
result=result,
progressCb=progressCb,
)
continue
mimeType = getattr(entry, "mimeType", None) or "application/octet-stream"
if any(mimeType.startswith(prefix) for prefix in limits.skipMimePrefixes):
result.skippedPolicy += 1
continue
size = int(getattr(entry, "size", 0) or 0)
if size and size > limits.maxFileSize:
result.skippedPolicy += 1
continue
metadata = getattr(entry, "metadata", {}) or {}
externalItemId = metadata.get("id") or entryPath
revision = metadata.get("revision") or metadata.get("lastModifiedDateTime")
await _ingestOne(
adapter=adapter,
knowledgeService=knowledgeService,
runExtractionFn=runExtractionFn,
connectionId=connectionId,
mandateId=mandateId,
userId=userId,
entry=entry,
entryPath=entryPath,
mimeType=mimeType,
externalItemId=externalItemId,
revision=revision,
limits=limits,
result=result,
progressCb=progressCb,
)
async def _ingestOne(
*,
adapter,
knowledgeService,
runExtractionFn,
connectionId: str,
mandateId: str,
userId: str,
entry,
entryPath: str,
mimeType: str,
externalItemId: str,
revision: Optional[str],
limits: SharepointBootstrapLimits,
result: SharepointBootstrapResult,
progressCb: Optional[Callable[[int, Optional[str]], None]],
) -> None:
from modules.serviceCenter.services.serviceKnowledge.mainServiceKnowledge import IngestionJob
syntheticFileId = _syntheticFileId(connectionId, externalItemId)
fileName = getattr(entry, "name", "") or externalItemId
try:
fileBytes = await adapter.download(entryPath)
except Exception as exc:
logger.warning("sharepoint download %s failed: %s", entryPath, exc)
result.failed += 1
result.errors.append(f"download({entryPath}): {exc}")
return
if not fileBytes:
result.failed += 1
return
result.bytesProcessed += len(fileBytes)
try:
extracted = runExtractionFn(
fileBytes, fileName, mimeType,
ExtractionOptions(mergeStrategy=None),
)
except Exception as exc:
logger.warning("sharepoint extraction %s failed: %s", entryPath, exc)
result.failed += 1
result.errors.append(f"extract({entryPath}): {exc}")
return
contentObjects = _toContentObjects(extracted, fileName)
if not contentObjects:
result.skippedPolicy += 1
return
provenance: Dict[str, Any] = {
"connectionId": connectionId,
"authority": "msft",
"service": "sharepoint",
"externalItemId": externalItemId,
"externalPath": entryPath,
"revision": revision,
}
try:
handle = await knowledgeService.requestIngestion(
IngestionJob(
sourceKind="sharepoint_item",
sourceId=syntheticFileId,
fileName=fileName,
mimeType=mimeType,
userId=userId,
mandateId=mandateId,
contentObjects=contentObjects,
contentVersion=revision,
neutralize=limits.neutralize,
provenance=provenance,
)
)
except Exception as exc:
logger.error("sharepoint ingestion %s failed: %s", entryPath, exc, exc_info=True)
result.failed += 1
result.errors.append(f"ingest({entryPath}): {exc}")
return
if handle.status == "duplicate":
result.skippedDuplicate += 1
elif handle.status == "indexed":
result.indexed += 1
else:
result.failed += 1
if handle.error:
result.errors.append(f"ingest({entryPath}): {handle.error}")
if progressCb is not None and (result.indexed + result.skippedDuplicate) % 50 == 0:
processed = result.indexed + result.skippedDuplicate
try:
progressCb(
min(90, 10 + int(80 * processed / max(1, limits.maxItems))),
f"sharepoint processed={processed}",
)
except Exception:
pass
logger.info(
"ingestion.connection.bootstrap.progress part=sharepoint processed=%d skippedDup=%d failed=%d",
processed, result.skippedDuplicate, result.failed,
extra={
"event": "ingestion.connection.bootstrap.progress",
"part": "sharepoint",
"connectionId": connectionId,
"processed": processed,
"skippedDup": result.skippedDuplicate,
"failed": result.failed,
},
)
# Yield so the event loop can interleave other tasks (download/extract are
# CPU-ish and extraction uses sync libs; cooperative scheduling prevents
# starving other workers).
await asyncio.sleep(0)
def _finalizeResult(connectionId: str, result: SharepointBootstrapResult, startMs: float) -> Dict[str, Any]:
durationMs = int((time.time() - startMs) * 1000)
logger.info(
"ingestion.connection.bootstrap.done part=sharepoint connectionId=%s indexed=%d skippedDup=%d skippedPolicy=%d failed=%d durationMs=%d",
connectionId,
result.indexed, result.skippedDuplicate, result.skippedPolicy, result.failed,
durationMs,
extra={
"event": "ingestion.connection.bootstrap.done",
"part": "sharepoint",
"connectionId": connectionId,
"indexed": result.indexed,
"skippedDup": result.skippedDuplicate,
"skippedPolicy": result.skippedPolicy,
"failed": result.failed,
"durationMs": durationMs,
},
)
return {
"connectionId": result.connectionId,
"indexed": result.indexed,
"skippedDuplicate": result.skippedDuplicate,
"skippedPolicy": result.skippedPolicy,
"failed": result.failed,
"bytesProcessed": result.bytesProcessed,
"durationMs": durationMs,
"errors": result.errors[:20],
}