# gateway/modules/serviceCenter/services/serviceKnowledge/subConnectorSyncSharepoint.py

# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
"""SharePoint bootstrap for the unified knowledge ingestion lane.

Walks the SharePoint drive(s) reachable via a UserConnection, downloads each
file-like item, runs the standard content extraction pipeline and hands the
result to `KnowledgeService.requestIngestion`. Idempotency is provided by the
ingestion façade itself; repeat bootstraps therefore produce
`ingestion.skipped.duplicate` for every unchanged item because we pass the
Graph `eTag` as `contentVersion`.
"""

from __future__ import annotations

import asyncio
import hashlib
import logging
import time
from dataclasses import dataclass, field
from typing import Any, Callable, Dict, List, Optional

from modules.datamodels.datamodelExtraction import ExtractionOptions

# Module-level logger; all bootstrap events in this file are emitted through it.
logger = logging.getLogger(__name__)

# Defaults for SharepointBootstrapLimits. Each constant is the default value of
# the same-named field on that dataclass.

# Default cap on processed items (indexed + duplicate-skipped) per bootstrap.
MAX_ITEMS_DEFAULT = 500
# Default cap on the total number of downloaded bytes per bootstrap (200 MiB).
MAX_BYTES_DEFAULT = 200 * 1024 * 1024
# Default cap on the reported size of a single file (25 MiB).
MAX_FILE_SIZE_DEFAULT = 25 * 1024 * 1024
# MIME type prefixes that are skipped by default without downloading.
SKIP_MIME_PREFIXES_DEFAULT = ("video/", "audio/")
# Default maximum folder depth for the recursive walk (site root is depth 0).
MAX_DEPTH_DEFAULT = 4
# Default maximum number of sites requested from the adapter and walked.
MAX_SITES_DEFAULT = 3


@dataclass
class SharepointBootstrapLimits:
    """Budget and policy knobs for one SharePoint bootstrap run.

    All fields default to the module-level ``*_DEFAULT`` constants, except
    ``neutralize`` which defaults to ``False``.
    """

    # Walk stops once indexed + skippedDuplicate reaches this count.
    maxItems: int = MAX_ITEMS_DEFAULT
    # Walk stops once the sum of downloaded bytes reaches this value.
    maxBytes: int = MAX_BYTES_DEFAULT
    # Entries whose reported size is non-zero and larger than this are counted
    # as skippedPolicy and not downloaded.
    maxFileSize: int = MAX_FILE_SIZE_DEFAULT
    # Entries whose MIME type starts with any of these prefixes are counted as
    # skippedPolicy and not downloaded.
    skipMimePrefixes: tuple = SKIP_MIME_PREFIXES_DEFAULT
    # Folders deeper than this are not browsed (the site root is depth 0).
    maxDepth: int = MAX_DEPTH_DEFAULT
    # Passed as `limit` to the root browse and used to cap the site list.
    maxSites: int = MAX_SITES_DEFAULT
    # Pass-through to IngestionJob.neutralize
    neutralize: bool = False


@dataclass
class SharepointBootstrapResult:
    """Mutable counters accumulated while walking and ingesting one connection.

    A single instance is shared by the whole walk and updated in place;
    `_finalizeResult` turns it into the dict returned to the caller.
    """

    # Id of the UserConnection this bootstrap run belongs to.
    connectionId: str
    # Items for which the ingestion handle reported status "indexed".
    indexed: int = 0
    # Items for which the ingestion handle reported status "duplicate".
    skippedDuplicate: int = 0
    # Items skipped by MIME/size policy or because extraction produced no
    # content objects.
    skippedPolicy: int = 0
    # Items whose download, extraction or ingestion failed (including empty
    # downloads and any other ingestion status).
    failed: int = 0
    # Sum of the byte lengths of all successfully downloaded files.
    bytesProcessed: int = 0
    # Human-readable error messages; only the first 20 are returned by
    # `_finalizeResult`.
    errors: List[str] = field(default_factory=list)


def _syntheticFileId(connectionId: str, externalItemId: str) -> str:
    """Build the stable synthetic FileContentIndex id of a SharePoint item.

    The id depends only on the connection id and the external item id, so the
    same item maps to the same id on every bootstrap and the file name plays
    no part in it (a rename or move does not produce a second id).

    Format: ``sp:<first 8 chars of connectionId>:<first 16 hex chars of
    sha256("<connectionId>:<externalItemId>")>``.
    """
    seed = "{}:{}".format(connectionId, externalItemId)
    fullDigest = hashlib.sha256(seed.encode("utf-8")).hexdigest()
    shortDigest = fullDigest[:16]
    connectionPrefix = connectionId[:8]
    return "sp:{}:{}".format(connectionPrefix, shortDigest)


def _toContentObjects(extracted, fileName: str) -> List[Dict[str, Any]]:
    """Map an extraction result onto the content-object dicts for requestIngestion.

    Reads ``extracted.parts``; a missing or empty ``parts`` yields ``[]``.
    Parts whose ``data`` is falsy, or blank once stringified, are dropped.
    ``typeGroup`` is mapped to ``contentType``: ``"image"`` stays ``"image"``,
    ``"binary"``/``"container"`` become ``"other"``, everything else
    (including a missing value) becomes ``"text"``.

    Each ``contextRef`` starts with ``containerPath`` (the file name) and
    ``location`` (the part label, or ``"file"``); the part's ``metadata`` is
    merged in last, so its keys win on collision.
    """
    contentObjects: List[Dict[str, Any]] = []

    for part in getattr(extracted, "parts", None) or []:
        payload = getattr(part, "data", None) or ""
        if not (payload and str(payload).strip()):
            # Nothing ingestible in this part.
            continue

        group = getattr(part, "typeGroup", "text") or "text"
        if group == "image":
            kind = "image"
        else:
            kind = "other" if group in ("binary", "container") else "text"

        objectId = getattr(part, "id", "")
        location = getattr(part, "label", None) or "file"
        extraContext = getattr(part, "metadata", None) or {}

        contentObjects.append(
            {
                "contentObjectId": objectId,
                "contentType": kind,
                "data": payload,
                "contextRef": {
                    "containerPath": fileName,
                    "location": location,
                    **extraContext,
                },
            }
        )

    return contentObjects


async def bootstrapSharepoint(
    connectionId: str,
    *,
    progressCb: Optional[Callable[[int, Optional[str]], None]] = None,
    adapter: Any = None,
    connection: Any = None,
    knowledgeService: Any = None,
    limits: Optional[SharepointBootstrapLimits] = None,
    runExtractionFn: Optional[Callable[..., Any]] = None,
) -> Dict[str, Any]:
    """Enumerate SharePoint sites and ingest every reachable file via the façade.

    Production callers pass only `connectionId` (and optionally `progressCb`);
    the remaining keyword arguments exist so tests can inject collaborators.

    Args:
        connectionId: Id of the UserConnection to bootstrap.
        progressCb: Optional callback receiving a percentage and a message.
        adapter: SharePoint adapter exposing async `browse` and `download`.
        connection: Connection object; `mandateId` and `userId` are read from it.
        knowledgeService: Service exposing async `requestIngestion`.
        limits: Budget/policy limits. When omitted, defaults are used with
            `neutralize` taken from the stored connection preferences.
        runExtractionFn: Callable `(bytes, name, mime, options)` performing
            content extraction. When omitted, the standard extraction
            pipeline is used.

    Returns:
        The summary dict built by `_finalizeResult`: `connectionId`, `indexed`,
        `skippedDuplicate`, `skippedPolicy`, `failed`, `bytesProcessed`,
        `durationMs` and the first 20 `errors`.

    Raises:
        Whatever `_resolveDependencies` raises (e.g. `ValueError` for an
        unknown connection or missing token) when a collaborator has to be
        resolved. Site discovery and per-site walk errors are not raised;
        they are recorded in `errors`.
    """
    if limits is None:
        # Preferences are only needed to derive default limits, so they are
        # not loaded when the caller supplies its own limits.
        from modules.serviceCenter.services.serviceKnowledge.subConnectorPrefs import loadConnectionPrefs
        prefs = loadConnectionPrefs(connectionId)
        limits = SharepointBootstrapLimits(neutralize=prefs.neutralizeBeforeEmbed)

    # NOTE: despite the name this is a time.time() timestamp in seconds;
    # _finalizeResult converts the elapsed time to milliseconds.
    startMs = time.time()
    result = SharepointBootstrapResult(connectionId=connectionId)

    logger.info(
        "ingestion.connection.bootstrap.started part=sharepoint connectionId=%s",
        connectionId,
        extra={
            "event": "ingestion.connection.bootstrap.started",
            "part": "sharepoint",
            "connectionId": connectionId,
        },
    )

    if adapter is None or knowledgeService is None or connection is None:
        resolvedAdapter, resolvedConnection, resolvedKnowledge = await _resolveDependencies(connectionId)
        # Fill only the gaps so explicitly injected collaborators are kept
        # instead of being silently replaced by the resolved ones.
        if adapter is None:
            adapter = resolvedAdapter
        if connection is None:
            connection = resolvedConnection
        if knowledgeService is None:
            knowledgeService = resolvedKnowledge

    if runExtractionFn is None:
        from modules.serviceCenter.services.serviceExtraction.subPipeline import runExtraction
        from modules.serviceCenter.services.serviceExtraction.subRegistry import (
            ExtractorRegistry, ChunkerRegistry,
        )
        extractorRegistry = ExtractorRegistry()
        chunkerRegistry = ChunkerRegistry()

        def runExtractionFn(bytesData, name, mime, options):  # type: ignore[no-redef]
            return runExtraction(extractorRegistry, chunkerRegistry, bytesData, name, mime, options)

    mandateId = str(getattr(connection, "mandateId", "") or "")
    userId = str(getattr(connection, "userId", "") or "")

    try:
        sites = await adapter.browse("/", limit=limits.maxSites)
    except Exception as exc:
        logger.error("sharepoint site discovery failed for %s: %s", connectionId, exc, exc_info=True)
        result.errors.append(f"site_discovery: {exc}")
        return _finalizeResult(connectionId, result, startMs)

    # `sites or []` tolerates an adapter that returns None for "no sites".
    for site in (sites or [])[: limits.maxSites]:
        # Same budgets as the per-entry checks in _walkFolder; checking them
        # here avoids a pointless browse call per remaining site.
        if result.indexed + result.skippedDuplicate >= limits.maxItems:
            break
        if result.bytesProcessed >= limits.maxBytes:
            break
        sitePath = getattr(site, "path", "") or ""
        try:
            await _walkFolder(
                adapter=adapter,
                knowledgeService=knowledgeService,
                runExtractionFn=runExtractionFn,
                connectionId=connectionId,
                mandateId=mandateId,
                userId=userId,
                folderPath=sitePath,
                depth=0,
                limits=limits,
                result=result,
                progressCb=progressCb,
            )
        except Exception as exc:
            # One broken site must not abort the remaining sites.
            logger.error("sharepoint walk failed for site %s: %s", sitePath, exc, exc_info=True)
            result.errors.append(f"walk({sitePath}): {exc}")

    return _finalizeResult(connectionId, result, startMs)


async def _resolveDependencies(connectionId: str):
    """Resolve the production collaborators for a bootstrap run.

    Loads the UserConnection through the root interface, obtains a fresh
    token, builds the Microsoft connector's "sharepoint" service adapter and
    creates a KnowledgeService bound to a root-user context for the
    connection's mandate.

    Root privileges are used on purpose: the bootstrap is a system operation
    kicked off by an authenticated user via callback and must not be gated by
    a per-user service-center context.

    Returns:
        Tuple ``(adapter, connection, knowledgeService)``.

    Raises:
        ValueError: If the connection does not exist or no usable access
            token is available for it.
    """
    from modules.interfaces.interfaceDbApp import getRootInterface
    from modules.auth import TokenManager
    from modules.connectors.providerMsft.connectorMsft import MsftConnector
    from modules.serviceCenter import getService
    from modules.serviceCenter.context import ServiceCenterContext
    from modules.security.rootAccess import getRootUser

    userConnection = getRootInterface().getUserConnectionById(connectionId)
    if userConnection is None:
        raise ValueError(f"UserConnection not found: {connectionId}")

    freshToken = TokenManager().getFreshToken(connectionId)
    accessToken = freshToken.tokenAccess if freshToken else None
    if not accessToken:
        raise ValueError(f"No valid token for connection {connectionId}")

    sharepointAdapter = MsftConnector(userConnection, accessToken).getServiceAdapter("sharepoint")

    rootContext = ServiceCenterContext(
        user=getRootUser(),
        mandate_id=str(getattr(userConnection, "mandateId", "") or ""),
    )
    return sharepointAdapter, userConnection, getService("knowledge", rootContext)


async def _walkFolder(
    *,
    adapter,
    knowledgeService,
    runExtractionFn,
    connectionId: str,
    mandateId: str,
    userId: str,
    folderPath: str,
    depth: int,
    limits: SharepointBootstrapLimits,
    result: SharepointBootstrapResult,
    progressCb: Optional[Callable[[int, Optional[str]], None]],
) -> None:
    """Recursively walk one folder and ingest the files it contains.

    Folders are descended depth-first in listing order; files pass through
    the MIME-prefix and size policy before being handed to `_ingestOne`.
    The walk returns early once the item budget (indexed + duplicate-skipped)
    or the byte budget in `limits` is exhausted, and does nothing for folders
    deeper than `limits.maxDepth`. A failing `browse` is logged, recorded in
    `result.errors` and ends this folder only. `result` is updated in place.
    """
    if depth > limits.maxDepth:
        return

    try:
        children = await adapter.browse(folderPath)
    except Exception as exc:
        logger.warning("sharepoint browse %s failed: %s", folderPath, exc)
        result.errors.append(f"browse({folderPath}): {exc}")
        return

    # Keyword arguments that are identical for every nested walk and every
    # ingest call issued from this folder.
    shared = dict(
        adapter=adapter,
        knowledgeService=knowledgeService,
        runExtractionFn=runExtractionFn,
        connectionId=connectionId,
        mandateId=mandateId,
        userId=userId,
        limits=limits,
        result=result,
        progressCb=progressCb,
    )

    for child in children:
        # Budgets are re-checked per entry because nested walks and ingests
        # advance the shared counters.
        itemBudgetSpent = result.indexed + result.skippedDuplicate >= limits.maxItems
        if itemBudgetSpent or result.bytesProcessed >= limits.maxBytes:
            return

        childPath = getattr(child, "path", "") or ""

        if getattr(child, "isFolder", False):
            await _walkFolder(folderPath=childPath, depth=depth + 1, **shared)
            continue

        mimeType = getattr(child, "mimeType", None) or "application/octet-stream"
        if any(mimeType.startswith(prefix) for prefix in limits.skipMimePrefixes):
            result.skippedPolicy += 1
            continue

        # A reported size of 0 means "unknown" here and is never rejected.
        reportedSize = int(getattr(child, "size", 0) or 0)
        if reportedSize and reportedSize > limits.maxFileSize:
            result.skippedPolicy += 1
            continue

        itemMeta = getattr(child, "metadata", {}) or {}
        await _ingestOne(
            entry=child,
            entryPath=childPath,
            mimeType=mimeType,
            # Fall back to the path when the listing carries no item id.
            externalItemId=itemMeta.get("id") or childPath,
            revision=itemMeta.get("revision") or itemMeta.get("lastModifiedDateTime"),
            **shared,
        )


async def _ingestOne(
    *,
    adapter,
    knowledgeService,
    runExtractionFn,
    connectionId: str,
    mandateId: str,
    userId: str,
    entry,
    entryPath: str,
    mimeType: str,
    externalItemId: str,
    revision: Optional[str],
    limits: SharepointBootstrapLimits,
    result: SharepointBootstrapResult,
    progressCb: Optional[Callable[[int, Optional[str]], None]],
) -> None:
    """Download, extract and ingest a single SharePoint file.

    `result` is updated in place; nothing is returned and no per-file error
    is raised:

    * download error or empty payload → `failed`
    * extraction error (including conversion to content objects) → `failed`
    * extraction yields no content objects → `skippedPolicy`
    * ingestion raises or reports any status other than "indexed" /
      "duplicate" → `failed`
    * ingestion status "indexed" / "duplicate" → `indexed` / `skippedDuplicate`

    `revision` is passed to the ingestion job as `contentVersion` and recorded
    in the provenance. When a `progressCb` is supplied, progress is reported
    each time the processed count (indexed + duplicate-skipped) reaches a
    multiple of 50.
    """
    from modules.serviceCenter.services.serviceKnowledge.mainServiceKnowledge import IngestionJob

    syntheticFileId = _syntheticFileId(connectionId, externalItemId)
    fileName = getattr(entry, "name", "") or externalItemId

    try:
        fileBytes = await adapter.download(entryPath)
    except Exception as exc:
        logger.warning("sharepoint download %s failed: %s", entryPath, exc)
        result.failed += 1
        result.errors.append(f"download({entryPath}): {exc}")
        return
    if not fileBytes:
        # Empty/missing payload: counted as failed, no error entry recorded.
        result.failed += 1
        return

    result.bytesProcessed += len(fileBytes)

    try:
        extracted = runExtractionFn(
            fileBytes, fileName, mimeType,
            ExtractionOptions(mergeStrategy=None),
        )
        # Converted inside the try so a malformed extraction result fails only
        # this file instead of propagating and aborting the whole site walk.
        contentObjects = _toContentObjects(extracted, fileName)
    except Exception as exc:
        logger.warning("sharepoint extraction %s failed: %s", entryPath, exc)
        result.failed += 1
        result.errors.append(f"extract({entryPath}): {exc}")
        return

    if not contentObjects:
        result.skippedPolicy += 1
        return

    provenance: Dict[str, Any] = {
        "connectionId": connectionId,
        "authority": "msft",
        "service": "sharepoint",
        "externalItemId": externalItemId,
        "externalPath": entryPath,
        "revision": revision,
    }
    try:
        handle = await knowledgeService.requestIngestion(
            IngestionJob(
                sourceKind="sharepoint_item",
                sourceId=syntheticFileId,
                fileName=fileName,
                mimeType=mimeType,
                userId=userId,
                mandateId=mandateId,
                contentObjects=contentObjects,
                contentVersion=revision,
                neutralize=limits.neutralize,
                provenance=provenance,
            )
        )
    except Exception as exc:
        logger.error("sharepoint ingestion %s failed: %s", entryPath, exc, exc_info=True)
        result.failed += 1
        result.errors.append(f"ingest({entryPath}): {exc}")
        return

    # Tracks whether THIS item moved the processed counter. Without it the
    # progress check below would fire at processed == 0 and re-fire for every
    # failed item while the counter sits on a multiple of 50.
    advanced = False
    if handle.status == "duplicate":
        result.skippedDuplicate += 1
        advanced = True
    elif handle.status == "indexed":
        result.indexed += 1
        advanced = True
    else:
        result.failed += 1
        if handle.error:
            result.errors.append(f"ingest({entryPath}): {handle.error}")

    processed = result.indexed + result.skippedDuplicate
    if progressCb is not None and advanced and processed % 50 == 0:
        try:
            progressCb(
                # Maps processed/maxItems onto the 10..90 % band.
                min(90, 10 + int(80 * processed / max(1, limits.maxItems))),
                f"sharepoint processed={processed}",
            )
        except Exception as exc:
            # Progress reporting is best-effort and must never fail the
            # ingestion, but the failure should stay visible when debugging.
            logger.debug(
                "sharepoint progress callback failed for %s: %s",
                connectionId, exc, exc_info=True,
            )
        logger.info(
            "ingestion.connection.bootstrap.progress part=sharepoint processed=%d skippedDup=%d failed=%d",
            processed, result.skippedDuplicate, result.failed,
            extra={
                "event": "ingestion.connection.bootstrap.progress",
                "part": "sharepoint",
                "connectionId": connectionId,
                "processed": processed,
                "skippedDup": result.skippedDuplicate,
                "failed": result.failed,
            },
        )

    # Yield so the event loop can interleave other tasks (download/extract are
    # CPU-ish and extraction uses sync libs; cooperative scheduling prevents
    # starving other workers).
    await asyncio.sleep(0)


def _finalizeResult(connectionId: str, result: SharepointBootstrapResult, startMs: float) -> Dict[str, Any]:
    """Log the "bootstrap done" event and return the run summary as a dict.

    `startMs` is the `time.time()` timestamp (seconds) taken when the run
    started; the elapsed time is converted to whole milliseconds. The returned
    dict carries the counters from `result`, `durationMs`, and at most the
    first 20 error messages.
    """
    elapsedMs = int((time.time() - startMs) * 1000)

    # Counter snapshot, keyed the way the structured log event names them.
    counters = {
        "indexed": result.indexed,
        "skippedDup": result.skippedDuplicate,
        "skippedPolicy": result.skippedPolicy,
        "failed": result.failed,
    }
    logExtra = {
        "event": "ingestion.connection.bootstrap.done",
        "part": "sharepoint",
        "connectionId": connectionId,
        **counters,
        "durationMs": elapsedMs,
    }
    logger.info(
        "ingestion.connection.bootstrap.done part=sharepoint connectionId=%s indexed=%d skippedDup=%d skippedPolicy=%d failed=%d durationMs=%d",
        connectionId,
        counters["indexed"],
        counters["skippedDup"],
        counters["skippedPolicy"],
        counters["failed"],
        elapsedMs,
        extra=logExtra,
    )

    summary: Dict[str, Any] = dict(
        connectionId=result.connectionId,
        indexed=counters["indexed"],
        skippedDuplicate=counters["skippedDup"],
        skippedPolicy=counters["skippedPolicy"],
        failed=counters["failed"],
        bytesProcessed=result.bytesProcessed,
        durationMs=elapsedMs,
        errors=result.errors[:20],
    )
    return summary