gateway/modules/serviceCenter/services/serviceKnowledge/subConnectorSyncGdrive.py

# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
"""Google Drive bootstrap for the unified knowledge ingestion lane.

Mirrors the SharePoint pilot (see subConnectorSyncSharepoint.py). Walks the
user's *My Drive* tree from the virtual `root` folder, downloads each
file-like item via `DriveAdapter.download` (which handles native Google docs
via export), runs the standard extraction pipeline and routes results through
`KnowledgeService.requestIngestion` with `sourceKind="gdrive_item"` and
`contentVersion = modifiedTime` (monotonic per-revision).
"""

from __future__ import annotations

import asyncio
import hashlib
import logging
import time
from dataclasses import dataclass, field
from datetime import datetime, timedelta, timezone
from typing import Any, Callable, Dict, List, Optional

from modules.datamodels.datamodelExtraction import ExtractionOptions

logger = logging.getLogger(__name__)

MAX_ITEMS_DEFAULT = 500
MAX_BYTES_DEFAULT = 200 * 1024 * 1024
MAX_FILE_SIZE_DEFAULT = 25 * 1024 * 1024
SKIP_MIME_PREFIXES_DEFAULT = ("video/", "audio/")
MAX_DEPTH_DEFAULT = 4
MAX_AGE_DAYS_DEFAULT = 365

FOLDER_MIME = "application/vnd.google-apps.folder"


@dataclass
class GdriveBootstrapLimits:
    maxItems: int = MAX_ITEMS_DEFAULT
    maxBytes: int = MAX_BYTES_DEFAULT
    maxFileSize: int = MAX_FILE_SIZE_DEFAULT
    skipMimePrefixes: tuple = SKIP_MIME_PREFIXES_DEFAULT
    maxDepth: int = MAX_DEPTH_DEFAULT
    maxAgeDays: Optional[int] = MAX_AGE_DAYS_DEFAULT
    neutralize: bool = False


@dataclass
class GdriveBootstrapResult:
    connectionId: str
    indexed: int = 0
    skippedDuplicate: int = 0
    skippedPolicy: int = 0
    failed: int = 0
    bytesProcessed: int = 0
    errors: List[str] = field(default_factory=list)


def _syntheticFileId(connectionId: str, externalItemId: str) -> str:
    token = hashlib.sha256(f"{connectionId}:{externalItemId}".encode("utf-8")).hexdigest()[:16]
    return f"gd:{connectionId[:8]}:{token}"


def _toContentObjects(extracted, fileName: str) -> List[Dict[str, Any]]:
    parts = getattr(extracted, "parts", None) or []
    out: List[Dict[str, Any]] = []
    for part in parts:
        data = getattr(part, "data", None) or ""
        if not data or not str(data).strip():
            continue
        typeGroup = getattr(part, "typeGroup", "text") or "text"
        contentType = "text"
        if typeGroup == "image":
            contentType = "image"
        elif typeGroup in ("binary", "container"):
            contentType = "other"
        out.append({
            "contentObjectId": getattr(part, "id", ""),
            "contentType": contentType,
            "data": data,
            "contextRef": {
                "containerPath": fileName,
                "location": getattr(part, "label", None) or "file",
                **(getattr(part, "metadata", None) or {}),
            },
        })
    return out


def _isRecent(modifiedIso: Optional[str], maxAgeDays: Optional[int]) -> bool:
    if not maxAgeDays:
        return True
    if not modifiedIso:
        return True
    try:
        ts = datetime.fromisoformat(modifiedIso.replace("Z", "+00:00"))
    except Exception:
        return True
    cutoff = datetime.now(timezone.utc) - timedelta(days=maxAgeDays)
    if ts.tzinfo is None:
        ts = ts.replace(tzinfo=timezone.utc)
    return ts >= cutoff


async def bootstrapGdrive(
    connectionId: str,
    *,
    dataSources: Optional[List[Dict[str, Any]]] = None,
    progressCb: Optional[Any] = None,
    adapter: Any = None,
    connection: Any = None,
    knowledgeService: Any = None,
    limits: Optional[GdriveBootstrapLimits] = None,
    runExtractionFn: Optional[Callable[..., Any]] = None,
) -> Dict[str, Any]:
    """Walk My Drive starting from the virtual root folder.

    Iterates only over explicitly provided dataSources (ragIndexEnabled=true).
    Each DataSource defines the root path + neutralize policy for its subtree.
    """
    if not dataSources:
        return {"connectionId": connectionId, "skipped": True, "reason": "no_datasources"}

    if not limits:
        limits = GdriveBootstrapLimits()

    startMs = time.time()
    result = GdriveBootstrapResult(connectionId=connectionId)

    logger.info(
        "ingestion.connection.bootstrap.started part=gdrive connectionId=%s dataSources=%d",
        connectionId, len(dataSources),
        extra={
            "event": "ingestion.connection.bootstrap.started",
            "part": "gdrive",
            "connectionId": connectionId,
            "dataSourceCount": len(dataSources),
        },
    )

    if adapter is None or knowledgeService is None or connection is None:
        adapter, connection, knowledgeService = await _resolveDependencies(connectionId)
    if runExtractionFn is None:
        from modules.serviceCenter.services.serviceExtraction.subPipeline import runExtraction
        from modules.serviceCenter.services.serviceExtraction.subRegistry import (
            ExtractorRegistry, ChunkerRegistry,
        )
        extractorRegistry = ExtractorRegistry()
        chunkerRegistry = ChunkerRegistry()

        def runExtractionFn(bytesData, name, mime, options):  # type: ignore[no-redef]
            return runExtraction(extractorRegistry, chunkerRegistry, bytesData, name, mime, options)

    mandateId = str(getattr(connection, "mandateId", "") or "") if connection is not None else ""
    userId = str(getattr(connection, "userId", "") or "") if connection is not None else ""

    cancelled = False
    for ds in dataSources:
        if result.indexed + result.skippedDuplicate >= limits.maxItems:
            break
        if progressCb and hasattr(progressCb, "isCancelled") and progressCb.isCancelled():
            cancelled = True
            break

        dsPath = ds.get("path", "/")
        dsId = ds.get("id", "")
        dsNeutralize = ds.get("neutralize", False)
        dsMaxAgeDays = ds.get("maxAgeDays", limits.maxAgeDays)
        dsLimits = GdriveBootstrapLimits(
            maxItems=limits.maxItems,
            maxBytes=limits.maxBytes,
            maxFileSize=limits.maxFileSize,
            skipMimePrefixes=limits.skipMimePrefixes,
            maxDepth=limits.maxDepth,
            maxAgeDays=dsMaxAgeDays,
            neutralize=dsNeutralize,
        )

        try:
            await _walkFolder(
                adapter=adapter,
                knowledgeService=knowledgeService,
                runExtractionFn=runExtractionFn,
                connectionId=connectionId,
                mandateId=mandateId,
                userId=userId,
                folderPath=dsPath,
                depth=0,
                limits=dsLimits,
                result=result,
                progressCb=progressCb,
                dataSourceId=dsId,
            )
        except Exception as exc:
            logger.error("gdrive walk failed for ds %s path %s: %s", dsId, dsPath, exc, exc_info=True)
            result.errors.append(f"walk({dsPath}): {exc}")

    finalResult = _finalizeResult(connectionId, result, startMs)
    if cancelled:
        finalResult["cancelled"] = True
    return finalResult


async def _resolveDependencies(connectionId: str):
    from modules.interfaces.interfaceDbApp import getRootInterface
    from modules.auth import TokenManager
    from modules.connectors.providerGoogle.connectorGoogle import GoogleConnector
    from modules.serviceCenter import getService
    from modules.serviceCenter.context import ServiceCenterContext
    from modules.security.rootAccess import getRootUser

    rootInterface = getRootInterface()
    connection = rootInterface.getUserConnectionById(connectionId)
    if connection is None:
        raise ValueError(f"UserConnection not found: {connectionId}")

    token = TokenManager().getFreshToken(connectionId)
    if not token or not token.tokenAccess:
        raise ValueError(f"No valid token for connection {connectionId}")

    provider = GoogleConnector(connection, token.tokenAccess)
    adapter = provider.getServiceAdapter("drive")

    rootUser = getRootUser()
    ctx = ServiceCenterContext(
        user=rootUser,
        mandate_id=str(getattr(connection, "mandateId", "") or ""),
    )
    knowledgeService = getService("knowledge", ctx)
    return adapter, connection, knowledgeService


async def _walkFolder(
    *,
    adapter,
    knowledgeService,
    runExtractionFn,
    connectionId: str,
    mandateId: str,
    userId: str,
    folderPath: str,
    depth: int,
    limits: GdriveBootstrapLimits,
    result: GdriveBootstrapResult,
    progressCb: Optional[Any],
    dataSourceId: str = "",
) -> None:
    if depth > limits.maxDepth:
        return
    if progressCb and hasattr(progressCb, "isCancelled") and progressCb.isCancelled():
        return
    try:
        entries = await adapter.browse(folderPath)
    except Exception as exc:
        logger.warning("gdrive browse %s failed: %s", folderPath, exc)
        result.errors.append(f"browse({folderPath}): {exc}")
        return

    for entry in entries:
        if result.indexed + result.skippedDuplicate >= limits.maxItems:
            return
        if result.bytesProcessed >= limits.maxBytes:
            return
        if progressCb and hasattr(progressCb, "isCancelled") and (result.indexed + result.skippedDuplicate) % 50 == 0 and progressCb.isCancelled():
            return

        entryPath = getattr(entry, "path", "") or ""
        metadata = getattr(entry, "metadata", {}) or {}
        mimeType = getattr(entry, "mimeType", None) or metadata.get("mimeType")

        if getattr(entry, "isFolder", False) or mimeType == FOLDER_MIME:
            await _walkFolder(
                adapter=adapter,
                knowledgeService=knowledgeService,
                runExtractionFn=runExtractionFn,
                connectionId=connectionId,
                mandateId=mandateId,
                userId=userId,
                folderPath=entryPath,
                depth=depth + 1,
                limits=limits,
                result=result,
                progressCb=progressCb,
                dataSourceId=dataSourceId,
            )
            continue

        effectiveMime = mimeType or "application/octet-stream"
        if any(effectiveMime.startswith(prefix) for prefix in limits.skipMimePrefixes):
            result.skippedPolicy += 1
            continue
        size = int(getattr(entry, "size", 0) or 0)
        if size and size > limits.maxFileSize:
            result.skippedPolicy += 1
            continue
        modifiedTime = metadata.get("modifiedTime")
        if not _isRecent(modifiedTime, limits.maxAgeDays):
            result.skippedPolicy += 1
            continue

        externalItemId = metadata.get("id") or entryPath
        revision = modifiedTime

        await _ingestOne(
            adapter=adapter,
            knowledgeService=knowledgeService,
            runExtractionFn=runExtractionFn,
            connectionId=connectionId,
            mandateId=mandateId,
            userId=userId,
            entry=entry,
            entryPath=entryPath,
            mimeType=effectiveMime,
            externalItemId=externalItemId,
            revision=revision,
            limits=limits,
            result=result,
            progressCb=progressCb,
            dataSourceId=dataSourceId,
        )


async def _ingestOne(
    *,
    adapter,
    knowledgeService,
    runExtractionFn,
    connectionId: str,
    mandateId: str,
    userId: str,
    entry,
    entryPath: str,
    mimeType: str,
    externalItemId: str,
    revision: Optional[str],
    limits: GdriveBootstrapLimits,
    result: GdriveBootstrapResult,
    progressCb: Optional[Any],
    dataSourceId: str = "",
) -> None:
    from modules.serviceCenter.services.serviceKnowledge.mainServiceKnowledge import IngestionJob

    syntheticFileId = _syntheticFileId(connectionId, externalItemId)
    fileName = getattr(entry, "name", "") or externalItemId

    try:
        downloaded = await adapter.download(entryPath)
    except Exception as exc:
        logger.warning("gdrive download %s failed: %s", entryPath, exc)
        result.failed += 1
        result.errors.append(f"download({entryPath}): {exc}")
        return

    fileBytes: bytes
    if isinstance(downloaded, (bytes, bytearray)):
        fileBytes = bytes(downloaded)
    else:
        fileBytes = bytes(getattr(downloaded, "data", b"") or b"")
        if getattr(downloaded, "mimeType", None):
            mimeType = downloaded.mimeType
    if not fileBytes:
        result.failed += 1
        return
    if len(fileBytes) > limits.maxFileSize:
        result.skippedPolicy += 1
        return

    result.bytesProcessed += len(fileBytes)

    try:
        extracted = runExtractionFn(
            fileBytes, fileName, mimeType,
            ExtractionOptions(mergeStrategy=None),
        )
    except Exception as exc:
        logger.warning("gdrive extraction %s failed: %s", entryPath, exc)
        result.failed += 1
        result.errors.append(f"extract({entryPath}): {exc}")
        return

    contentObjects = _toContentObjects(extracted, fileName)
    if not contentObjects:
        result.skippedPolicy += 1
        return

    provenance: Dict[str, Any] = {
        "connectionId": connectionId,
        "dataSourceId": dataSourceId,
        "authority": "google",
        "service": "drive",
        "externalItemId": externalItemId,
        "entryPath": entryPath,
        "tier": "body",
    }
    try:
        handle = await knowledgeService.requestIngestion(
            IngestionJob(
                sourceKind="gdrive_item",
                sourceId=syntheticFileId,
                fileName=fileName,
                mimeType=mimeType,
                userId=userId,
                mandateId=mandateId,
                contentObjects=contentObjects,
                contentVersion=revision,
                neutralize=limits.neutralize,
                provenance=provenance,
            )
        )
    except Exception as exc:
        logger.error("gdrive ingestion %s failed: %s", entryPath, exc, exc_info=True)
        result.failed += 1
        result.errors.append(f"ingest({entryPath}): {exc}")
        return

    if handle.status == "duplicate":
        result.skippedDuplicate += 1
    elif handle.status == "indexed":
        result.indexed += 1
    else:
        result.failed += 1
        if handle.error:
            result.errors.append(f"ingest({entryPath}): {handle.error}")

    if progressCb is not None and (result.indexed + result.skippedDuplicate) % 50 == 0:
        processed = result.indexed + result.skippedDuplicate
        try:
            progressCb(
                min(90, 10 + int(80 * processed / max(1, limits.maxItems))),
                f"gdrive processed={processed}",
            )
        except Exception:
            pass
        logger.info(
            "ingestion.connection.bootstrap.progress part=gdrive processed=%d skippedDup=%d failed=%d",
            processed, result.skippedDuplicate, result.failed,
            extra={
                "event": "ingestion.connection.bootstrap.progress",
                "part": "gdrive",
                "connectionId": connectionId,
                "processed": processed,
                "skippedDup": result.skippedDuplicate,
                "failed": result.failed,
            },
        )

    await asyncio.sleep(0)


def _finalizeResult(connectionId: str, result: GdriveBootstrapResult, startMs: float) -> Dict[str, Any]:
    durationMs = int((time.time() - startMs) * 1000)
    logger.info(
        "ingestion.connection.bootstrap.done part=gdrive connectionId=%s indexed=%d skippedDup=%d skippedPolicy=%d failed=%d bytes=%d durationMs=%d",
        connectionId,
        result.indexed, result.skippedDuplicate, result.skippedPolicy,
        result.failed, result.bytesProcessed, durationMs,
        extra={
            "event": "ingestion.connection.bootstrap.done",
            "part": "gdrive",
            "connectionId": connectionId,
            "indexed": result.indexed,
            "skippedDup": result.skippedDuplicate,
            "skippedPolicy": result.skippedPolicy,
            "failed": result.failed,
            "bytes": result.bytesProcessed,
            "durationMs": durationMs,
        },
    )
    return {
        "connectionId": result.connectionId,
        "indexed": result.indexed,
        "skippedDuplicate": result.skippedDuplicate,
        "skippedPolicy": result.skippedPolicy,
        "failed": result.failed,
        "bytesProcessed": result.bytesProcessed,
        "durationMs": durationMs,
        "errors": result.errors[:20],
    }