# Copyright (c) 2025 Patrick Motsch
|
|
# All rights reserved.
|
|
"""Google Drive bootstrap for the unified knowledge ingestion lane.
|
|
|
|
Mirrors the SharePoint pilot (see subConnectorSyncSharepoint.py). Walks the
|
|
user's *My Drive* tree from the virtual `root` folder, downloads each
|
|
file-like item via `DriveAdapter.download` (which handles native Google docs
|
|
via export), runs the standard extraction pipeline and routes results through
|
|
`KnowledgeService.requestIngestion` with `sourceKind="gdrive_item"` and
|
|
`contentVersion = modifiedTime` (monotonic per-revision).
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import hashlib
|
|
import logging
|
|
import time
|
|
from dataclasses import dataclass, field
|
|
from datetime import datetime, timedelta, timezone
|
|
from typing import Any, Callable, Dict, List, Optional
|
|
|
|
from modules.datamodels.datamodelExtraction import ExtractionOptions
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
# Default budgets for one bootstrap run; overridable via GdriveBootstrapLimits.
MAX_ITEMS_DEFAULT = 500
# Total download budget per run: 200 MiB.
MAX_BYTES_DEFAULT = 200 * 1024 * 1024
# Per-file size ceiling: 25 MiB.
MAX_FILE_SIZE_DEFAULT = 25 * 1024 * 1024
# Media files are skipped outright by mime prefix (see _walkFolder).
SKIP_MIME_PREFIXES_DEFAULT = ("video/", "audio/")
# Maximum folder recursion depth below the virtual Drive root.
MAX_DEPTH_DEFAULT = 4
# Default age window (days) for the modifiedTime filter.
MAX_AGE_DAYS_DEFAULT = 365

# Google Drive uses virtual mime-types for folders and non-downloadable assets.
FOLDER_MIME = "application/vnd.google-apps.folder"
|
|
|
|
|
|
@dataclass
class GdriveBootstrapLimits:
    """Budgets and policy switches governing a single Drive bootstrap run."""

    # Cap on processed items (indexed + duplicates) across the whole walk.
    maxItems: int = MAX_ITEMS_DEFAULT
    # Cap on total downloaded bytes across the whole walk.
    maxBytes: int = MAX_BYTES_DEFAULT
    # Per-file ceiling; checked against both the listed size and the payload.
    maxFileSize: int = MAX_FILE_SIZE_DEFAULT
    # Mime-type prefixes skipped outright (e.g. "video/", "audio/").
    skipMimePrefixes: tuple[str, ...] = SKIP_MIME_PREFIXES_DEFAULT
    # Maximum folder recursion depth below the virtual root.
    maxDepth: int = MAX_DEPTH_DEFAULT
    # Only ingest files modified within the last N days. None disables filter.
    maxAgeDays: Optional[int] = MAX_AGE_DAYS_DEFAULT
    # Pass-through to IngestionJob.neutralize
    neutralize: bool = False
    # Whether to skip binary/non-text files
    # NOTE(review): set from prefs but not referenced elsewhere in this module
    # — presumably consumed downstream; confirm it is not dead config.
    filesIndexBinaries: bool = True
|
|
|
|
|
|
@dataclass
class GdriveBootstrapResult:
    """Mutable counters accumulated while walking one connection's Drive."""

    # Connection this run belongs to; echoed into the summary payload.
    connectionId: str
    # Items newly indexed by the knowledge service.
    indexed: int = 0
    # Items the knowledge service reported as already-known duplicates.
    skippedDuplicate: int = 0
    # Items filtered out by policy (mime/size/age/empty extraction).
    skippedPolicy: int = 0
    # Items that errored during download/extract/ingest.
    failed: int = 0
    # Total bytes of downloaded payloads that reached extraction.
    bytesProcessed: int = 0
    # Human-readable error strings (truncated to 20 in the final summary).
    errors: List[str] = field(default_factory=list)
|
|
|
|
|
|
def _syntheticFileId(connectionId: str, externalItemId: str) -> str:
    """Build a stable synthetic file id for a Drive item.

    The id carries a readable connection prefix plus a truncated SHA-256
    digest of ``connectionId:externalItemId`` for collision resistance.
    """
    seed = ":".join((connectionId, externalItemId)).encode("utf-8")
    digest = hashlib.sha256(seed).hexdigest()
    return "gd:{}:{}".format(connectionId[:8], digest[:16])
|
|
|
|
|
|
def _toContentObjects(extracted, fileName: str) -> List[Dict[str, Any]]:
    """Convert extraction parts into the content-object dicts ingestion expects.

    Parts with empty or whitespace-only data are dropped. ``typeGroup`` is
    folded into coarse buckets: image -> "image", binary/container ->
    "other", everything else -> "text". Part metadata is merged into the
    context reference on top of the container path and location.
    """
    objects: List[Dict[str, Any]] = []
    for part in (getattr(extracted, "parts", None) or []):
        data = getattr(part, "data", None) or ""
        if not str(data).strip():
            continue

        group = getattr(part, "typeGroup", "text") or "text"
        if group == "image":
            kind = "image"
        elif group in ("binary", "container"):
            kind = "other"
        else:
            kind = "text"

        contextRef: Dict[str, Any] = {
            "containerPath": fileName,
            "location": getattr(part, "label", None) or "file",
        }
        contextRef.update(getattr(part, "metadata", None) or {})

        objects.append({
            "contentObjectId": getattr(part, "id", ""),
            "contentType": kind,
            "data": data,
            "contextRef": contextRef,
        })
    return objects
|
|
|
|
|
|
def _isRecent(modifiedIso: Optional[str], maxAgeDays: Optional[int]) -> bool:
    """Return True when the RFC 3339 timestamp falls inside the age window.

    Permissive by design: a disabled filter (None or 0 days), a missing
    timestamp (Drive native docs sometimes omit it on export), or an
    unparseable timestamp all count as "recent".
    """
    if not maxAgeDays or not modifiedIso:
        return True
    try:
        # Google returns RFC 3339 with `Z` or an explicit offset; normalize Z.
        parsed = datetime.fromisoformat(modifiedIso.replace("Z", "+00:00"))
    except Exception:
        # Don't drop a file over a formatting quirk.
        return True
    if parsed.tzinfo is None:
        # Naive timestamps are assumed to be UTC.
        parsed = parsed.replace(tzinfo=timezone.utc)
    cutoff = datetime.now(timezone.utc) - timedelta(days=maxAgeDays)
    return parsed >= cutoff
|
|
|
|
|
|
async def bootstrapGdrive(
    connectionId: str,
    *,
    progressCb: Optional[Callable[[int, Optional[str]], None]] = None,
    adapter: Any = None,
    connection: Any = None,
    knowledgeService: Any = None,
    limits: Optional[GdriveBootstrapLimits] = None,
    runExtractionFn: Optional[Callable[..., Any]] = None,
) -> Dict[str, Any]:
    """Walk My Drive starting from the virtual root folder.

    Entry point of the Google Drive bootstrap. Resolves any dependency not
    supplied by the caller (adapter/connection/knowledge service and the
    extraction function), walks the folder tree from "/" via ``_walkFolder``,
    and returns the summary dict built by ``_finalizeResult``.

    Args:
        connectionId: Id of the UserConnection to bootstrap.
        progressCb: Optional ``(percent, message)`` callback; best-effort.
        adapter / connection / knowledgeService: Injectable dependencies;
            if any is None, all three are re-resolved together.
        limits: Budget/policy overrides; derived from stored connection
            preferences when omitted.
        runExtractionFn: Injectable extraction pipeline (useful in tests);
            defaults to the registry-backed standard ``runExtraction``.

    Returns:
        Summary dict: counters, durationMs, and up to 20 error strings.
    """
    # NOTE(review): import kept function-local — presumably to avoid a
    # circular import with the knowledge service package; confirm.
    from modules.serviceCenter.services.serviceKnowledge.subConnectorPrefs import loadConnectionPrefs
    prefs = loadConnectionPrefs(connectionId)

    if not limits:
        # Derive limits from user prefs; maxAgeDays <= 0 disables the age filter.
        limits = GdriveBootstrapLimits(
            maxAgeDays=prefs.maxAgeDays if prefs.maxAgeDays > 0 else None,
            neutralize=prefs.neutralizeBeforeEmbed,
            filesIndexBinaries=prefs.filesIndexBinaries,
        )

    startMs = time.time()
    result = GdriveBootstrapResult(connectionId=connectionId)

    logger.info(
        "ingestion.connection.bootstrap.started part=gdrive connectionId=%s",
        connectionId,
        extra={
            "event": "ingestion.connection.bootstrap.started",
            "part": "gdrive",
            "connectionId": connectionId,
        },
    )

    # All-or-nothing: the three dependencies come from the same connection
    # record, so one missing dependency triggers a full resolve of all three.
    if adapter is None or knowledgeService is None or connection is None:
        adapter, connection, knowledgeService = await _resolveDependencies(connectionId)
    if runExtractionFn is None:
        from modules.serviceCenter.services.serviceExtraction.subPipeline import runExtraction
        from modules.serviceCenter.services.serviceExtraction.subRegistry import (
            ExtractorRegistry, ChunkerRegistry,
        )
        extractorRegistry = ExtractorRegistry()
        chunkerRegistry = ChunkerRegistry()

        # Closure binds the freshly built registries behind the uniform
        # (bytes, name, mime, options) signature the walk expects.
        def runExtractionFn(bytesData, name, mime, options):  # type: ignore[no-redef]
            return runExtraction(extractorRegistry, chunkerRegistry, bytesData, name, mime, options)

    # Scope identifiers forwarded into every IngestionJob.
    mandateId = str(getattr(connection, "mandateId", "") or "") if connection is not None else ""
    userId = str(getattr(connection, "userId", "") or "") if connection is not None else ""

    try:
        await _walkFolder(
            adapter=adapter,
            knowledgeService=knowledgeService,
            runExtractionFn=runExtractionFn,
            connectionId=connectionId,
            mandateId=mandateId,
            userId=userId,
            folderPath="/",  # DriveAdapter.browse maps "" / "/" -> "root"
            depth=0,
            limits=limits,
            result=result,
            progressCb=progressCb,
        )
    except Exception as exc:
        # Per-item errors are recorded inside the walk; this only catches
        # tree-level failures. The partial result is still finalized below.
        logger.error("gdrive walk failed for %s: %s", connectionId, exc, exc_info=True)
        result.errors.append(f"walk: {exc}")

    return _finalizeResult(connectionId, result, startMs)
|
|
|
|
|
|
async def _resolveDependencies(connectionId: str):
    """Build the (adapter, connection, knowledgeService) triple for a connection.

    Loads the UserConnection record, obtains a fresh OAuth token, wraps it in
    a GoogleConnector to get the Drive adapter, and opens the knowledge
    service under a root-user context scoped to the connection's mandate.

    Raises:
        ValueError: When the connection record or a usable token is missing.
    """
    # NOTE(review): imports kept function-local — presumably to keep heavy
    # service wiring off the injectable/test code path; confirm.
    from modules.interfaces.interfaceDbApp import getRootInterface
    from modules.auth import TokenManager
    from modules.connectors.providerGoogle.connectorGoogle import GoogleConnector
    from modules.serviceCenter import getService
    from modules.serviceCenter.context import ServiceCenterContext
    from modules.security.rootAccess import getRootUser

    rootInterface = getRootInterface()
    connection = rootInterface.getUserConnectionById(connectionId)
    if connection is None:
        raise ValueError(f"UserConnection not found: {connectionId}")

    token = TokenManager().getFreshToken(connectionId)
    if not token or not token.tokenAccess:
        raise ValueError(f"No valid token for connection {connectionId}")

    provider = GoogleConnector(connection, token.tokenAccess)
    adapter = provider.getServiceAdapter("drive")

    # Knowledge service runs as the root user, scoped to the connection's mandate.
    rootUser = getRootUser()
    ctx = ServiceCenterContext(
        user=rootUser,
        mandate_id=str(getattr(connection, "mandateId", "") or ""),
    )
    knowledgeService = getService("knowledge", ctx)
    return adapter, connection, knowledgeService
|
|
|
|
|
|
async def _walkFolder(
    *,
    adapter,
    knowledgeService,
    runExtractionFn,
    connectionId: str,
    mandateId: str,
    userId: str,
    folderPath: str,
    depth: int,
    limits: GdriveBootstrapLimits,
    result: GdriveBootstrapResult,
    progressCb: Optional[Callable[[int, Optional[str]], None]],
) -> None:
    """Depth-first walk of one Drive folder, ingesting each file entry.

    Stops the whole walk once the item or byte budget is exhausted, and
    prunes subtrees beyond ``limits.maxDepth``. A browse failure is recorded
    on ``result`` and aborts only this subtree, not the whole run.
    """
    if depth > limits.maxDepth:
        return

    try:
        listing = await adapter.browse(folderPath)
    except Exception as exc:
        logger.warning("gdrive browse %s failed: %s", folderPath, exc)
        result.errors.append(f"browse({folderPath}): {exc}")
        return

    for item in listing:
        # Global budgets: bail out of the whole walk once either cap is hit.
        processedSoFar = result.indexed + result.skippedDuplicate
        if processedSoFar >= limits.maxItems or result.bytesProcessed >= limits.maxBytes:
            return

        itemPath = getattr(item, "path", "") or ""
        meta = getattr(item, "metadata", {}) or {}
        mime = getattr(item, "mimeType", None) or meta.get("mimeType")

        if getattr(item, "isFolder", False) or mime == FOLDER_MIME:
            # Recurse into sub-folders (depth-limited above).
            await _walkFolder(
                adapter=adapter,
                knowledgeService=knowledgeService,
                runExtractionFn=runExtractionFn,
                connectionId=connectionId,
                mandateId=mandateId,
                userId=userId,
                folderPath=itemPath,
                depth=depth + 1,
                limits=limits,
                result=result,
                progressCb=progressCb,
            )
            continue

        mime = mime or "application/octet-stream"
        declaredSize = int(getattr(item, "size", 0) or 0)
        modified = meta.get("modifiedTime")

        # Policy filters, short-circuited in order: mime prefix, listed size,
        # then age window. Any hit counts once against skippedPolicy.
        filteredOut = (
            mime.startswith(tuple(limits.skipMimePrefixes))
            or (declaredSize and declaredSize > limits.maxFileSize)
            or not _isRecent(modified, limits.maxAgeDays)
        )
        if filteredOut:
            result.skippedPolicy += 1
            continue

        await _ingestOne(
            adapter=adapter,
            knowledgeService=knowledgeService,
            runExtractionFn=runExtractionFn,
            connectionId=connectionId,
            mandateId=mandateId,
            userId=userId,
            entry=item,
            entryPath=itemPath,
            mimeType=mime,
            externalItemId=meta.get("id") or itemPath,
            revision=modified,
            limits=limits,
            result=result,
            progressCb=progressCb,
        )
|
|
|
|
|
|
async def _ingestOne(
    *,
    adapter,
    knowledgeService,
    runExtractionFn,
    connectionId: str,
    mandateId: str,
    userId: str,
    entry,
    entryPath: str,
    mimeType: str,
    externalItemId: str,
    revision: Optional[str],
    limits: GdriveBootstrapLimits,
    result: GdriveBootstrapResult,
    progressCb: Optional[Callable[[int, Optional[str]], None]],
) -> None:
    """Download, extract and ingest a single Drive file entry.

    Mutates ``result`` counters in place. Every failure path records the
    reason (where there is one) and returns without raising, so one bad
    file never aborts the surrounding walk.
    """
    # NOTE(review): import kept function-local, consistent with the other
    # deferred service imports in this module — presumably cycle avoidance.
    from modules.serviceCenter.services.serviceKnowledge.mainServiceKnowledge import IngestionJob

    syntheticFileId = _syntheticFileId(connectionId, externalItemId)
    fileName = getattr(entry, "name", "") or externalItemId

    try:
        downloaded = await adapter.download(entryPath)
    except Exception as exc:
        logger.warning("gdrive download %s failed: %s", entryPath, exc)
        result.failed += 1
        result.errors.append(f"download({entryPath}): {exc}")
        return

    # Adapter.download returns raw bytes today; guard DownloadResult shape too.
    fileBytes: bytes
    if isinstance(downloaded, (bytes, bytearray)):
        fileBytes = bytes(downloaded)
    else:
        fileBytes = bytes(getattr(downloaded, "data", b"") or b"")
        if getattr(downloaded, "mimeType", None):
            mimeType = downloaded.mimeType  # export may have changed the type
    if not fileBytes:
        # Empty payload counts as a failure but records no error string.
        result.failed += 1
        return
    if len(fileBytes) > limits.maxFileSize:
        # Listed size passed the pre-filter but the actual payload is larger.
        result.skippedPolicy += 1
        return

    # Counted before extraction so the byte budget reflects downloaded volume.
    result.bytesProcessed += len(fileBytes)

    try:
        extracted = runExtractionFn(
            fileBytes, fileName, mimeType,
            ExtractionOptions(mergeStrategy=None),
        )
    except Exception as exc:
        logger.warning("gdrive extraction %s failed: %s", entryPath, exc)
        result.failed += 1
        result.errors.append(f"extract({entryPath}): {exc}")
        return

    contentObjects = _toContentObjects(extracted, fileName)
    if not contentObjects:
        # Nothing extractable (e.g. empty document) -> policy skip, not failure.
        result.skippedPolicy += 1
        return

    try:
        handle = await knowledgeService.requestIngestion(
            IngestionJob(
                sourceKind="gdrive_item",
                sourceId=syntheticFileId,
                fileName=fileName,
                mimeType=mimeType,
                userId=userId,
                mandateId=mandateId,
                contentObjects=contentObjects,
                # modifiedTime serves as a monotonic per-revision version.
                contentVersion=revision,
                neutralize=limits.neutralize,
                provenance={
                    "connectionId": connectionId,
                    "authority": "google",
                    "service": "drive",
                    "externalItemId": externalItemId,
                    "entryPath": entryPath,
                    "tier": "body",
                },
            )
        )
    except Exception as exc:
        logger.error("gdrive ingestion %s failed: %s", entryPath, exc, exc_info=True)
        result.failed += 1
        result.errors.append(f"ingest({entryPath}): {exc}")
        return

    # Any status other than duplicate/indexed is treated as a failure.
    if handle.status == "duplicate":
        result.skippedDuplicate += 1
    elif handle.status == "indexed":
        result.indexed += 1
    else:
        result.failed += 1

    # Progress heartbeat every 50 processed items; callback errors are ignored.
    # NOTE(review): the modulo test also fires when processed == 0 (e.g. the
    # first items all failed) — confirm that is intended.
    if progressCb is not None and (result.indexed + result.skippedDuplicate) % 50 == 0:
        processed = result.indexed + result.skippedDuplicate
        try:
            progressCb(
                # Progress is mapped into the 10..90 band of the overall job.
                min(90, 10 + int(80 * processed / max(1, limits.maxItems))),
                f"gdrive processed={processed}",
            )
        except Exception:
            pass
        logger.info(
            "ingestion.connection.bootstrap.progress part=gdrive processed=%d skippedDup=%d failed=%d",
            processed, result.skippedDuplicate, result.failed,
            extra={
                "event": "ingestion.connection.bootstrap.progress",
                "part": "gdrive",
                "connectionId": connectionId,
                "processed": processed,
                "skippedDup": result.skippedDuplicate,
                "failed": result.failed,
            },
        )
|
|
|
|
|
|
def _finalizeResult(connectionId: str, result: GdriveBootstrapResult, startMs: float) -> Dict[str, Any]:
    """Emit the structured completion log and shape the summary payload.

    The error list is truncated to the first 20 entries to keep the
    returned payload bounded.
    """
    elapsedMs = int((time.time() - startMs) * 1000)

    logFields = {
        "event": "ingestion.connection.bootstrap.done",
        "part": "gdrive",
        "connectionId": connectionId,
        "indexed": result.indexed,
        "skippedDup": result.skippedDuplicate,
        "skippedPolicy": result.skippedPolicy,
        "failed": result.failed,
        "bytes": result.bytesProcessed,
        "durationMs": elapsedMs,
    }
    logger.info(
        "ingestion.connection.bootstrap.done part=gdrive connectionId=%s indexed=%d skippedDup=%d skippedPolicy=%d failed=%d bytes=%d durationMs=%d",
        connectionId,
        result.indexed, result.skippedDuplicate, result.skippedPolicy,
        result.failed, result.bytesProcessed, elapsedMs,
        extra=logFields,
    )

    summary: Dict[str, Any] = {
        "connectionId": result.connectionId,
        "indexed": result.indexed,
        "skippedDuplicate": result.skippedDuplicate,
        "skippedPolicy": result.skippedPolicy,
        "failed": result.failed,
        "bytesProcessed": result.bytesProcessed,
        "durationMs": elapsedMs,
        "errors": result.errors[:20],
    }
    return summary
|