# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
"""Google Drive bootstrap for the unified knowledge ingestion lane.

Mirrors the SharePoint pilot (see subConnectorSyncSharepoint.py). Walks the
user's *My Drive* tree from the virtual `root` folder, downloads each
file-like item via `DriveAdapter.download` (which handles native Google docs
via export), runs the standard extraction pipeline and routes results through
`KnowledgeService.requestIngestion` with `sourceKind="gdrive_item"` and
`contentVersion = modifiedTime` (monotonic per-revision).
"""

from __future__ import annotations

import asyncio
import hashlib
import logging
import time
from dataclasses import dataclass, field
from datetime import datetime, timedelta, timezone
from typing import Any, Callable, Dict, List, Optional

from modules.datamodels.datamodelExtraction import ExtractionOptions

logger = logging.getLogger(__name__)

# Conservative default ceilings for a single bootstrap run.
MAX_ITEMS_DEFAULT = 500
MAX_BYTES_DEFAULT = 200 * 1024 * 1024
MAX_FILE_SIZE_DEFAULT = 25 * 1024 * 1024
SKIP_MIME_PREFIXES_DEFAULT = ("video/", "audio/")
MAX_DEPTH_DEFAULT = 4
MAX_AGE_DAYS_DEFAULT = 365

# Google Drive's marker MIME type for folders.
FOLDER_MIME = "application/vnd.google-apps.folder"


@dataclass
class GdriveBootstrapLimits:
    """Policy knobs bounding one bootstrap run (items, bytes, depth, age)."""

    maxItems: int = MAX_ITEMS_DEFAULT          # stop after this many indexed + duplicate items
    maxBytes: int = MAX_BYTES_DEFAULT          # total downloaded-bytes budget for the run
    maxFileSize: int = MAX_FILE_SIZE_DEFAULT   # per-file size cap (checked pre- and post-download)
    skipMimePrefixes: tuple = SKIP_MIME_PREFIXES_DEFAULT  # MIME prefixes excluded by policy
    maxDepth: int = MAX_DEPTH_DEFAULT          # folder recursion depth limit
    maxAgeDays: Optional[int] = MAX_AGE_DAYS_DEFAULT  # None/0 disables the recency filter
    neutralize: bool = False                   # per-DataSource neutralization policy flag


@dataclass
class GdriveBootstrapResult:
    """Mutable counters accumulated while walking and ingesting."""

    connectionId: str
    indexed: int = 0
    skippedDuplicate: int = 0
    skippedPolicy: int = 0
    failed: int = 0
    bytesProcessed: int = 0
    errors: List[str] = field(default_factory=list)


def _syntheticFileId(connectionId: str, externalItemId: str) -> str:
    """Derive a stable, compact sourceId from connection + Drive item id."""
    token = hashlib.sha256(f"{connectionId}:{externalItemId}".encode("utf-8")).hexdigest()[:16]
    return f"gd:{connectionId[:8]}:{token}"


def _toContentObjects(extracted, fileName: str) -> List[Dict[str, Any]]:
    """Map extraction parts to the ingestion content-object dict shape.

    Parts with empty/whitespace-only data are dropped. `typeGroup` is folded
    into the coarse contentType: image -> "image", binary/container ->
    "other", everything else -> "text".
    """
    parts = getattr(extracted, "parts", None) or []
    out: List[Dict[str, Any]] = []
    for part in parts:
        data = getattr(part, "data", None) or ""
        if not data or not str(data).strip():
            continue
        typeGroup = getattr(part, "typeGroup", "text") or "text"
        contentType = "text"
        if typeGroup == "image":
            contentType = "image"
        elif typeGroup in ("binary", "container"):
            contentType = "other"
        out.append({
            "contentObjectId": getattr(part, "id", ""),
            "contentType": contentType,
            "data": data,
            "contextRef": {
                "containerPath": fileName,
                "location": getattr(part, "label", None) or "file",
                **(getattr(part, "metadata", None) or {}),
            },
        })
    return out


def _isRecent(modifiedIso: Optional[str], maxAgeDays: Optional[int]) -> bool:
    """True if the ISO timestamp is within maxAgeDays of now (UTC).

    Fails open: missing filter, missing timestamp, or an unparseable
    timestamp all count as recent so items are not silently dropped.
    """
    if not maxAgeDays:
        return True
    if not modifiedIso:
        return True
    try:
        # Drive emits RFC 3339 with a trailing "Z"; fromisoformat needs +00:00.
        ts = datetime.fromisoformat(modifiedIso.replace("Z", "+00:00"))
    except Exception:
        return True
    cutoff = datetime.now(timezone.utc) - timedelta(days=maxAgeDays)
    if ts.tzinfo is None:
        ts = ts.replace(tzinfo=timezone.utc)
    return ts >= cutoff


async def bootstrapGdrive(
    connectionId: str,
    *,
    dataSources: Optional[List[Dict[str, Any]]] = None,
    progressCb: Optional[Any] = None,
    adapter: Any = None,
    connection: Any = None,
    knowledgeService: Any = None,
    limits: Optional[GdriveBootstrapLimits] = None,
    runExtractionFn: Optional[Callable[..., Any]] = None,
) -> Dict[str, Any]:
    """Walk My Drive starting from the virtual root folder.

    Iterates only over explicitly provided dataSources (ragIndexEnabled=true).
    Each DataSource defines the root path + neutralize policy for its subtree.
    """
    if not dataSources:
        return {"connectionId": connectionId, "skipped": True, "reason": "no_datasources"}
    if not limits:
        limits = GdriveBootstrapLimits()

    startMs = time.time()  # wall-clock start in seconds; converted to ms in _finalizeResult
    result = GdriveBootstrapResult(connectionId=connectionId)
    logger.info(
        "ingestion.connection.bootstrap.started part=gdrive connectionId=%s dataSources=%d",
        connectionId,
        len(dataSources),
        extra={
            "event": "ingestion.connection.bootstrap.started",
            "part": "gdrive",
            "connectionId": connectionId,
            "dataSourceCount": len(dataSources),
        },
    )

    # Dependencies can be injected (tests) or resolved from the connection id.
    if adapter is None or knowledgeService is None or connection is None:
        adapter, connection, knowledgeService = await _resolveDependencies(connectionId)

    if runExtractionFn is None:
        from modules.serviceCenter.services.serviceExtraction.subPipeline import runExtraction
        from modules.serviceCenter.services.serviceExtraction.subRegistry import (
            ExtractorRegistry,
            ChunkerRegistry,
        )

        extractorRegistry = ExtractorRegistry()
        chunkerRegistry = ChunkerRegistry()

        def runExtractionFn(bytesData, name, mime, options):  # type: ignore[no-redef]
            return runExtraction(extractorRegistry, chunkerRegistry, bytesData, name, mime, options)

    mandateId = str(getattr(connection, "mandateId", "") or "") if connection is not None else ""
    userId = str(getattr(connection, "userId", "") or "") if connection is not None else ""

    cancelled = False
    for ds in dataSources:
        if result.indexed + result.skippedDuplicate >= limits.maxItems:
            break
        if progressCb and hasattr(progressCb, "isCancelled") and progressCb.isCancelled():
            cancelled = True
            break
        dsPath = ds.get("path", "/")
        dsId = ds.get("id", "")
        dsNeutralize = ds.get("neutralize", False)
        dsMaxAgeDays = ds.get("maxAgeDays", limits.maxAgeDays)
        # Per-DataSource limits: global ceilings, but DS-specific age/neutralize.
        dsLimits = GdriveBootstrapLimits(
            maxItems=limits.maxItems,
            maxBytes=limits.maxBytes,
            maxFileSize=limits.maxFileSize,
            skipMimePrefixes=limits.skipMimePrefixes,
            maxDepth=limits.maxDepth,
            maxAgeDays=dsMaxAgeDays,
            neutralize=dsNeutralize,
        )
        try:
            await _walkFolder(
                adapter=adapter,
                knowledgeService=knowledgeService,
                runExtractionFn=runExtractionFn,
                connectionId=connectionId,
                mandateId=mandateId,
                userId=userId,
                folderPath=dsPath,
                depth=0,
                limits=dsLimits,
                result=result,
                progressCb=progressCb,
                dataSourceId=dsId,
            )
        except Exception as exc:
            # One failed subtree must not abort the other DataSources.
            logger.error("gdrive walk failed for ds %s path %s: %s", dsId, dsPath, exc, exc_info=True)
            result.errors.append(f"walk({dsPath}): {exc}")

    finalResult = _finalizeResult(connectionId, result, startMs)
    if cancelled:
        finalResult["cancelled"] = True
    return finalResult


async def _resolveDependencies(connectionId: str):
    """Resolve (adapter, connection, knowledgeService) for a connection id.

    Raises:
        ValueError: if the connection record or a valid access token is missing.
    """
    from modules.interfaces.interfaceDbApp import getRootInterface
    from modules.auth import TokenManager
    from modules.connectors.providerGoogle.connectorGoogle import GoogleConnector
    from modules.serviceCenter import getService
    from modules.serviceCenter.context import ServiceCenterContext
    from modules.security.rootAccess import getRootUser

    rootInterface = getRootInterface()
    connection = rootInterface.getUserConnectionById(connectionId)
    if connection is None:
        raise ValueError(f"UserConnection not found: {connectionId}")
    token = TokenManager().getFreshToken(connectionId)
    if not token or not token.tokenAccess:
        raise ValueError(f"No valid token for connection {connectionId}")
    provider = GoogleConnector(connection, token.tokenAccess)
    adapter = provider.getServiceAdapter("drive")
    rootUser = getRootUser()
    ctx = ServiceCenterContext(
        user=rootUser,
        mandate_id=str(getattr(connection, "mandateId", "") or ""),
    )
    knowledgeService = getService("knowledge", ctx)
    return adapter, connection, knowledgeService


async def _walkFolder(
    *,
    adapter,
    knowledgeService,
    runExtractionFn,
    connectionId: str,
    mandateId: str,
    userId: str,
    folderPath: str,
    depth: int,
    limits: GdriveBootstrapLimits,
    result: GdriveBootstrapResult,
    progressCb: Optional[Any],
    dataSourceId: str = "",
) -> None:
    """Recursively browse one folder, applying policy filters per entry.

    Policy skips (MIME prefix, size, age) bump skippedPolicy; browse failures
    are recorded and abort only this subtree.
    """
    if depth > limits.maxDepth:
        return
    if progressCb and hasattr(progressCb, "isCancelled") and progressCb.isCancelled():
        return
    try:
        entries = await adapter.browse(folderPath)
    except Exception as exc:
        logger.warning("gdrive browse %s failed: %s", folderPath, exc)
        result.errors.append(f"browse({folderPath}): {exc}")
        return

    for entry in entries:
        if result.indexed + result.skippedDuplicate >= limits.maxItems:
            return
        if result.bytesProcessed >= limits.maxBytes:
            return
        # Cheap periodic cancellation probe (every 50 processed items).
        if progressCb and hasattr(progressCb, "isCancelled") and (result.indexed + result.skippedDuplicate) % 50 == 0 and progressCb.isCancelled():
            return

        entryPath = getattr(entry, "path", "") or ""
        metadata = getattr(entry, "metadata", {}) or {}
        mimeType = getattr(entry, "mimeType", None) or metadata.get("mimeType")

        if getattr(entry, "isFolder", False) or mimeType == FOLDER_MIME:
            await _walkFolder(
                adapter=adapter,
                knowledgeService=knowledgeService,
                runExtractionFn=runExtractionFn,
                connectionId=connectionId,
                mandateId=mandateId,
                userId=userId,
                folderPath=entryPath,
                depth=depth + 1,
                limits=limits,
                result=result,
                progressCb=progressCb,
                dataSourceId=dataSourceId,
            )
            continue

        effectiveMime = mimeType or "application/octet-stream"
        if any(effectiveMime.startswith(prefix) for prefix in limits.skipMimePrefixes):
            result.skippedPolicy += 1
            continue
        size = int(getattr(entry, "size", 0) or 0)
        # Pre-download size check; native Google docs may report size 0 and
        # are re-checked after export in _ingestOne.
        if size and size > limits.maxFileSize:
            result.skippedPolicy += 1
            continue
        modifiedTime = metadata.get("modifiedTime")
        if not _isRecent(modifiedTime, limits.maxAgeDays):
            result.skippedPolicy += 1
            continue

        externalItemId = metadata.get("id") or entryPath
        revision = modifiedTime  # contentVersion: monotonic per-revision
        await _ingestOne(
            adapter=adapter,
            knowledgeService=knowledgeService,
            runExtractionFn=runExtractionFn,
            connectionId=connectionId,
            mandateId=mandateId,
            userId=userId,
            entry=entry,
            entryPath=entryPath,
            mimeType=effectiveMime,
            externalItemId=externalItemId,
            revision=revision,
            limits=limits,
            result=result,
            progressCb=progressCb,
            dataSourceId=dataSourceId,
        )


async def _ingestOne(
    *,
    adapter,
    knowledgeService,
    runExtractionFn,
    connectionId: str,
    mandateId: str,
    userId: str,
    entry,
    entryPath: str,
    mimeType: str,
    externalItemId: str,
    revision: Optional[str],
    limits: GdriveBootstrapLimits,
    result: GdriveBootstrapResult,
    progressCb: Optional[Any],
    dataSourceId: str = "",
) -> None:
    """Download, extract and ingest a single file-like Drive entry.

    Failures at each stage (download, extraction, ingestion) are recorded in
    `result` and never raised to the walker.
    """
    from modules.serviceCenter.services.serviceKnowledge.mainServiceKnowledge import IngestionJob

    syntheticFileId = _syntheticFileId(connectionId, externalItemId)
    fileName = getattr(entry, "name", "") or externalItemId

    try:
        downloaded = await adapter.download(entryPath)
    except Exception as exc:
        logger.warning("gdrive download %s failed: %s", entryPath, exc)
        result.failed += 1
        result.errors.append(f"download({entryPath}): {exc}")
        return

    # Adapter may return raw bytes or a richer object with data/mimeType
    # (exported native Google docs change MIME on export).
    fileBytes: bytes
    if isinstance(downloaded, (bytes, bytearray)):
        fileBytes = bytes(downloaded)
    else:
        fileBytes = bytes(getattr(downloaded, "data", b"") or b"")
        if getattr(downloaded, "mimeType", None):
            mimeType = downloaded.mimeType

    if not fileBytes:
        result.failed += 1
        return
    # Post-download size check covers exports whose listed size was 0/absent.
    if len(fileBytes) > limits.maxFileSize:
        result.skippedPolicy += 1
        return
    result.bytesProcessed += len(fileBytes)

    try:
        extracted = runExtractionFn(
            fileBytes,
            fileName,
            mimeType,
            ExtractionOptions(mergeStrategy=None),
        )
    except Exception as exc:
        logger.warning("gdrive extraction %s failed: %s", entryPath, exc)
        result.failed += 1
        result.errors.append(f"extract({entryPath}): {exc}")
        return

    contentObjects = _toContentObjects(extracted, fileName)
    if not contentObjects:
        # Nothing extractable (e.g. empty doc) counts as a policy skip.
        result.skippedPolicy += 1
        return

    provenance: Dict[str, Any] = {
        "connectionId": connectionId,
        "dataSourceId": dataSourceId,
        "authority": "google",
        "service": "drive",
        "externalItemId": externalItemId,
        "entryPath": entryPath,
        "tier": "body",
    }

    try:
        handle = await knowledgeService.requestIngestion(
            IngestionJob(
                sourceKind="gdrive_item",
                sourceId=syntheticFileId,
                fileName=fileName,
                mimeType=mimeType,
                userId=userId,
                mandateId=mandateId,
                contentObjects=contentObjects,
                contentVersion=revision,
                neutralize=limits.neutralize,
                provenance=provenance,
            )
        )
    except Exception as exc:
        logger.error("gdrive ingestion %s failed: %s", entryPath, exc, exc_info=True)
        result.failed += 1
        result.errors.append(f"ingest({entryPath}): {exc}")
        return

    if handle.status == "duplicate":
        result.skippedDuplicate += 1
    elif handle.status == "indexed":
        result.indexed += 1
    else:
        result.failed += 1
        if handle.error:
            result.errors.append(f"ingest({entryPath}): {handle.error}")

    # Progress report every 50 processed items; callback errors are ignored.
    if progressCb is not None and (result.indexed + result.skippedDuplicate) % 50 == 0:
        processed = result.indexed + result.skippedDuplicate
        try:
            progressCb(
                min(90, 10 + int(80 * processed / max(1, limits.maxItems))),
                f"gdrive processed={processed}",
            )
        except Exception:
            pass
        logger.info(
            "ingestion.connection.bootstrap.progress part=gdrive processed=%d skippedDup=%d failed=%d",
            processed,
            result.skippedDuplicate,
            result.failed,
            extra={
                "event": "ingestion.connection.bootstrap.progress",
                "part": "gdrive",
                "connectionId": connectionId,
                "processed": processed,
                "skippedDup": result.skippedDuplicate,
                "failed": result.failed,
            },
        )

    # Yield to the event loop between items so long walks stay cooperative.
    await asyncio.sleep(0)


def _finalizeResult(connectionId: str, result: GdriveBootstrapResult, startMs: float) -> Dict[str, Any]:
    """Log the completion summary and build the caller-facing result dict.

    `startMs` is a time.time() value in seconds (despite the name); duration
    is converted to milliseconds here. Errors are truncated to 20 entries.
    """
    durationMs = int((time.time() - startMs) * 1000)
    logger.info(
        "ingestion.connection.bootstrap.done part=gdrive connectionId=%s indexed=%d skippedDup=%d skippedPolicy=%d failed=%d bytes=%d durationMs=%d",
        connectionId,
        result.indexed,
        result.skippedDuplicate,
        result.skippedPolicy,
        result.failed,
        result.bytesProcessed,
        durationMs,
        extra={
            "event": "ingestion.connection.bootstrap.done",
            "part": "gdrive",
            "connectionId": connectionId,
            "indexed": result.indexed,
            "skippedDup": result.skippedDuplicate,
            "skippedPolicy": result.skippedPolicy,
            "failed": result.failed,
            "bytes": result.bytesProcessed,
            "durationMs": durationMs,
        },
    )
    return {
        "connectionId": result.connectionId,
        "indexed": result.indexed,
        "skippedDuplicate": result.skippedDuplicate,
        "skippedPolicy": result.skippedPolicy,
        "failed": result.failed,
        "bytesProcessed": result.bytesProcessed,
        "durationMs": durationMs,
        "errors": result.errors[:20],
    }