# Copyright (c) 2025 Patrick Motsch # All rights reserved. """Google Drive bootstrap for the unified knowledge ingestion lane. Mirrors the SharePoint pilot (see subConnectorSyncSharepoint.py). Walks the user's *My Drive* tree from the virtual `root` folder, downloads each file-like item via `DriveAdapter.download` (which handles native Google docs via export), runs the standard extraction pipeline and routes results through `KnowledgeService.requestIngestion` with `sourceKind="gdrive_item"` and `contentVersion = modifiedTime` (monotonic per-revision). """ from __future__ import annotations import hashlib import logging import time from dataclasses import dataclass, field from datetime import datetime, timedelta, timezone from typing import Any, Callable, Dict, List, Optional from modules.datamodels.datamodelExtraction import ExtractionOptions logger = logging.getLogger(__name__) MAX_ITEMS_DEFAULT = 500 MAX_BYTES_DEFAULT = 200 * 1024 * 1024 MAX_FILE_SIZE_DEFAULT = 25 * 1024 * 1024 SKIP_MIME_PREFIXES_DEFAULT = ("video/", "audio/") MAX_DEPTH_DEFAULT = 4 MAX_AGE_DAYS_DEFAULT = 365 # Google Drive uses virtual mime-types for folders and non-downloadable assets. FOLDER_MIME = "application/vnd.google-apps.folder" @dataclass class GdriveBootstrapLimits: maxItems: int = MAX_ITEMS_DEFAULT maxBytes: int = MAX_BYTES_DEFAULT maxFileSize: int = MAX_FILE_SIZE_DEFAULT skipMimePrefixes: tuple = SKIP_MIME_PREFIXES_DEFAULT maxDepth: int = MAX_DEPTH_DEFAULT # Only ingest files modified within the last N days. None disables filter. maxAgeDays: Optional[int] = MAX_AGE_DAYS_DEFAULT # Pass-through to IngestionJob.neutralize neutralize: bool = False # Whether to skip binary/non-text files filesIndexBinaries: bool = True @dataclass class GdriveBootstrapResult: connectionId: str indexed: int = 0 skippedDuplicate: int = 0 skippedPolicy: int = 0 failed: int = 0 bytesProcessed: int = 0 errors: List[str] = field(default_factory=list) def _syntheticFileId(connectionId: str, externalItemId: str) -> str: token = hashlib.sha256(f"{connectionId}:{externalItemId}".encode("utf-8")).hexdigest()[:16] return f"gd:{connectionId[:8]}:{token}" def _toContentObjects(extracted, fileName: str) -> List[Dict[str, Any]]: parts = getattr(extracted, "parts", None) or [] out: List[Dict[str, Any]] = [] for part in parts: data = getattr(part, "data", None) or "" if not data or not str(data).strip(): continue typeGroup = getattr(part, "typeGroup", "text") or "text" contentType = "text" if typeGroup == "image": contentType = "image" elif typeGroup in ("binary", "container"): contentType = "other" out.append({ "contentObjectId": getattr(part, "id", ""), "contentType": contentType, "data": data, "contextRef": { "containerPath": fileName, "location": getattr(part, "label", None) or "file", **(getattr(part, "metadata", None) or {}), }, }) return out def _isRecent(modifiedIso: Optional[str], maxAgeDays: Optional[int]) -> bool: if not maxAgeDays: return True if not modifiedIso: # No timestamp -> be permissive (Drive native docs sometimes omit it on export). return True try: # Google returns RFC 3339 with `Z` or offset; python 3.11+ parses both. ts = datetime.fromisoformat(modifiedIso.replace("Z", "+00:00")) except Exception: return True cutoff = datetime.now(timezone.utc) - timedelta(days=maxAgeDays) if ts.tzinfo is None: ts = ts.replace(tzinfo=timezone.utc) return ts >= cutoff async def bootstrapGdrive( connectionId: str, *, progressCb: Optional[Callable[[int, Optional[str]], None]] = None, adapter: Any = None, connection: Any = None, knowledgeService: Any = None, limits: Optional[GdriveBootstrapLimits] = None, runExtractionFn: Optional[Callable[..., Any]] = None, ) -> Dict[str, Any]: """Walk My Drive starting from the virtual root folder.""" from modules.serviceCenter.services.serviceKnowledge.subConnectorPrefs import loadConnectionPrefs prefs = loadConnectionPrefs(connectionId) if not limits: limits = GdriveBootstrapLimits( maxAgeDays=prefs.maxAgeDays if prefs.maxAgeDays > 0 else None, neutralize=prefs.neutralizeBeforeEmbed, filesIndexBinaries=prefs.filesIndexBinaries, ) startMs = time.time() result = GdriveBootstrapResult(connectionId=connectionId) logger.info( "ingestion.connection.bootstrap.started part=gdrive connectionId=%s", connectionId, extra={ "event": "ingestion.connection.bootstrap.started", "part": "gdrive", "connectionId": connectionId, }, ) if adapter is None or knowledgeService is None or connection is None: adapter, connection, knowledgeService = await _resolveDependencies(connectionId) if runExtractionFn is None: from modules.serviceCenter.services.serviceExtraction.subPipeline import runExtraction from modules.serviceCenter.services.serviceExtraction.subRegistry import ( ExtractorRegistry, ChunkerRegistry, ) extractorRegistry = ExtractorRegistry() chunkerRegistry = ChunkerRegistry() def runExtractionFn(bytesData, name, mime, options): # type: ignore[no-redef] return runExtraction(extractorRegistry, chunkerRegistry, bytesData, name, mime, options) mandateId = str(getattr(connection, "mandateId", "") or "") if connection is not None else "" userId = str(getattr(connection, "userId", "") or "") if connection is not None else "" try: await _walkFolder( adapter=adapter, knowledgeService=knowledgeService, runExtractionFn=runExtractionFn, connectionId=connectionId, mandateId=mandateId, userId=userId, folderPath="/", # DriveAdapter.browse maps "" / "/" -> "root" depth=0, limits=limits, result=result, progressCb=progressCb, ) except Exception as exc: logger.error("gdrive walk failed for %s: %s", connectionId, exc, exc_info=True) result.errors.append(f"walk: {exc}") return _finalizeResult(connectionId, result, startMs) async def _resolveDependencies(connectionId: str): from modules.interfaces.interfaceDbApp import getRootInterface from modules.auth import TokenManager from modules.connectors.providerGoogle.connectorGoogle import GoogleConnector from modules.serviceCenter import getService from modules.serviceCenter.context import ServiceCenterContext from modules.security.rootAccess import getRootUser rootInterface = getRootInterface() connection = rootInterface.getUserConnectionById(connectionId) if connection is None: raise ValueError(f"UserConnection not found: {connectionId}") token = TokenManager().getFreshToken(connectionId) if not token or not token.tokenAccess: raise ValueError(f"No valid token for connection {connectionId}") provider = GoogleConnector(connection, token.tokenAccess) adapter = provider.getServiceAdapter("drive") rootUser = getRootUser() ctx = ServiceCenterContext( user=rootUser, mandate_id=str(getattr(connection, "mandateId", "") or ""), ) knowledgeService = getService("knowledge", ctx) return adapter, connection, knowledgeService async def _walkFolder( *, adapter, knowledgeService, runExtractionFn, connectionId: str, mandateId: str, userId: str, folderPath: str, depth: int, limits: GdriveBootstrapLimits, result: GdriveBootstrapResult, progressCb: Optional[Callable[[int, Optional[str]], None]], ) -> None: if depth > limits.maxDepth: return try: entries = await adapter.browse(folderPath) except Exception as exc: logger.warning("gdrive browse %s failed: %s", folderPath, exc) result.errors.append(f"browse({folderPath}): {exc}") return for entry in entries: if result.indexed + result.skippedDuplicate >= limits.maxItems: return if result.bytesProcessed >= limits.maxBytes: return entryPath = getattr(entry, "path", "") or "" metadata = getattr(entry, "metadata", {}) or {} mimeType = getattr(entry, "mimeType", None) or metadata.get("mimeType") if getattr(entry, "isFolder", False) or mimeType == FOLDER_MIME: await _walkFolder( adapter=adapter, knowledgeService=knowledgeService, runExtractionFn=runExtractionFn, connectionId=connectionId, mandateId=mandateId, userId=userId, folderPath=entryPath, depth=depth + 1, limits=limits, result=result, progressCb=progressCb, ) continue effectiveMime = mimeType or "application/octet-stream" if any(effectiveMime.startswith(prefix) for prefix in limits.skipMimePrefixes): result.skippedPolicy += 1 continue size = int(getattr(entry, "size", 0) or 0) if size and size > limits.maxFileSize: result.skippedPolicy += 1 continue modifiedTime = metadata.get("modifiedTime") if not _isRecent(modifiedTime, limits.maxAgeDays): result.skippedPolicy += 1 continue externalItemId = metadata.get("id") or entryPath revision = modifiedTime await _ingestOne( adapter=adapter, knowledgeService=knowledgeService, runExtractionFn=runExtractionFn, connectionId=connectionId, mandateId=mandateId, userId=userId, entry=entry, entryPath=entryPath, mimeType=effectiveMime, externalItemId=externalItemId, revision=revision, limits=limits, result=result, progressCb=progressCb, ) async def _ingestOne( *, adapter, knowledgeService, runExtractionFn, connectionId: str, mandateId: str, userId: str, entry, entryPath: str, mimeType: str, externalItemId: str, revision: Optional[str], limits: GdriveBootstrapLimits, result: GdriveBootstrapResult, progressCb: Optional[Callable[[int, Optional[str]], None]], ) -> None: from modules.serviceCenter.services.serviceKnowledge.mainServiceKnowledge import IngestionJob syntheticFileId = _syntheticFileId(connectionId, externalItemId) fileName = getattr(entry, "name", "") or externalItemId try: downloaded = await adapter.download(entryPath) except Exception as exc: logger.warning("gdrive download %s failed: %s", entryPath, exc) result.failed += 1 result.errors.append(f"download({entryPath}): {exc}") return # Adapter.download returns raw bytes today; guard DownloadResult shape too. fileBytes: bytes if isinstance(downloaded, (bytes, bytearray)): fileBytes = bytes(downloaded) else: fileBytes = bytes(getattr(downloaded, "data", b"") or b"") if getattr(downloaded, "mimeType", None): mimeType = downloaded.mimeType # export may have changed the type if not fileBytes: result.failed += 1 return if len(fileBytes) > limits.maxFileSize: result.skippedPolicy += 1 return result.bytesProcessed += len(fileBytes) try: extracted = runExtractionFn( fileBytes, fileName, mimeType, ExtractionOptions(mergeStrategy=None), ) except Exception as exc: logger.warning("gdrive extraction %s failed: %s", entryPath, exc) result.failed += 1 result.errors.append(f"extract({entryPath}): {exc}") return contentObjects = _toContentObjects(extracted, fileName) if not contentObjects: result.skippedPolicy += 1 return try: handle = await knowledgeService.requestIngestion( IngestionJob( sourceKind="gdrive_item", sourceId=syntheticFileId, fileName=fileName, mimeType=mimeType, userId=userId, mandateId=mandateId, contentObjects=contentObjects, contentVersion=revision, neutralize=limits.neutralize, provenance={ "connectionId": connectionId, "authority": "google", "service": "drive", "externalItemId": externalItemId, "entryPath": entryPath, "tier": "body", }, ) ) except Exception as exc: logger.error("gdrive ingestion %s failed: %s", entryPath, exc, exc_info=True) result.failed += 1 result.errors.append(f"ingest({entryPath}): {exc}") return if handle.status == "duplicate": result.skippedDuplicate += 1 elif handle.status == "indexed": result.indexed += 1 else: result.failed += 1 if progressCb is not None and (result.indexed + result.skippedDuplicate) % 50 == 0: processed = result.indexed + result.skippedDuplicate try: progressCb( min(90, 10 + int(80 * processed / max(1, limits.maxItems))), f"gdrive processed={processed}", ) except Exception: pass logger.info( "ingestion.connection.bootstrap.progress part=gdrive processed=%d skippedDup=%d failed=%d", processed, result.skippedDuplicate, result.failed, extra={ "event": "ingestion.connection.bootstrap.progress", "part": "gdrive", "connectionId": connectionId, "processed": processed, "skippedDup": result.skippedDuplicate, "failed": result.failed, }, ) def _finalizeResult(connectionId: str, result: GdriveBootstrapResult, startMs: float) -> Dict[str, Any]: durationMs = int((time.time() - startMs) * 1000) logger.info( "ingestion.connection.bootstrap.done part=gdrive connectionId=%s indexed=%d skippedDup=%d skippedPolicy=%d failed=%d bytes=%d durationMs=%d", connectionId, result.indexed, result.skippedDuplicate, result.skippedPolicy, result.failed, result.bytesProcessed, durationMs, extra={ "event": "ingestion.connection.bootstrap.done", "part": "gdrive", "connectionId": connectionId, "indexed": result.indexed, "skippedDup": result.skippedDuplicate, "skippedPolicy": result.skippedPolicy, "failed": result.failed, "bytes": result.bytesProcessed, "durationMs": durationMs, }, ) return { "connectionId": result.connectionId, "indexed": result.indexed, "skippedDuplicate": result.skippedDuplicate, "skippedPolicy": result.skippedPolicy, "failed": result.failed, "bytesProcessed": result.bytesProcessed, "durationMs": durationMs, "errors": result.errors[:20], }