# Copyright (c) 2025 Patrick Motsch
|
|
# All rights reserved.
|
|
"""Google Drive bootstrap for the unified knowledge ingestion lane.
|
|
|
|
Mirrors the SharePoint pilot (see subConnectorSyncSharepoint.py). Walks the
|
|
user's *My Drive* tree from the virtual `root` folder, downloads each
|
|
file-like item via `DriveAdapter.download` (which handles native Google docs
|
|
via export), runs the standard extraction pipeline and routes results through
|
|
`KnowledgeService.requestIngestion` with `sourceKind="gdrive_item"` and
|
|
`contentVersion = modifiedTime` (monotonic per-revision).
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import hashlib
|
|
import logging
|
|
import time
|
|
from dataclasses import dataclass, field
|
|
from datetime import datetime, timedelta, timezone
|
|
from typing import Any, Callable, Dict, List, Optional
|
|
|
|
from modules.datamodels.datamodelExtraction import ExtractionOptions
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
# Default budgets for one bootstrap run; overridable via GdriveBootstrapLimits.
MAX_ITEMS_DEFAULT = 500
# Total download budget per run: 200 MiB.
MAX_BYTES_DEFAULT = 200 * 1024 * 1024
# Per-file size ceiling: 25 MiB.
MAX_FILE_SIZE_DEFAULT = 25 * 1024 * 1024
# Media files are skipped outright by mime prefix (see _walkFolder).
SKIP_MIME_PREFIXES_DEFAULT = ("video/", "audio/")
# Maximum folder recursion depth below the virtual Drive root.
MAX_DEPTH_DEFAULT = 4
# Default age window (days) for the modifiedTime filter.
MAX_AGE_DAYS_DEFAULT = 365

# Google Drive uses virtual mime-types for folders and non-downloadable assets.
FOLDER_MIME = "application/vnd.google-apps.folder"
|
|
|
|
|
|
@dataclass
class GdriveBootstrapLimits:
    """Budgets and policy switches governing a single Drive bootstrap run."""

    # Cap on processed items (indexed + duplicates) across the whole walk.
    maxItems: int = MAX_ITEMS_DEFAULT
    # Cap on total downloaded bytes across the whole walk.
    maxBytes: int = MAX_BYTES_DEFAULT
    # Per-file ceiling; checked against both the listed size and the payload.
    maxFileSize: int = MAX_FILE_SIZE_DEFAULT
    # Mime-type prefixes skipped outright (e.g. "video/", "audio/").
    skipMimePrefixes: tuple[str, ...] = SKIP_MIME_PREFIXES_DEFAULT
    # Maximum folder recursion depth below the virtual root.
    maxDepth: int = MAX_DEPTH_DEFAULT
    # Only ingest files modified within the last N days. None disables filter.
    maxAgeDays: Optional[int] = MAX_AGE_DAYS_DEFAULT
    # Pass-through to IngestionJob.neutralize
    neutralize: bool = False
    # Whether to skip binary/non-text files
    # NOTE(review): set from prefs but not referenced elsewhere in this module
    # — presumably consumed downstream; confirm it is not dead config.
    filesIndexBinaries: bool = True
|
|
|
|
|
|
@dataclass
class GdriveBootstrapResult:
    """Mutable counters accumulated while walking one connection's Drive."""

    # Connection this run belongs to; echoed into the summary payload.
    connectionId: str
    # Items newly indexed by the knowledge service.
    indexed: int = 0
    # Items the knowledge service reported as already-known duplicates.
    skippedDuplicate: int = 0
    # Items filtered out by policy (mime/size/age/empty extraction).
    skippedPolicy: int = 0
    # Items that errored during download/extract/ingest.
    failed: int = 0
    # Total bytes of downloaded payloads that reached extraction.
    bytesProcessed: int = 0
    # Human-readable error strings (truncated to 20 in the final summary).
    errors: List[str] = field(default_factory=list)
|
|
|
|
|
|
def _syntheticFileId(connectionId: str, externalItemId: str) -> str:
    """Build a stable synthetic file id for a Drive item.

    The id carries a readable connection prefix plus a truncated SHA-256
    digest of ``connectionId:externalItemId`` for collision resistance.
    """
    seed = ":".join((connectionId, externalItemId)).encode("utf-8")
    digest = hashlib.sha256(seed).hexdigest()
    return "gd:{}:{}".format(connectionId[:8], digest[:16])
|
|
|
|
|
|
def _toContentObjects(extracted, fileName: str) -> List[Dict[str, Any]]:
    """Convert extraction parts into the content-object dicts ingestion expects.

    Parts with empty or whitespace-only data are dropped. ``typeGroup`` is
    folded into coarse buckets: image -> "image", binary/container ->
    "other", everything else -> "text". Part metadata is merged into the
    context reference on top of the container path and location.
    """
    objects: List[Dict[str, Any]] = []
    for part in (getattr(extracted, "parts", None) or []):
        data = getattr(part, "data", None) or ""
        if not str(data).strip():
            continue

        group = getattr(part, "typeGroup", "text") or "text"
        if group == "image":
            kind = "image"
        elif group in ("binary", "container"):
            kind = "other"
        else:
            kind = "text"

        contextRef: Dict[str, Any] = {
            "containerPath": fileName,
            "location": getattr(part, "label", None) or "file",
        }
        contextRef.update(getattr(part, "metadata", None) or {})

        objects.append({
            "contentObjectId": getattr(part, "id", ""),
            "contentType": kind,
            "data": data,
            "contextRef": contextRef,
        })
    return objects
|
|
|
|
|
|
def _isRecent(modifiedIso: Optional[str], maxAgeDays: Optional[int]) -> bool:
    """Return True when the RFC 3339 timestamp falls inside the age window.

    Permissive by design: a disabled filter (None or 0 days), a missing
    timestamp (Drive native docs sometimes omit it on export), or an
    unparseable timestamp all count as "recent".
    """
    if not maxAgeDays or not modifiedIso:
        return True
    try:
        # Google returns RFC 3339 with `Z` or an explicit offset; normalize Z.
        parsed = datetime.fromisoformat(modifiedIso.replace("Z", "+00:00"))
    except Exception:
        # Don't drop a file over a formatting quirk.
        return True
    if parsed.tzinfo is None:
        # Naive timestamps are assumed to be UTC.
        parsed = parsed.replace(tzinfo=timezone.utc)
    cutoff = datetime.now(timezone.utc) - timedelta(days=maxAgeDays)
    return parsed >= cutoff
|
|
|
|
|
|
async def bootstrapGdrive(
    connectionId: str,
    *,
    progressCb: Optional[Callable[[int, Optional[str]], None]] = None,
    adapter: Any = None,
    connection: Any = None,
    knowledgeService: Any = None,
    limits: Optional[GdriveBootstrapLimits] = None,
    runExtractionFn: Optional[Callable[..., Any]] = None,
) -> Dict[str, Any]:
    """Walk My Drive starting from the virtual root folder.

    Entry point of the Google Drive bootstrap. Resolves any dependency not
    supplied by the caller (adapter/connection/knowledge service and the
    extraction function), walks the folder tree from "/" via ``_walkFolder``,
    and returns the summary dict built by ``_finalizeResult``.

    Args:
        connectionId: Id of the UserConnection to bootstrap.
        progressCb: Optional ``(percent, message)`` callback; best-effort.
        adapter / connection / knowledgeService: Injectable dependencies;
            if any is None, all three are re-resolved together.
        limits: Budget/policy overrides; derived from stored connection
            preferences when omitted.
        runExtractionFn: Injectable extraction pipeline (useful in tests);
            defaults to the registry-backed standard ``runExtraction``.

    Returns:
        Summary dict: counters, durationMs, and up to 20 error strings.
    """
    # NOTE(review): import kept function-local — presumably to avoid a
    # circular import with the knowledge service package; confirm.
    from modules.serviceCenter.services.serviceKnowledge.subConnectorPrefs import loadConnectionPrefs
    prefs = loadConnectionPrefs(connectionId)

    if not limits:
        # Derive limits from user prefs; maxAgeDays <= 0 disables the age filter.
        limits = GdriveBootstrapLimits(
            maxAgeDays=prefs.maxAgeDays if prefs.maxAgeDays > 0 else None,
            neutralize=prefs.neutralizeBeforeEmbed,
            filesIndexBinaries=prefs.filesIndexBinaries,
        )

    startMs = time.time()
    result = GdriveBootstrapResult(connectionId=connectionId)

    logger.info(
        "ingestion.connection.bootstrap.started part=gdrive connectionId=%s",
        connectionId,
        extra={
            "event": "ingestion.connection.bootstrap.started",
            "part": "gdrive",
            "connectionId": connectionId,
        },
    )

    # All-or-nothing: the three dependencies come from the same connection
    # record, so one missing dependency triggers a full resolve of all three.
    if adapter is None or knowledgeService is None or connection is None:
        adapter, connection, knowledgeService = await _resolveDependencies(connectionId)
    if runExtractionFn is None:
        from modules.serviceCenter.services.serviceExtraction.subPipeline import runExtraction
        from modules.serviceCenter.services.serviceExtraction.subRegistry import (
            ExtractorRegistry, ChunkerRegistry,
        )
        extractorRegistry = ExtractorRegistry()
        chunkerRegistry = ChunkerRegistry()

        # Closure binds the freshly built registries behind the uniform
        # (bytes, name, mime, options) signature the walk expects.
        def runExtractionFn(bytesData, name, mime, options):  # type: ignore[no-redef]
            return runExtraction(extractorRegistry, chunkerRegistry, bytesData, name, mime, options)

    # Scope identifiers forwarded into every IngestionJob.
    mandateId = str(getattr(connection, "mandateId", "") or "") if connection is not None else ""
    userId = str(getattr(connection, "userId", "") or "") if connection is not None else ""

    try:
        await _walkFolder(
            adapter=adapter,
            knowledgeService=knowledgeService,
            runExtractionFn=runExtractionFn,
            connectionId=connectionId,
            mandateId=mandateId,
            userId=userId,
            folderPath="/",  # DriveAdapter.browse maps "" / "/" -> "root"
            depth=0,
            limits=limits,
            result=result,
            progressCb=progressCb,
        )
    except Exception as exc:
        # Per-item errors are recorded inside the walk; this only catches
        # tree-level failures. The partial result is still finalized below.
        logger.error("gdrive walk failed for %s: %s", connectionId, exc, exc_info=True)
        result.errors.append(f"walk: {exc}")

    return _finalizeResult(connectionId, result, startMs)
|
|
|
|
|
|
async def _resolveDependencies(connectionId: str):
    """Build the (adapter, connection, knowledgeService) triple for a connection.

    Loads the UserConnection record, obtains a fresh OAuth token, wraps it in
    a GoogleConnector to get the Drive adapter, and opens the knowledge
    service under a root-user context scoped to the connection's mandate.

    Raises:
        ValueError: When the connection record or a usable token is missing.
    """
    # NOTE(review): imports kept function-local — presumably to keep heavy
    # service wiring off the injectable/test code path; confirm.
    from modules.interfaces.interfaceDbApp import getRootInterface
    from modules.auth import TokenManager
    from modules.connectors.providerGoogle.connectorGoogle import GoogleConnector
    from modules.serviceCenter import getService
    from modules.serviceCenter.context import ServiceCenterContext
    from modules.security.rootAccess import getRootUser

    rootInterface = getRootInterface()
    connection = rootInterface.getUserConnectionById(connectionId)
    if connection is None:
        raise ValueError(f"UserConnection not found: {connectionId}")

    token = TokenManager().getFreshToken(connectionId)
    if not token or not token.tokenAccess:
        raise ValueError(f"No valid token for connection {connectionId}")

    provider = GoogleConnector(connection, token.tokenAccess)
    adapter = provider.getServiceAdapter("drive")

    # Knowledge service runs as the root user, scoped to the connection's mandate.
    rootUser = getRootUser()
    ctx = ServiceCenterContext(
        user=rootUser,
        mandate_id=str(getattr(connection, "mandateId", "") or ""),
    )
    knowledgeService = getService("knowledge", ctx)
    return adapter, connection, knowledgeService
|
|
|
|
|
|
async def _walkFolder(
    *,
    adapter,
    knowledgeService,
    runExtractionFn,
    connectionId: str,
    mandateId: str,
    userId: str,
    folderPath: str,
    depth: int,
    limits: GdriveBootstrapLimits,
    result: GdriveBootstrapResult,
    progressCb: Optional[Callable[[int, Optional[str]], None]],
) -> None:
    """Depth-first walk of one Drive folder, ingesting each file entry.

    Stops the whole walk once the item or byte budget is exhausted, and
    prunes subtrees beyond ``limits.maxDepth``. A browse failure is recorded
    on ``result`` and aborts only this subtree, not the whole run.
    """
    if depth > limits.maxDepth:
        return

    try:
        listing = await adapter.browse(folderPath)
    except Exception as exc:
        logger.warning("gdrive browse %s failed: %s", folderPath, exc)
        result.errors.append(f"browse({folderPath}): {exc}")
        return

    for item in listing:
        # Global budgets: bail out of the whole walk once either cap is hit.
        processedSoFar = result.indexed + result.skippedDuplicate
        if processedSoFar >= limits.maxItems or result.bytesProcessed >= limits.maxBytes:
            return

        itemPath = getattr(item, "path", "") or ""
        meta = getattr(item, "metadata", {}) or {}
        mime = getattr(item, "mimeType", None) or meta.get("mimeType")

        if getattr(item, "isFolder", False) or mime == FOLDER_MIME:
            # Recurse into sub-folders (depth-limited above).
            await _walkFolder(
                adapter=adapter,
                knowledgeService=knowledgeService,
                runExtractionFn=runExtractionFn,
                connectionId=connectionId,
                mandateId=mandateId,
                userId=userId,
                folderPath=itemPath,
                depth=depth + 1,
                limits=limits,
                result=result,
                progressCb=progressCb,
            )
            continue

        mime = mime or "application/octet-stream"
        declaredSize = int(getattr(item, "size", 0) or 0)
        modified = meta.get("modifiedTime")

        # Policy filters, short-circuited in order: mime prefix, listed size,
        # then age window. Any hit counts once against skippedPolicy.
        filteredOut = (
            mime.startswith(tuple(limits.skipMimePrefixes))
            or (declaredSize and declaredSize > limits.maxFileSize)
            or not _isRecent(modified, limits.maxAgeDays)
        )
        if filteredOut:
            result.skippedPolicy += 1
            continue

        await _ingestOne(
            adapter=adapter,
            knowledgeService=knowledgeService,
            runExtractionFn=runExtractionFn,
            connectionId=connectionId,
            mandateId=mandateId,
            userId=userId,
            entry=item,
            entryPath=itemPath,
            mimeType=mime,
            externalItemId=meta.get("id") or itemPath,
            revision=modified,
            limits=limits,
            result=result,
            progressCb=progressCb,
        )
|
|
|
|
|
|
async def _ingestOne(
    *,
    adapter,
    knowledgeService,
    runExtractionFn,
    connectionId: str,
    mandateId: str,
    userId: str,
    entry,
    entryPath: str,
    mimeType: str,
    externalItemId: str,
    revision: Optional[str],
    limits: GdriveBootstrapLimits,
    result: GdriveBootstrapResult,
    progressCb: Optional[Callable[[int, Optional[str]], None]],
) -> None:
    """Download, extract and ingest a single Drive file entry.

    Mutates ``result`` counters in place. Every failure path records the
    reason (where there is one) and returns without raising, so one bad
    file never aborts the surrounding walk.
    """
    # NOTE(review): import kept function-local, consistent with the other
    # deferred service imports in this module — presumably cycle avoidance.
    from modules.serviceCenter.services.serviceKnowledge.mainServiceKnowledge import IngestionJob

    syntheticFileId = _syntheticFileId(connectionId, externalItemId)
    fileName = getattr(entry, "name", "") or externalItemId

    try:
        downloaded = await adapter.download(entryPath)
    except Exception as exc:
        logger.warning("gdrive download %s failed: %s", entryPath, exc)
        result.failed += 1
        result.errors.append(f"download({entryPath}): {exc}")
        return

    # Adapter.download returns raw bytes today; guard DownloadResult shape too.
    fileBytes: bytes
    if isinstance(downloaded, (bytes, bytearray)):
        fileBytes = bytes(downloaded)
    else:
        fileBytes = bytes(getattr(downloaded, "data", b"") or b"")
        if getattr(downloaded, "mimeType", None):
            mimeType = downloaded.mimeType  # export may have changed the type
    if not fileBytes:
        # Empty payload counts as a failure but records no error string.
        result.failed += 1
        return
    if len(fileBytes) > limits.maxFileSize:
        # Listed size passed the pre-filter but the actual payload is larger.
        result.skippedPolicy += 1
        return

    # Counted before extraction so the byte budget reflects downloaded volume.
    result.bytesProcessed += len(fileBytes)

    try:
        extracted = runExtractionFn(
            fileBytes, fileName, mimeType,
            ExtractionOptions(mergeStrategy=None),
        )
    except Exception as exc:
        logger.warning("gdrive extraction %s failed: %s", entryPath, exc)
        result.failed += 1
        result.errors.append(f"extract({entryPath}): {exc}")
        return

    contentObjects = _toContentObjects(extracted, fileName)
    if not contentObjects:
        # Nothing extractable (e.g. empty document) -> policy skip, not failure.
        result.skippedPolicy += 1
        return

    try:
        handle = await knowledgeService.requestIngestion(
            IngestionJob(
                sourceKind="gdrive_item",
                sourceId=syntheticFileId,
                fileName=fileName,
                mimeType=mimeType,
                userId=userId,
                mandateId=mandateId,
                contentObjects=contentObjects,
                # modifiedTime serves as a monotonic per-revision version.
                contentVersion=revision,
                neutralize=limits.neutralize,
                provenance={
                    "connectionId": connectionId,
                    "authority": "google",
                    "service": "drive",
                    "externalItemId": externalItemId,
                    "entryPath": entryPath,
                    "tier": "body",
                },
            )
        )
    except Exception as exc:
        logger.error("gdrive ingestion %s failed: %s", entryPath, exc, exc_info=True)
        result.failed += 1
        result.errors.append(f"ingest({entryPath}): {exc}")
        return

    # Any status other than duplicate/indexed is treated as a failure.
    if handle.status == "duplicate":
        result.skippedDuplicate += 1
    elif handle.status == "indexed":
        result.indexed += 1
    else:
        result.failed += 1

    # Progress heartbeat every 50 processed items; callback errors are ignored.
    # NOTE(review): the modulo test also fires when processed == 0 (e.g. the
    # first items all failed) — confirm that is intended.
    if progressCb is not None and (result.indexed + result.skippedDuplicate) % 50 == 0:
        processed = result.indexed + result.skippedDuplicate
        try:
            progressCb(
                # Progress is mapped into the 10..90 band of the overall job.
                min(90, 10 + int(80 * processed / max(1, limits.maxItems))),
                f"gdrive processed={processed}",
            )
        except Exception:
            pass
        logger.info(
            "ingestion.connection.bootstrap.progress part=gdrive processed=%d skippedDup=%d failed=%d",
            processed, result.skippedDuplicate, result.failed,
            extra={
                "event": "ingestion.connection.bootstrap.progress",
                "part": "gdrive",
                "connectionId": connectionId,
                "processed": processed,
                "skippedDup": result.skippedDuplicate,
                "failed": result.failed,
            },
        )
|
|
|
|
|
|
def _finalizeResult(connectionId: str, result: GdriveBootstrapResult, startMs: float) -> Dict[str, Any]:
    """Emit the structured completion log and shape the summary payload.

    The error list is truncated to the first 20 entries to keep the
    returned payload bounded.
    """
    elapsedMs = int((time.time() - startMs) * 1000)

    logFields = {
        "event": "ingestion.connection.bootstrap.done",
        "part": "gdrive",
        "connectionId": connectionId,
        "indexed": result.indexed,
        "skippedDup": result.skippedDuplicate,
        "skippedPolicy": result.skippedPolicy,
        "failed": result.failed,
        "bytes": result.bytesProcessed,
        "durationMs": elapsedMs,
    }
    logger.info(
        "ingestion.connection.bootstrap.done part=gdrive connectionId=%s indexed=%d skippedDup=%d skippedPolicy=%d failed=%d bytes=%d durationMs=%d",
        connectionId,
        result.indexed, result.skippedDuplicate, result.skippedPolicy,
        result.failed, result.bytesProcessed, elapsedMs,
        extra=logFields,
    )

    summary: Dict[str, Any] = {
        "connectionId": result.connectionId,
        "indexed": result.indexed,
        "skippedDuplicate": result.skippedDuplicate,
        "skippedPolicy": result.skippedPolicy,
        "failed": result.failed,
        "bytesProcessed": result.bytesProcessed,
        "durationMs": elapsedMs,
        "errors": result.errors[:20],
    }
    return summary
|