gateway/modules/serviceCenter/services/serviceKnowledge/subConnectorSyncGdrive.py
2026-04-29 14:39:40 +02:00

429 lines
15 KiB
Python

# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
"""Google Drive bootstrap for the unified knowledge ingestion lane.
Mirrors the SharePoint pilot (see subConnectorSyncSharepoint.py). Walks the
user's *My Drive* tree from the virtual `root` folder, downloads each
file-like item via `DriveAdapter.download` (which handles native Google docs
via export), runs the standard extraction pipeline and routes results through
`KnowledgeService.requestIngestion` with `sourceKind="gdrive_item"` and
`contentVersion = modifiedTime` (monotonic per-revision).
"""
from __future__ import annotations
import hashlib
import logging
import time
from dataclasses import dataclass, field
from datetime import datetime, timedelta, timezone
from typing import Any, Callable, Dict, List, Optional
from modules.datamodels.datamodelExtraction import ExtractionOptions
logger = logging.getLogger(__name__)
# Defaults for GdriveBootstrapLimits; every value can be overridden per run.
# Stop walking once this many items were indexed or detected as duplicates.
MAX_ITEMS_DEFAULT = 500
# Total download budget for one bootstrap run (200 MiB).
MAX_BYTES_DEFAULT = 200 * 1024 * 1024
# Files larger than this are skipped instead of ingested (25 MiB).
MAX_FILE_SIZE_DEFAULT = 25 * 1024 * 1024
# Files whose mime-type starts with one of these prefixes are skipped.
SKIP_MIME_PREFIXES_DEFAULT = ("video/", "audio/")
# Maximum folder recursion depth; the walk starts at depth 0.
MAX_DEPTH_DEFAULT = 4
# Only files modified within this many days are ingested.
MAX_AGE_DAYS_DEFAULT = 365
# Google Drive uses virtual mime-types for folders and non-downloadable assets.
FOLDER_MIME = "application/vnd.google-apps.folder"
@dataclass
class GdriveBootstrapLimits:
    """Caps that bound a single Google Drive bootstrap run.

    All fields default to the module-level ``*_DEFAULT`` constants.
    """

    # The walk stops once indexed + duplicate items reach this count.
    maxItems: int = MAX_ITEMS_DEFAULT
    # The walk stops once this many bytes were downloaded and processed.
    maxBytes: int = MAX_BYTES_DEFAULT
    # Individual files above this size are skipped.
    maxFileSize: int = MAX_FILE_SIZE_DEFAULT
    # Files whose mime-type starts with any of these prefixes are skipped.
    skipMimePrefixes: tuple = SKIP_MIME_PREFIXES_DEFAULT
    # Folders nested deeper than this are not browsed (root is depth 0).
    maxDepth: int = MAX_DEPTH_DEFAULT
    # Only ingest files modified within the last N days. None disables filter.
    maxAgeDays: Optional[int] = MAX_AGE_DAYS_DEFAULT
@dataclass
class GdriveBootstrapResult:
    """Mutable counters collected while bootstrapping one connection."""

    # Connection the counters belong to.
    connectionId: str
    # Items the knowledge service reported as "indexed".
    indexed: int = 0
    # Items the knowledge service reported as "duplicate".
    skippedDuplicate: int = 0
    # Items skipped by a limit/filter or because nothing was extracted.
    skippedPolicy: int = 0
    # Items that failed to download, extract or ingest.
    failed: int = 0
    # Sum of the sizes of all downloaded files that were processed.
    bytesProcessed: int = 0
    # Human-readable error messages; a fresh list per instance.
    errors: List[str] = field(default_factory=list)
def _syntheticFileId(connectionId: str, externalItemId: str) -> str:
token = hashlib.sha256(f"{connectionId}:{externalItemId}".encode("utf-8")).hexdigest()[:16]
return f"gd:{connectionId[:8]}:{token}"
def _toContentObjects(extracted, fileName: str) -> List[Dict[str, Any]]:
parts = getattr(extracted, "parts", None) or []
out: List[Dict[str, Any]] = []
for part in parts:
data = getattr(part, "data", None) or ""
if not data or not str(data).strip():
continue
typeGroup = getattr(part, "typeGroup", "text") or "text"
contentType = "text"
if typeGroup == "image":
contentType = "image"
elif typeGroup in ("binary", "container"):
contentType = "other"
out.append({
"contentObjectId": getattr(part, "id", ""),
"contentType": contentType,
"data": data,
"contextRef": {
"containerPath": fileName,
"location": getattr(part, "label", None) or "file",
**(getattr(part, "metadata", None) or {}),
},
})
return out
def _isRecent(modifiedIso: Optional[str], maxAgeDays: Optional[int]) -> bool:
if not maxAgeDays:
return True
if not modifiedIso:
# No timestamp -> be permissive (Drive native docs sometimes omit it on export).
return True
try:
# Google returns RFC 3339 with `Z` or offset; python 3.11+ parses both.
ts = datetime.fromisoformat(modifiedIso.replace("Z", "+00:00"))
except Exception:
return True
cutoff = datetime.now(timezone.utc) - timedelta(days=maxAgeDays)
if ts.tzinfo is None:
ts = ts.replace(tzinfo=timezone.utc)
return ts >= cutoff
async def bootstrapGdrive(
    connectionId: str,
    *,
    progressCb: Optional[Callable[[int, Optional[str]], None]] = None,
    adapter: Any = None,
    connection: Any = None,
    knowledgeService: Any = None,
    limits: Optional[GdriveBootstrapLimits] = None,
    runExtractionFn: Optional[Callable[..., Any]] = None,
) -> Dict[str, Any]:
    """Walk My Drive starting from the virtual root folder and ingest its files.

    Args:
        connectionId: Id of the user connection whose Drive is bootstrapped.
        progressCb: Optional callback receiving ``(percent, message)``.
        adapter: Optional Drive adapter exposing ``browse`` and ``download``.
        connection: Optional connection object; ``mandateId`` and ``userId``
            are read from it.
        knowledgeService: Optional service exposing ``requestIngestion``.
        limits: Caps for the run; defaults to ``GdriveBootstrapLimits()``.
        runExtractionFn: Optional ``(bytesData, name, mime, options)``
            callable; defaults to the standard extraction pipeline.

    Returns:
        The summary dict produced by ``_finalizeResult``.

    Raises:
        Whatever ``_resolveDependencies`` raises when a missing dependency
        has to be resolved. Errors raised during the walk itself are caught
        and recorded in the result's ``errors`` list.
    """
    limits = limits or GdriveBootstrapLimits()
    # Wall-clock start in seconds (despite the name); _finalizeResult turns
    # the elapsed time into milliseconds.
    startMs = time.time()
    result = GdriveBootstrapResult(connectionId=connectionId)
    logger.info(
        "ingestion.connection.bootstrap.started part=gdrive connectionId=%s",
        connectionId,
        extra={
            "event": "ingestion.connection.bootstrap.started",
            "part": "gdrive",
            "connectionId": connectionId,
        },
    )

    if adapter is None or knowledgeService is None or connection is None:
        resolvedAdapter, resolvedConnection, resolvedKnowledge = await _resolveDependencies(
            connectionId
        )
        # Fill only the gaps: a dependency the caller injected explicitly
        # must not be silently replaced by the resolved one.
        if adapter is None:
            adapter = resolvedAdapter
        if connection is None:
            connection = resolvedConnection
        if knowledgeService is None:
            knowledgeService = resolvedKnowledge

    if runExtractionFn is None:
        from modules.serviceCenter.services.serviceExtraction.subPipeline import runExtraction
        from modules.serviceCenter.services.serviceExtraction.subRegistry import (
            ExtractorRegistry, ChunkerRegistry,
        )
        extractorRegistry = ExtractorRegistry()
        chunkerRegistry = ChunkerRegistry()

        def runExtractionFn(bytesData, name, mime, options):  # type: ignore[no-redef]
            return runExtraction(extractorRegistry, chunkerRegistry, bytesData, name, mime, options)

    mandateId = str(getattr(connection, "mandateId", "") or "")
    userId = str(getattr(connection, "userId", "") or "")

    try:
        await _walkFolder(
            adapter=adapter,
            knowledgeService=knowledgeService,
            runExtractionFn=runExtractionFn,
            connectionId=connectionId,
            mandateId=mandateId,
            userId=userId,
            folderPath="/",  # DriveAdapter.browse maps "" / "/" -> "root"
            depth=0,
            limits=limits,
            result=result,
            progressCb=progressCb,
        )
    except Exception as exc:
        # Top-level boundary: a failed walk still yields a summary.
        logger.error("gdrive walk failed for %s: %s", connectionId, exc, exc_info=True)
        result.errors.append(f"walk: {exc}")
    return _finalizeResult(connectionId, result, startMs)
async def _resolveDependencies(connectionId: str):
    """Build the adapter, connection and knowledge service for a connection.

    Returns:
        A ``(adapter, connection, knowledgeService)`` tuple, where the adapter
        is the Google connector's ``"drive"`` service adapter and the
        knowledge service is created for the root user with the connection's
        mandate id.

    Raises:
        ValueError: If the connection does not exist or no usable access
            token is available for it.
    """
    from modules.interfaces.interfaceDbApp import getRootInterface
    from modules.auth import TokenManager
    from modules.connectors.providerGoogle.connectorGoogle import GoogleConnector
    from modules.serviceCenter import getService
    from modules.serviceCenter.context import ServiceCenterContext
    from modules.security.rootAccess import getRootUser

    connection = getRootInterface().getUserConnectionById(connectionId)
    if connection is None:
        raise ValueError(f"UserConnection not found: {connectionId}")

    freshToken = TokenManager().getFreshToken(connectionId)
    if not (freshToken and freshToken.tokenAccess):
        raise ValueError(f"No valid token for connection {connectionId}")

    googleProvider = GoogleConnector(connection, freshToken.tokenAccess)
    driveAdapter = googleProvider.getServiceAdapter("drive")

    serviceContext = ServiceCenterContext(
        user=getRootUser(),
        mandate_id=str(getattr(connection, "mandateId", "") or ""),
    )
    return driveAdapter, connection, getService("knowledge", serviceContext)
async def _walkFolder(
    *,
    adapter,
    knowledgeService,
    runExtractionFn,
    connectionId: str,
    mandateId: str,
    userId: str,
    folderPath: str,
    depth: int,
    limits: GdriveBootstrapLimits,
    result: GdriveBootstrapResult,
    progressCb: Optional[Callable[[int, Optional[str]], None]],
) -> None:
    """Recursively ingest the files below one Drive folder.

    Entries are handled in listing order; sub-folders are descended into
    immediately. The walk stops as soon as the item or byte budget in
    ``limits`` is exhausted. Files filtered out by mime-type, size or age are
    counted in ``result.skippedPolicy``. A failing ``browse`` is recorded in
    ``result.errors`` and ends only the current folder.

    Args:
        adapter: Drive adapter exposing ``browse`` and ``download``.
        knowledgeService: Passed through to ``_ingestOne``.
        runExtractionFn: Passed through to ``_ingestOne``.
        connectionId: Connection being bootstrapped.
        mandateId: Mandate the ingested items belong to.
        userId: User the ingested items belong to.
        folderPath: Adapter path of the folder to list.
        depth: Depth of ``folderPath``; the starting folder is depth 0.
        limits: Caps and filters for the run.
        result: Counters updated in place.
        progressCb: Optional progress callback, passed to ``_ingestOne``.
    """
    if depth > limits.maxDepth:
        return
    try:
        entries = await adapter.browse(folderPath)
    except Exception as exc:
        logger.warning("gdrive browse %s failed: %s", folderPath, exc)
        result.errors.append(f"browse({folderPath}): {exc}")
        return

    # Keyword arguments shared by the recursive call and the per-file ingest.
    shared = dict(
        adapter=adapter,
        knowledgeService=knowledgeService,
        runExtractionFn=runExtractionFn,
        connectionId=connectionId,
        mandateId=mandateId,
        userId=userId,
        limits=limits,
        result=result,
        progressCb=progressCb,
    )
    skipPrefixes = tuple(limits.skipMimePrefixes)

    # A None listing is treated as an empty folder instead of aborting the walk.
    for entry in entries or ():
        if result.indexed + result.skippedDuplicate >= limits.maxItems:
            return
        if result.bytesProcessed >= limits.maxBytes:
            return

        entryPath = getattr(entry, "path", "") or ""
        metadata = getattr(entry, "metadata", {}) or {}
        mimeType = getattr(entry, "mimeType", None) or metadata.get("mimeType")

        if getattr(entry, "isFolder", False) or mimeType == FOLDER_MIME:
            # An empty or "/" path is treated as the Drive root by browse(),
            # and a path equal to the current folder lists it again; both
            # would re-walk content that was already visited.
            if not str(entryPath).strip("/") or entryPath == folderPath:
                logger.warning(
                    "gdrive folder entry without usable path under %s skipped", folderPath
                )
                result.errors.append(f"browse({folderPath}): folder entry without usable path")
                continue
            await _walkFolder(folderPath=entryPath, depth=depth + 1, **shared)
            continue

        effectiveMime = mimeType or "application/octet-stream"
        if effectiveMime.startswith(skipPrefixes):
            result.skippedPolicy += 1
            continue

        try:
            size = int(getattr(entry, "size", 0) or 0)
        except (TypeError, ValueError):
            # Unknown size: let the download go ahead; the size limit is
            # enforced again on the downloaded bytes.
            size = 0
        if size and size > limits.maxFileSize:
            result.skippedPolicy += 1
            continue

        modifiedTime = metadata.get("modifiedTime")
        if not _isRecent(modifiedTime, limits.maxAgeDays):
            result.skippedPolicy += 1
            continue

        await _ingestOne(
            entry=entry,
            entryPath=entryPath,
            mimeType=effectiveMime,
            externalItemId=metadata.get("id") or entryPath,
            revision=modifiedTime,  # modifiedTime doubles as content version
            **shared,
        )
async def _ingestOne(
    *,
    adapter,
    knowledgeService,
    runExtractionFn,
    connectionId: str,
    mandateId: str,
    userId: str,
    entry,
    entryPath: str,
    mimeType: str,
    externalItemId: str,
    revision: Optional[str],
    limits: GdriveBootstrapLimits,
    result: GdriveBootstrapResult,
    progressCb: Optional[Callable[[int, Optional[str]], None]],
) -> None:
    """Download, extract and ingest a single Drive file.

    Every outcome is recorded in ``result``; download, extraction and
    ingestion errors are caught, counted in ``result.failed`` and described
    in ``result.errors``. Oversized downloads and files without extractable
    content count as ``skippedPolicy``.

    Args:
        adapter: Drive adapter; ``download(entryPath)`` is awaited.
        knowledgeService: Service whose ``requestIngestion`` is awaited.
        runExtractionFn: ``(bytes, fileName, mimeType, options)`` callable.
        connectionId: Connection being bootstrapped.
        mandateId: Mandate stored on the ingestion job.
        userId: User stored on the ingestion job.
        entry: Listing entry; only its ``name`` is read here.
        entryPath: Adapter path used for the download.
        mimeType: Mime-type from the listing; replaced by the download's
            mime-type when the download result carries one.
        externalItemId: Drive-side identifier of the item.
        revision: Content version passed to the ingestion job.
        limits: Caps for the run.
        result: Counters updated in place.
        progressCb: Optional ``(percent, message)`` callback, invoked each
            time the processed count reaches a multiple of 50.
    """
    from modules.serviceCenter.services.serviceKnowledge.mainServiceKnowledge import IngestionJob

    syntheticFileId = _syntheticFileId(connectionId, externalItemId)
    fileName = getattr(entry, "name", "") or externalItemId

    try:
        downloaded = await adapter.download(entryPath)
    except Exception as exc:
        logger.warning("gdrive download %s failed: %s", entryPath, exc)
        result.failed += 1
        result.errors.append(f"download({entryPath}): {exc}")
        return

    # Adapter.download returns raw bytes today; guard DownloadResult shape too.
    fileBytes: bytes
    if isinstance(downloaded, (bytes, bytearray)):
        fileBytes = bytes(downloaded)
    else:
        fileBytes = bytes(getattr(downloaded, "data", b"") or b"")
        if getattr(downloaded, "mimeType", None):
            mimeType = downloaded.mimeType  # export may have changed the type

    if not fileBytes:
        # Record why the item failed, like every other failure path does.
        logger.warning("gdrive download %s returned no data", entryPath)
        result.failed += 1
        result.errors.append(f"download({entryPath}): empty payload")
        return
    if len(fileBytes) > limits.maxFileSize:
        result.skippedPolicy += 1
        return
    result.bytesProcessed += len(fileBytes)

    try:
        extracted = runExtractionFn(
            fileBytes, fileName, mimeType,
            ExtractionOptions(mergeStrategy=None),
        )
    except Exception as exc:
        logger.warning("gdrive extraction %s failed: %s", entryPath, exc)
        result.failed += 1
        result.errors.append(f"extract({entryPath}): {exc}")
        return

    contentObjects = _toContentObjects(extracted, fileName)
    if not contentObjects:
        result.skippedPolicy += 1
        return

    try:
        handle = await knowledgeService.requestIngestion(
            IngestionJob(
                sourceKind="gdrive_item",
                sourceId=syntheticFileId,
                fileName=fileName,
                mimeType=mimeType,
                userId=userId,
                mandateId=mandateId,
                contentObjects=contentObjects,
                contentVersion=revision,
                provenance={
                    "connectionId": connectionId,
                    "authority": "google",
                    "service": "drive",
                    "externalItemId": externalItemId,
                    "entryPath": entryPath,
                    "tier": "body",
                },
            )
        )
    except Exception as exc:
        logger.error("gdrive ingestion %s failed: %s", entryPath, exc, exc_info=True)
        result.failed += 1
        result.errors.append(f"ingest({entryPath}): {exc}")
        return

    status = getattr(handle, "status", None)
    if status == "duplicate":
        result.skippedDuplicate += 1
    elif status == "indexed":
        result.indexed += 1
    else:
        result.failed += 1
        result.errors.append(f"ingest({entryPath}): unexpected status {status!r}")
        # The processed count did not move, so there is no progress to report;
        # reporting here would repeat the previous progress (or report 0).
        return

    processed = result.indexed + result.skippedDuplicate
    if progressCb is None or processed % 50 != 0:
        return

    try:
        progressCb(
            min(90, 10 + int(80 * processed / max(1, limits.maxItems))),
            f"gdrive processed={processed}",
        )
    except Exception:
        # Progress reporting is best-effort and must never break ingestion.
        logger.debug("gdrive progress callback failed", exc_info=True)
    logger.info(
        "ingestion.connection.bootstrap.progress part=gdrive processed=%d skippedDup=%d failed=%d",
        processed, result.skippedDuplicate, result.failed,
        extra={
            "event": "ingestion.connection.bootstrap.progress",
            "part": "gdrive",
            "connectionId": connectionId,
            "processed": processed,
            "skippedDup": result.skippedDuplicate,
            "failed": result.failed,
        },
    )
def _finalizeResult(connectionId: str, result: GdriveBootstrapResult, startMs: float) -> Dict[str, Any]:
    """Log the bootstrap summary and return it as a plain dict.

    Args:
        connectionId: Connection id used in the log record.
        result: Counters collected during the walk.
        startMs: ``time.time()`` value taken at the start of the run, i.e.
            seconds despite the name.

    Returns:
        A dict with the counters, the duration in milliseconds and at most
        the first 20 error messages.
    """
    elapsedSeconds = time.time() - startMs
    durationMs = int(elapsedSeconds * 1000)

    logFields = {
        "event": "ingestion.connection.bootstrap.done",
        "part": "gdrive",
        "connectionId": connectionId,
        "indexed": result.indexed,
        "skippedDup": result.skippedDuplicate,
        "skippedPolicy": result.skippedPolicy,
        "failed": result.failed,
        "bytes": result.bytesProcessed,
        "durationMs": durationMs,
    }
    logger.info(
        "ingestion.connection.bootstrap.done part=gdrive connectionId=%s indexed=%d skippedDup=%d skippedPolicy=%d failed=%d bytes=%d durationMs=%d",
        connectionId,
        result.indexed,
        result.skippedDuplicate,
        result.skippedPolicy,
        result.failed,
        result.bytesProcessed,
        durationMs,
        extra=logFields,
    )

    summary: Dict[str, Any] = {
        "connectionId": result.connectionId,
        "indexed": result.indexed,
        "skippedDuplicate": result.skippedDuplicate,
        "skippedPolicy": result.skippedPolicy,
        "failed": result.failed,
        "bytesProcessed": result.bytesProcessed,
        "durationMs": durationMs,
        # Cap the error list so the summary stays small.
        "errors": result.errors[:20],
    }
    return summary