diff --git a/app.py b/app.py index adcd5365..98e3bd0d 100644 --- a/app.py +++ b/app.py @@ -405,6 +405,16 @@ async def lifespan(app: FastAPI): except Exception as e: logger.warning(f"BackgroundJob recovery failed (non-critical): {e}") + # Subscribe knowledge ingestion to connection lifecycle events so OAuth + # connect/disconnect reliably trigger bootstrap/purge. + try: + from modules.serviceCenter.services.serviceKnowledge.subConnectorIngestConsumer import ( + registerKnowledgeIngestionConsumer, + ) + registerKnowledgeIngestionConsumer() + except Exception as e: + logger.warning(f"KnowledgeIngestionConsumer registration failed (non-critical): {e}") + yield # --- Stop Managers --- diff --git a/modules/connectors/providerMsft/connectorMsft.py b/modules/connectors/providerMsft/connectorMsft.py index bf290eca..49f6fdaa 100644 --- a/modules/connectors/providerMsft/connectorMsft.py +++ b/modules/connectors/providerMsft/connectorMsft.py @@ -126,6 +126,11 @@ def _stripGraphBase(url: str) -> str: def _graphItemToExternalEntry(item: Dict[str, Any], basePath: str = "") -> ExternalEntry: isFolder = "folder" in item + # Graph exposes the driveItem content hash as ``eTag`` (quoted) or + # ``cTag``; we normalise to a "revision" string so callers can use it as a + # stable ``contentVersion`` for idempotent ingestion without re-downloading + # file bytes. + revision = item.get("eTag") or item.get("cTag") return ExternalEntry( name=item.get("name", ""), path=f"{basePath}/{item.get('name', '')}" if basePath else item.get("name", ""), @@ -137,6 +142,9 @@ def _graphItemToExternalEntry(item: Dict[str, Any], basePath: str = "") -> Exter "id": item.get("id"), "webUrl": item.get("webUrl"), "childCount": item.get("folder", {}).get("childCount") if isFolder else None, + "revision": revision, + "lastModifiedDateTime": item.get("lastModifiedDateTime"), + "parentReference": item.get("parentReference", {}), }, ) @@ -167,21 +175,36 @@ class SharepointAdapter(_GraphApiMixin, ServiceAdapter): return await self._discoverSites() if not folderPath or folderPath == "/": - endpoint = f"sites/{siteId}/drive/root/children" + endpoint: Optional[str] = f"sites/{siteId}/drive/root/children?$top=200" else: cleanPath = folderPath.lstrip("/") - endpoint = f"sites/{siteId}/drive/root:/{cleanPath}:/children" + endpoint = f"sites/{siteId}/drive/root:/{cleanPath}:/children?$top=200" - result = await self._graphGet(endpoint) - if "error" in result: - logger.warning(f"SharePoint browse failed: {result['error']}") - return [] + # Follow @odata.nextLink until a hard cap is reached so large libraries + # are fully enumerated (required for bootstrap). Per-page size uses + # Graph's max supported value to minimise round-trips. 
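+        # Illustrative flow (URL below is a hypothetical example): a nextLink such as
+        #   https://graph.microsoft.com/v1.0/sites/{siteId}/drive/root/children?$top=200&$skiptoken=...
+        # is reduced by _stripGraphBase() to a relative endpoint and fed back into
+        # _graphGet() on the next loop iteration until no nextLink remains.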
+ effectiveLimit = int(limit) if limit is not None else None + items: List[Dict[str, Any]] = [] + hardCap = 5000 + while endpoint and len(items) < hardCap: + result = await self._graphGet(endpoint) + if "error" in result: + logger.warning(f"SharePoint browse failed: {result['error']}") + break + for raw in result.get("value", []) or []: + items.append(raw) + if effectiveLimit is not None and len(items) >= effectiveLimit: + break + if effectiveLimit is not None and len(items) >= effectiveLimit: + break + nextLink = result.get("@odata.nextLink") + endpoint = _stripGraphBase(nextLink) if nextLink else None - entries = [_graphItemToExternalEntry(item, path) for item in result.get("value", [])] + entries = [_graphItemToExternalEntry(item, path) for item in items] if filter: entries = [e for e in entries if _matchFilter(e, filter)] - if limit is not None: - entries = entries[: max(1, int(limit))] + if effectiveLimit is not None: + entries = entries[: max(1, effectiveLimit)] return entries async def _discoverSites(self) -> List[ExternalEntry]: diff --git a/modules/datamodels/datamodelExtraction.py b/modules/datamodels/datamodelExtraction.py index 0aaaffd8..38fd1d27 100644 --- a/modules/datamodels/datamodelExtraction.py +++ b/modules/datamodels/datamodelExtraction.py @@ -95,7 +95,14 @@ class ExtractionOptions(BaseModel): imageQuality: int = Field(default=85, ge=1, le=100, description="Image quality (1-100)") # Merging strategy - mergeStrategy: MergeStrategy = Field(default_factory=MergeStrategy, description="Strategy for merging extraction results") + mergeStrategy: Optional[MergeStrategy] = Field( + default_factory=MergeStrategy, + description=( + "Strategy for merging extraction results. Pass None to skip merging entirely " + "(required for per-chunk ingestion pipelines like RAG, where per-page/per-section " + "granularity must be preserved for embedding)." + ), + ) # Optional chunking parameters (for backward compatibility) chunkAllowed: Optional[bool] = Field(default=None, description="Whether chunking is allowed") diff --git a/modules/datamodels/datamodelFileFolder.py b/modules/datamodels/datamodelFileFolder.py deleted file mode 100644 index 4829385e..00000000 --- a/modules/datamodels/datamodelFileFolder.py +++ /dev/null @@ -1,82 +0,0 @@ -# Copyright (c) 2025 Patrick Motsch -# All rights reserved. 
-"""FileFolder: hierarchical folder structure for file organization.""" - -from typing import Optional -from pydantic import BaseModel, Field -from modules.datamodels.datamodelBase import PowerOnModel -from modules.shared.i18nRegistry import i18nModel -import uuid - - -@i18nModel("Dateiordner") -class FileFolder(PowerOnModel): - """Hierarchischer Ordner fuer die Dateiverwaltung.""" - id: str = Field( - default_factory=lambda: str(uuid.uuid4()), - description="Primary key", - json_schema_extra={"label": "ID", "frontend_type": "text", "frontend_readonly": True, "frontend_required": False}, - ) - name: str = Field( - description="Folder name", - json_schema_extra={"label": "Name", "frontend_type": "text", "frontend_readonly": False, "frontend_required": True}, - ) - parentId: Optional[str] = Field( - default=None, - description="Parent folder ID (null = root)", - json_schema_extra={ - "label": "Uebergeordneter Ordner", - "frontend_type": "text", - "frontend_readonly": False, - "frontend_required": False, - "fk_target": {"db": "poweron_management", "table": "FileFolder", "labelField": "name"}, - }, - ) - mandateId: Optional[str] = Field( - default=None, - description="Mandate context", - json_schema_extra={ - "label": "Mandanten-ID", - "frontend_type": "text", - "frontend_readonly": True, - "frontend_required": False, - "fk_target": {"db": "poweron_app", "table": "Mandate", "labelField": "label"}, - }, - ) - featureInstanceId: Optional[str] = Field( - default=None, - description="Feature instance context", - json_schema_extra={ - "label": "Feature-Instanz-ID", - "frontend_type": "text", - "frontend_readonly": True, - "frontend_required": False, - "fk_target": {"db": "poweron_app", "table": "FeatureInstance", "labelField": "label"}, - }, - ) - scope: str = Field( - default="personal", - description="Data visibility scope: personal, featureInstance, mandate, global. Inherited by files in this folder.", - json_schema_extra={ - "label": "Sichtbarkeit", - "frontend_type": "select", - "frontend_readonly": False, - "frontend_required": False, - "frontend_options": [ - {"value": "personal", "label": "Persönlich"}, - {"value": "featureInstance", "label": "Feature-Instanz"}, - {"value": "mandate", "label": "Mandant"}, - {"value": "global", "label": "Global"}, - ], - }, - ) - neutralize: bool = Field( - default=False, - description="Whether files in this folder should be neutralized before AI processing. 
Inherited by new/moved files.", - json_schema_extra={ - "label": "Neutralisieren", - "frontend_type": "checkbox", - "frontend_readonly": False, - "frontend_required": False, - }, - ) diff --git a/modules/datamodels/datamodelFiles.py b/modules/datamodels/datamodelFiles.py index 82628e0c..2a547b9c 100644 --- a/modules/datamodels/datamodelFiles.py +++ b/modules/datamodels/datamodelFiles.py @@ -68,17 +68,6 @@ class FileItem(PowerOnModel): description="Tags for categorization and search", json_schema_extra={"label": "Tags", "frontend_type": "tags", "frontend_readonly": False, "frontend_required": False}, ) - folderId: Optional[str] = Field( - default=None, - description="ID of the parent folder", - json_schema_extra={ - "label": "Ordner-ID", - "frontend_type": "text", - "frontend_readonly": False, - "frontend_required": False, - "fk_target": {"db": "poweron_management", "table": "FileFolder", "labelField": "name"}, - }, - ) description: Optional[str] = Field( default=None, description="User-provided description of the file", diff --git a/modules/datamodels/datamodelKnowledge.py b/modules/datamodels/datamodelKnowledge.py index 163328a4..d0af2216 100644 --- a/modules/datamodels/datamodelKnowledge.py +++ b/modules/datamodels/datamodelKnowledge.py @@ -90,6 +90,16 @@ class FileContentIndex(PowerOnModel): description="Data visibility scope: personal, featureInstance, mandate, global", json_schema_extra={"label": "Sichtbarkeit"}, ) + sourceKind: str = Field( + default="file", + description="Origin of the indexed content: file, sharepoint_item, outlook_message, outlook_attachment, ...", + json_schema_extra={"label": "Quellenart"}, + ) + connectionId: Optional[str] = Field( + default=None, + description="UserConnection ID if this index entry originates from an external connector", + json_schema_extra={"label": "Connection-ID"}, + ) neutralizationStatus: Optional[str] = Field( default=None, description="Neutralization status: completed, failed, skipped, None = not required", diff --git a/modules/datamodels/datamodelPagination.py b/modules/datamodels/datamodelPagination.py index 2719327b..7bda7717 100644 --- a/modules/datamodels/datamodelPagination.py +++ b/modules/datamodels/datamodelPagination.py @@ -13,6 +13,42 @@ import math T = TypeVar('T') +# --------------------------------------------------------------------------- +# Table Grouping models +# --------------------------------------------------------------------------- + +class TableGroupNode(BaseModel): + """ + A single node in a user-defined group tree for a FormGeneratorTable. + + Items belong to exactly one group (no multi-membership). + Groups can be nested to arbitrary depth via subGroups. + """ + id: str + name: str + itemIds: List[str] = Field(default_factory=list) + subGroups: List['TableGroupNode'] = Field(default_factory=list) + order: int = 0 + isExpanded: bool = True + +TableGroupNode.model_rebuild() + + +class TableGrouping(BaseModel): + """ + Persisted grouping configuration for one (user, contextKey) pair. + Stored in table_groupings in poweron_app (auto-created). + + contextKey convention: API path without /api/ prefix and without trailing slash. + Examples: "connections", "prompts", "admin/users", "trustee/{instanceId}/documents" + """ + id: str + userId: str + contextKey: str + rootGroups: List[TableGroupNode] = Field(default_factory=list) + updatedAt: Optional[float] = None + + class SortField(BaseModel): """ Single sort field configuration. 
@@ -24,12 +60,23 @@ class SortField(BaseModel): class PaginationParams(BaseModel): """ Complete pagination state including page, sorting, and filters. + + Grouping extensions (both optional — omit when not using grouping): + groupId — Scope the request to items belonging to this group. + The backend resolves it to an itemIds IN-filter before + applying normal pagination/search/filter logic. + Also applied for mode=ids and mode=filterValues so that + bulk-select and filter-dropdowns respect the group scope. + saveGroupTree — If present the backend persists this tree for the current + (user, contextKey) pair *before* fetching, then returns + the confirmed tree in the response groupTree field. + Omit on every request that does not change the group tree. """ page: int = Field(ge=1, description="Current page number (1-based)") pageSize: int = Field(ge=1, le=1000, description="Number of items per page") sort: List[SortField] = Field(default_factory=list, description="List of sort fields in priority order") filters: Optional[Dict[str, Any]] = Field( - default=None, + default=None, description="""Filter criteria dictionary. Supports: - General search: {"search": "text"} - searches across all text fields (case-insensitive) - Field-specific filters: @@ -38,6 +85,14 @@ class PaginationParams(BaseModel): - Supported operators: equals/eq, contains, startsWith, endsWith, gt, gte, lt, lte, in, notIn - Multiple filters are combined with AND logic""" ) + groupId: Optional[str] = Field( + default=None, + description="Scope request to items of this group (resolved server-side to itemIds IN-filter)", + ) + saveGroupTree: Optional[List[Dict[str, Any]]] = Field( + default=None, + description="If set, persist this group tree before fetching (optimistic save)", + ) class PaginationRequest(BaseModel): @@ -74,10 +129,19 @@ class PaginationMetadata(BaseModel): class PaginatedResponse(BaseModel, Generic[T]): """ Response containing paginated data and metadata. + + groupTree is included when the endpoint supports table grouping and the + current user has a saved group tree for the requested contextKey. + It is None when grouping is not configured for the endpoint or the user + has not created any groups yet. Frontend must treat None as an empty tree. """ items: List[T] = Field(..., description="Array of items for current page") pagination: Optional[PaginationMetadata] = Field(..., description="Pagination metadata (None if pagination not applied)") - + groupTree: Optional[List[TableGroupNode]] = Field( + default=None, + description="Current group tree for this (user, contextKey) pair — None if no grouping configured", + ) + model_config = ConfigDict(arbitrary_types_allowed=True) @@ -85,29 +149,33 @@ def normalize_pagination_dict(pagination_dict: Dict[str, Any]) -> Dict[str, Any] """ Normalize pagination dictionary to handle frontend variations. Moves top-level "search" field into filters if present. - + Grouping fields (groupId, saveGroupTree) are passed through as-is. 
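+    Illustrative example (hypothetical payload): an incoming dict
+        {"page": 1, "pageSize": 25, "search": "foo", "groupId": "g-1"}
+    is normalized to
+        {"page": 1, "pageSize": 25, "filters": {"search": "foo"}, "groupId": "g-1"}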
+ Args: pagination_dict: Raw pagination dictionary from frontend - + Returns: Normalized pagination dictionary ready for PaginationParams parsing """ if not pagination_dict: return pagination_dict - + # Create a copy to avoid modifying the original normalized = dict(pagination_dict) - + # Ensure required fields have sensible defaults if "page" not in normalized: normalized["page"] = 1 if "pageSize" not in normalized: normalized["pageSize"] = 25 - + # Move top-level "search" into filters if present if "search" in normalized: if "filters" not in normalized or normalized["filters"] is None: normalized["filters"] = {} normalized["filters"]["search"] = normalized.pop("search") - + + # groupId / saveGroupTree are valid PaginationParams fields — pass through unchanged. + # No transformation needed; Pydantic will validate them. + return normalized diff --git a/modules/datamodels/datamodelUam.py b/modules/datamodels/datamodelUam.py index 0f7fe6b8..6aba24eb 100644 --- a/modules/datamodels/datamodelUam.py +++ b/modules/datamodels/datamodelUam.py @@ -475,7 +475,23 @@ class UserConnection(PowerOnModel): description="OAuth scopes granted for this connection", json_schema_extra={"frontend_type": "list", "frontend_readonly": True, "frontend_required": False, "label": "Gewährte Berechtigungen"}, ) - + knowledgeIngestionEnabled: bool = Field( + default=False, + description="Whether the user has consented to knowledge ingestion for this connection", + json_schema_extra={"frontend_type": "boolean", "frontend_readonly": False, "frontend_required": False, "label": "Wissensdatenbank aktiv"}, + ) + knowledgePreferences: Optional[Dict[str, Any]] = Field( + default=None, + description=( + "Per-connection knowledge ingestion preferences. schemaVersion=1 keys: " + "neutralizeBeforeEmbed (bool), mailContentDepth (metadata|snippet|full), " + "mailIndexAttachments (bool), filesIndexBinaries (bool), mimeAllowlist (list[str]), " + "clickupScope (titles|title_description|with_comments), " + "surfaceToggles (dict per authority), maxAgeDays (int)." 
+ ), + json_schema_extra={"frontend_type": "json", "frontend_readonly": False, "frontend_required": False, "label": "Wissenspräferenzen"}, + ) + @computed_field @property def connectionReference(self) -> str: diff --git a/modules/features/commcoach/serviceCommcoachIndexer.py b/modules/features/commcoach/serviceCommcoachIndexer.py index b43764a1..2f042795 100644 --- a/modules/features/commcoach/serviceCommcoachIndexer.py +++ b/modules/features/commcoach/serviceCommcoachIndexer.py @@ -174,14 +174,26 @@ async def indexSessionData( for c in chunks ] - await knowledgeService.indexFile( - fileId=syntheticFileId, - fileName=f"coaching-session-{sessionId[:8]}", - mimeType="application/x-coaching-session", - userId=userId, - featureInstanceId=featureInstanceId, - mandateId=mandateId, - contentObjects=contentObjects, + from modules.serviceCenter.services.serviceKnowledge.mainServiceKnowledge import IngestionJob + + await knowledgeService.requestIngestion( + IngestionJob( + sourceKind="coaching_session", + sourceId=syntheticFileId, + fileName=f"coaching-session-{sessionId[:8]}", + mimeType="application/x-coaching-session", + userId=userId, + featureInstanceId=featureInstanceId, + mandateId=mandateId, + contentObjects=contentObjects, + provenance={ + "lane": "feature", + "feature": "commcoach", + "sessionId": sessionId, + "contextId": contextId, + "messageCount": len(messages or []), + }, + ) ) logger.info(f"Successfully indexed coaching session {sessionId} ({len(chunks)} chunks)") except Exception as e: diff --git a/modules/features/graphicalEditor/portTypes.py b/modules/features/graphicalEditor/portTypes.py index e8d5b48d..f1513f9e 100644 --- a/modules/features/graphicalEditor/portTypes.py +++ b/modules/features/graphicalEditor/portTypes.py @@ -83,7 +83,7 @@ PORT_TYPE_CATALOG: Dict[str, PortSchema] = { PortField(name="listId", type="str", description="ClickUp-Listen-ID"), PortField(name="name", type="str", required=False, description="Listenname"), PortField(name="spaceId", type="str", required=False, description="Space-ID"), - PortField(name="folderId", type="str", required=False, description="Ordner-ID"), + PortField(name="groupId", type="str", required=False, description="Gruppen-ID für die Gruppierungszuordnung"), PortField(name="connection", type="ConnectionRef", required=False, description="ClickUp-Verbindung"), ]), diff --git a/modules/features/workspace/routeFeatureWorkspace.py b/modules/features/workspace/routeFeatureWorkspace.py index 5b0d4d7a..9595fee4 100644 --- a/modules/features/workspace/routeFeatureWorkspace.py +++ b/modules/features/workspace/routeFeatureWorkspace.py @@ -1208,7 +1208,7 @@ async def patchWorkspaceWorkflowAttachments( # --------------------------------------------------------------------------- -# File and folder list endpoints +# File endpoints # --------------------------------------------------------------------------- @router.get("/{instanceId}/files") @@ -1216,7 +1216,6 @@ async def patchWorkspaceWorkflowAttachments( async def listWorkspaceFiles( request: Request, instanceId: str = Path(...), - folderId: Optional[str] = Query(None), tags: Optional[str] = Query(None), search: Optional[str] = Query(None), context: RequestContext = Depends(getRequestContext), @@ -1271,30 +1270,6 @@ async def getFileContent( return Response(content=content, media_type=mimeType) -@router.get("/{instanceId}/folders") -@limiter.limit("300/minute") -async def listWorkspaceFolders( - request: Request, - instanceId: str = Path(...), - parentId: Optional[str] = Query(None), - context: 
RequestContext = Depends(getRequestContext), -): - _mandateId, _ = _validateInstanceAccess(instanceId, context) - try: - from modules.serviceCenter import getService - from modules.serviceCenter.context import ServiceCenterContext - ctx = ServiceCenterContext( - user=context.user, - mandate_id=_mandateId or "", - feature_instance_id=instanceId, - ) - chatService = getService("chat", ctx) - folders = chatService.listFolders(parentId=parentId) - return JSONResponse({"folders": folders or []}) - except Exception: - return JSONResponse({"folders": []}) - - @router.get("/{instanceId}/datasources") @limiter.limit("300/minute") async def listWorkspaceDataSources( diff --git a/modules/interfaces/interfaceDbApp.py b/modules/interfaces/interfaceDbApp.py index 51519a29..6f1d9487 100644 --- a/modules/interfaces/interfaceDbApp.py +++ b/modules/interfaces/interfaceDbApp.py @@ -1268,19 +1268,7 @@ class AppObjects: result = [] for conn_dict in connections: try: - # Create UserConnection object - connection = UserConnection( - id=conn_dict["id"], - userId=conn_dict["userId"], - authority=conn_dict.get("authority"), - externalId=conn_dict.get("externalId", ""), - externalUsername=conn_dict.get("externalUsername", ""), - externalEmail=conn_dict.get("externalEmail"), - status=conn_dict.get("status", "pending"), - connectedAt=conn_dict.get("connectedAt"), - lastChecked=conn_dict.get("lastChecked"), - expiresAt=conn_dict.get("expiresAt"), - ) + connection = UserConnection.model_validate(conn_dict) result.append(connection) except Exception as e: logger.error( @@ -1293,6 +1281,28 @@ class AppObjects: logger.error(f"Error getting user connections: {str(e)}") return [] + def getActiveKnowledgeConnections(self) -> List[UserConnection]: + """Return all UserConnections with knowledgeIngestionEnabled=True and status=active. + + Used by the daily re-sync scheduler to determine which connections to re-index. 
+ """ + try: + rows = self.db.getRecordset( + UserConnection, + recordFilter={"knowledgeIngestionEnabled": True, "status": ConnectionStatus.ACTIVE.value}, + ) + result = [] + for row in rows or []: + try: + conn = UserConnection.model_validate(row) if isinstance(row, dict) else row + result.append(conn) + except Exception as _e: + logger.warning(f"getActiveKnowledgeConnections: could not parse row: {_e}") + return result + except Exception as e: + logger.error(f"getActiveKnowledgeConnections failed: {e}") + return [] + def getUserConnectionById(self, connectionId: str) -> Optional[UserConnection]: """Get a single UserConnection by ID or by reference string (connection:authority:username).""" try: @@ -1317,18 +1327,21 @@ class AppObjects: if connections: conn_dict = connections[0] - return UserConnection( - id=conn_dict["id"], - userId=conn_dict["userId"], - authority=conn_dict.get("authority"), - externalId=conn_dict.get("externalId", ""), - externalUsername=conn_dict.get("externalUsername", ""), - externalEmail=conn_dict.get("externalEmail"), - status=conn_dict.get("status", "pending"), - connectedAt=conn_dict.get("connectedAt"), - lastChecked=conn_dict.get("lastChecked"), - expiresAt=conn_dict.get("expiresAt"), - ) + try: + return UserConnection.model_validate(conn_dict) + except Exception: + return UserConnection( + id=conn_dict["id"], + userId=conn_dict["userId"], + authority=conn_dict.get("authority"), + externalId=conn_dict.get("externalId", ""), + externalUsername=conn_dict.get("externalUsername", ""), + externalEmail=conn_dict.get("externalEmail"), + status=conn_dict.get("status", "pending"), + connectedAt=conn_dict.get("connectedAt"), + lastChecked=conn_dict.get("lastChecked"), + expiresAt=conn_dict.get("expiresAt"), + ) return None except Exception as e: logger.error(f"Error getting user connection by ID: {str(e)}") @@ -4014,6 +4027,59 @@ class AppObjects: logger.error(f"Error deleting role {roleId}: {str(e)}") raise + # ------------------------------------------------------------------------- + # Table Grouping (user-defined groups for FormGeneratorTable instances) + # ------------------------------------------------------------------------- + + def getTableGrouping(self, contextKey: str): + """ + Load the group tree for the current user and the given contextKey. + + Returns a TableGrouping instance or None if no grouping has been saved yet. + contextKey identifies the table instance, e.g. "connections", "prompts", + "admin/users", "trustee/{instanceId}/documents". + """ + from modules.datamodels.datamodelPagination import TableGrouping + try: + records = self.db.getRecordset( + TableGrouping, + recordFilter={"userId": str(self.userId), "contextKey": contextKey}, + ) + if not records: + return None + row = records[0] + return TableGrouping.model_validate(row) if isinstance(row, dict) else row + except Exception as e: + logger.error(f"getTableGrouping failed for user={self.userId} key={contextKey}: {e}") + return None + + def upsertTableGrouping(self, contextKey: str, rootGroups: list): + """ + Create or replace the group tree for the current user and contextKey. + + rootGroups is a list of TableGroupNode-compatible dicts (the full tree). + Returns the saved TableGrouping instance. 
+ """ + from modules.datamodels.datamodelPagination import TableGrouping + from modules.shared.timeUtils import getUtcTimestamp + try: + existing = self.getTableGrouping(contextKey) + data = { + "id": existing.id if existing else str(uuid.uuid4()), + "userId": str(self.userId), + "contextKey": contextKey, + "rootGroups": rootGroups, + "updatedAt": getUtcTimestamp(), + } + if existing: + self.db.recordModify(TableGrouping, existing.id, data) + else: + self.db.recordCreate(TableGrouping, data) + return TableGrouping.model_validate(data) + except Exception as e: + logger.error(f"upsertTableGrouping failed for user={self.userId} key={contextKey}: {e}") + raise + # Public Methods diff --git a/modules/interfaces/interfaceDbKnowledge.py b/modules/interfaces/interfaceDbKnowledge.py index 9d6ba3d4..c2f79b67 100644 --- a/modules/interfaces/interfaceDbKnowledge.py +++ b/modules/interfaces/interfaceDbKnowledge.py @@ -93,6 +93,46 @@ class KnowledgeObjects: self.db.recordModify(FileContentIndex, fileId, {"status": status}) return True + def deleteFileContentIndexByConnectionId(self, connectionId: str) -> Dict[str, int]: + """Delete all FileContentIndex rows (and their ContentChunks) for a connection. + + Used when a UserConnection is revoked / disconnected so the knowledge corpus + no longer references data the user no longer grants access to. Returns a dict + with counts to support observability logs. + """ + if not connectionId: + return {"indexRows": 0, "chunks": 0} + + rows = self.db.getRecordset( + FileContentIndex, recordFilter={"connectionId": connectionId} + ) + mandateIds: set = set() + chunkCount = 0 + indexCount = 0 + for row in rows: + fid = row.get("id") if isinstance(row, dict) else getattr(row, "id", None) + mid = row.get("mandateId") if isinstance(row, dict) else getattr(row, "mandateId", "") + if not fid: + continue + chunks = self.db.getRecordset(ContentChunk, recordFilter={"fileId": fid}) + for chunk in chunks: + if self.db.recordDelete(ContentChunk, chunk["id"]): + chunkCount += 1 + if self.db.recordDelete(FileContentIndex, fid): + indexCount += 1 + if mid: + mandateIds.add(str(mid)) + + for mid in mandateIds: + try: + from modules.interfaces.interfaceDbBilling import _getRootInterface + + _getRootInterface().reconcileMandateStorageBilling(mid) + except Exception as ex: + logger.warning("reconcileMandateStorageBilling after connection purge failed: %s", ex) + + return {"indexRows": indexCount, "chunks": chunkCount} + def deleteFileContentIndex(self, fileId: str) -> bool: """Delete a FileContentIndex and all associated ContentChunks.""" existing = self.getFileContentIndex(fileId) diff --git a/modules/interfaces/interfaceDbManagement.py b/modules/interfaces/interfaceDbManagement.py index f72597b3..b263c98b 100644 --- a/modules/interfaces/interfaceDbManagement.py +++ b/modules/interfaces/interfaceDbManagement.py @@ -20,7 +20,6 @@ from modules.security.rbac import RbacClass from modules.datamodels.datamodelRbac import AccessRuleContext from modules.datamodels.datamodelUam import AccessLevel from modules.datamodels.datamodelFiles import FilePreview, FileItem, FileData -from modules.datamodels.datamodelFileFolder import FileFolder from modules.datamodels.datamodelUtils import Prompt from modules.datamodels.datamodelMessaging import ( MessagingSubscription, @@ -1103,15 +1102,12 @@ class ComponentObjects: return newfileName counter += 1 - def createFile(self, name: str, mimeType: str, content: bytes, folderId: Optional[str] = None) -> FileItem: + def createFile(self, name: str, mimeType: 
str, content: bytes) -> FileItem: """Creates a new file entry if user has permission. Computes fileHash and fileSize from content. Duplicate check: if a file with the same user + fileHash + fileName already exists, the existing file is returned instead of creating a new one. Same hash with different name is allowed (intentional copy by user). - - Args: - folderId: Optional parent folder ID. None/empty means the root folder. """ if not self.checkRbacPermission(FileItem, "create"): raise PermissionError("No permission to create files") @@ -1139,11 +1135,6 @@ class ComponentObjects: else: scope = "personal" - # Normalize folderId: treat empty string as "no folder" (= root) – NULL in DB - normalizedFolderId: Optional[str] = folderId - if isinstance(normalizedFolderId, str) and not normalizedFolderId.strip(): - normalizedFolderId = None - fileItem = FileItem( mandateId=mandateId, featureInstanceId=featureInstanceId, @@ -1152,7 +1143,6 @@ class ComponentObjects: mimeType=mimeType, fileSize=fileSize, fileHash=fileHash, - folderId=normalizedFolderId, ) # Store in database @@ -1277,382 +1267,47 @@ class ComponentObjects: self.db.connection.rollback() raise FileDeletionError(f"Error deleting files in batch: {str(e)}") - # ---- Folder methods ---- - - _RESERVED_FOLDER_NAMES = {"(Global)"} - - def _validateFolderName(self, name: str, parentId: Optional[str], excludeFolderId: Optional[str] = None): - """Ensures folder name is not reserved and is unique within parent.""" - if name in self._RESERVED_FOLDER_NAMES: - raise ValueError(f"Folder name '{name}' is reserved") - if not name or not name.strip(): - raise ValueError("Folder name cannot be empty") - existingFolders = self.db.getRecordset(FileFolder, recordFilter={"parentId": parentId or ""}) - for f in existingFolders: - if f.get("name") == name and f.get("id") != excludeFolderId: - raise ValueError(f"Folder '{name}' already exists in this directory") - - def _isDescendantOf(self, folderId: str, ancestorId: str) -> bool: - """Checks if folderId is a descendant of ancestorId (circular reference check).""" - visited = set() - currentId = folderId - while currentId: - if currentId == ancestorId: - return True - if currentId in visited: - break - visited.add(currentId) - folders = self.db.getRecordset(FileFolder, recordFilter={"id": currentId}) - if not folders: - break - currentId = folders[0].get("parentId") - return False - - def _ensureFeatureInstanceFolder(self, featureInstanceId: str, mandateId: str = "") -> Optional[str]: - """Return the folder ID for a feature instance, creating it on first use. - The folder is named after the feature instance label.""" - existing = self.db.getRecordset( - FileFolder, - recordFilter={ - "featureInstanceId": featureInstanceId, - "sysCreatedBy": self.userId or "", - }, - ) - if existing: - return existing[0].get("id") - - # Resolve the instance label for the folder name - folderName = featureInstanceId[:8] + def _ensureFeatureInstanceGroup(self, featureInstanceId: str, contextKey: str = "files/list") -> Optional[str]: + """Return the groupId of the default group for a feature instance. 
+ Creates the group if it doesn't exist yet.""" try: - from modules.datamodels.datamodelFeatures import FeatureInstance - from modules.security.rootAccess import getRootDbAppConnector - dbApp = getRootDbAppConnector() - instances = dbApp.getRecordset(FeatureInstance, recordFilter={"id": featureInstanceId}) - if instances: - folderName = instances[0].get("label") or folderName + import modules.interfaces.interfaceDbApp as _appIface + appInterface = _appIface.getInterface(self._currentUser) + existing = appInterface.getTableGrouping(contextKey) + nodes = [n.model_dump() if hasattr(n, 'model_dump') else (n if isinstance(n, dict) else vars(n)) for n in (existing.rootGroups if existing else [])] + # Look for group with name matching featureInstanceId + def _find(nds): + for nd in nds: + nid = nd.get("id") if isinstance(nd, dict) else getattr(nd, "id", None) + nmeta = nd.get("meta", {}) if isinstance(nd, dict) else getattr(nd, "meta", {}) + if (nmeta or {}).get("featureInstanceId") == featureInstanceId: + return nid + subs = nd.get("subGroups", []) if isinstance(nd, dict) else getattr(nd, "subGroups", []) + result = _find(subs) + if result: + return result + return None + found = _find(nodes) + if found: + return found + # Create new group + import uuid + newId = str(uuid.uuid4()) + newGroup = { + "id": newId, + "name": featureInstanceId, + "itemIds": [], + "subGroups": [], + "meta": {"featureInstanceId": featureInstanceId}, + } + nodes.append(newGroup) + appInterface.upsertTableGrouping(contextKey, nodes) + return newId except Exception as e: - logger.warning(f"Could not resolve feature instance label: {e}") + logger.error(f"_ensureFeatureInstanceGroup failed: {e}") + return None - folder = FileFolder( - name=folderName, - parentId=None, - mandateId=mandateId, - featureInstanceId=featureInstanceId, - ) - created = self.db.recordCreate(FileFolder, folder) - return created.get("id") if isinstance(created, dict) else getattr(created, "id", None) - - def getFolder(self, folderId: str) -> Optional[Dict[str, Any]]: - """Returns a folder by ID if it belongs to the current user.""" - folders = self.db.getRecordset(FileFolder, recordFilter={"id": folderId, "sysCreatedBy": self.userId or ""}) - return folders[0] if folders else None - - def listFolders(self, parentId: Optional[str] = None) -> List[Dict[str, Any]]: - """List folders visible to the current user. - Own folders are always returned. Other users' folders are only - returned when they contain files visible to the current user. 
- Each folder is enriched with ``fileCount``.""" - recordFilter = {} - if parentId is not None: - recordFilter["parentId"] = parentId - folders = self.db.getRecordset(FileFolder, recordFilter=recordFilter if recordFilter else None) - - if not folders: - return folders - - folderIds = [f["id"] for f in folders if f.get("id")] - fileCounts: Dict[str, int] = {} - try: - from modules.interfaces.interfaceRbac import buildFilesScopeWhereClause - scopeClause = buildFilesScopeWhereClause( - self.currentUser, "FileItem", self.db, - self.mandateId, self.featureInstanceId, - [], [], - ) - - self.db._ensure_connection() - with self.db.connection.cursor() as cursor: - baseQuery = ( - 'SELECT "folderId", COUNT(*) AS cnt ' - 'FROM "FileItem" ' - 'WHERE "folderId" = ANY(%s)' - ) - queryValues: list = [folderIds] - - if scopeClause: - baseQuery += ' AND (' + scopeClause["condition"] + ')' - queryValues.extend(scopeClause["values"]) - - baseQuery += ' GROUP BY "folderId"' - cursor.execute(baseQuery, queryValues) - for row in cursor.fetchall(): - fileCounts[row["folderId"]] = row["cnt"] - except Exception as e: - logger.warning(f"Could not count files per folder: {e}") - - userId = self.userId or "" - result = [] - for folder in folders: - fc = fileCounts.get(folder.get("id", ""), 0) - folder["fileCount"] = fc - isOwn = folder.get("sysCreatedBy") == userId - if isOwn or fc > 0: - result.append(folder) - - return result - - def createFolder(self, name: str, parentId: Optional[str] = None) -> Dict[str, Any]: - """Create a new folder with unique name validation.""" - self._validateFolderName(name, parentId) - folder = FileFolder( - name=name, - parentId=parentId, - mandateId=self.mandateId or "", - featureInstanceId=self.featureInstanceId or "", - ) - return self.db.recordCreate(FileFolder, folder) - - def renameFolder(self, folderId: str, newName: str) -> bool: - """Rename a folder with unique name validation.""" - folder = self.getFolder(folderId) - if not folder: - raise FileNotFoundError(f"Folder {folderId} not found") - self._validateFolderName(newName, folder.get("parentId"), excludeFolderId=folderId) - return self.db.recordModify(FileFolder, folderId, {"name": newName}) - - def updateFolder(self, folderId: str, updateData: Dict[str, Any]) -> bool: - """ - Update folder metadata (e.g. ``scope``, ``neutralize``). Owner-only, - same access model as renameFolder/moveFolder. Use ``renameFolder`` for - ``name`` changes (uniqueness validation) and ``moveFolder`` for - ``parentId`` changes (cycle/uniqueness validation). 
- """ - if not updateData: - return True - folder = self.getFolder(folderId) - if not folder: - raise FileNotFoundError(f"Folder {folderId} not found") - forbiddenKeys = {"id", "sysCreatedBy", "sysCreatedAt", "sysUpdatedAt"} - cleaned: Dict[str, Any] = {k: v for k, v in updateData.items() if k not in forbiddenKeys} - if "name" in cleaned: - self._validateFolderName(cleaned["name"], folder.get("parentId"), excludeFolderId=folderId) - return self.db.recordModify(FileFolder, folderId, cleaned) - - def moveFolder(self, folderId: str, targetParentId: Optional[str] = None) -> bool: - """Move a folder to a new parent, with circular reference and unique name checks.""" - folder = self.getFolder(folderId) - if not folder: - raise FileNotFoundError(f"Folder {folderId} not found") - if targetParentId and self._isDescendantOf(targetParentId, folderId): - raise ValueError("Cannot move folder into its own subtree") - self._validateFolderName(folder.get("name", ""), targetParentId, excludeFolderId=folderId) - return self.db.recordModify(FileFolder, folderId, {"parentId": targetParentId}) - - def moveFilesBatch(self, fileIds: List[str], targetFolderId: Optional[str] = None) -> Dict[str, Any]: - """Move multiple files with one SQL update. - Owner can always move; non-owners need RBAC ALL level.""" - uniqueIds = [str(fid) for fid in dict.fromkeys(fileIds or []) if fid] - if not uniqueIds: - return {"movedFiles": 0} - - if targetFolderId: - targetFolder = self.getFolder(targetFolderId) - if not targetFolder: - raise FileNotFoundError(f"Target folder {targetFolderId} not found") - - try: - self.db._ensure_connection() - with self.db.connection.cursor() as cursor: - cursor.execute( - 'SELECT "id", "sysCreatedBy" FROM "FileItem" WHERE "id" = ANY(%s)', - (uniqueIds,), - ) - rows = cursor.fetchall() - foundIds = {row["id"] for row in rows} - missing = sorted(set(uniqueIds) - foundIds) - if missing: - raise FileNotFoundError(f"Files not found: {missing}") - - for row in rows: - self._requireFileWriteAccess(row, row["id"], "update") - - accessibleIds = [row["id"] for row in rows] - cursor.execute( - 'UPDATE "FileItem" SET "folderId" = %s, "sysModifiedAt" = %s, "sysModifiedBy" = %s ' - 'WHERE "id" = ANY(%s)', - (targetFolderId, getUtcTimestamp(), self.userId or "", accessibleIds), - ) - movedFiles = cursor.rowcount - - self.db.connection.commit() - return {"movedFiles": movedFiles} - except Exception as e: - logger.error(f"Error moving files in batch: {e}") - self.db.connection.rollback() - raise FileError(f"Error moving files in batch: {str(e)}") - - def moveFoldersBatch(self, folderIds: List[str], targetParentId: Optional[str] = None) -> Dict[str, Any]: - """Move multiple folders with one SQL update after validation.""" - uniqueIds = [str(fid) for fid in dict.fromkeys(folderIds or []) if fid] - if not uniqueIds: - return {"movedFolders": 0} - - foldersToMove: List[Dict[str, Any]] = [] - for folderId in uniqueIds: - folder = self.getFolder(folderId) - if not folder: - raise FileNotFoundError(f"Folder {folderId} not found") - if targetParentId and self._isDescendantOf(targetParentId, folderId): - raise ValueError("Cannot move folder into its own subtree") - foldersToMove.append(folder) - - existingInTarget = self.db.getRecordset( - FileFolder, - recordFilter={"parentId": targetParentId or "", "sysCreatedBy": self.userId or ""}, - ) - existingNames = {f.get("name"): f.get("id") for f in existingInTarget} - movingNames: Dict[str, str] = {} - movingIds = set(uniqueIds) - - for folder in foldersToMove: - name = 
folder.get("name", "") - folderId = folder.get("id") - if name in movingNames and movingNames[name] != folderId: - raise ValueError(f"Folder '{name}' already exists in this move batch") - movingNames[name] = folderId - - existingId = existingNames.get(name) - if existingId and existingId not in movingIds: - raise ValueError(f"Folder '{name}' already exists in target directory") - - try: - self.db._ensure_connection() - with self.db.connection.cursor() as cursor: - cursor.execute( - 'UPDATE "FileFolder" SET "parentId" = %s, "sysModifiedAt" = %s, "sysModifiedBy" = %s ' - 'WHERE "id" = ANY(%s) AND "sysCreatedBy" = %s', - (targetParentId, getUtcTimestamp(), self.userId or "", uniqueIds, self.userId or ""), - ) - movedFolders = cursor.rowcount - - self.db.connection.commit() - return {"movedFolders": movedFolders} - except Exception as e: - logger.error(f"Error moving folders in batch: {e}") - self.db.connection.rollback() - raise FileError(f"Error moving folders in batch: {str(e)}") - - def deleteFolder(self, folderId: str, recursive: bool = False) -> Dict[str, Any]: - """Delete a folder. If recursive, deletes all contents. Returns summary of deletions.""" - folder = self.getFolder(folderId) - if not folder: - raise FileNotFoundError(f"Folder {folderId} not found") - - childFolders = self.db.getRecordset(FileFolder, recordFilter={"parentId": folderId, "sysCreatedBy": self.userId or ""}) - childFiles = self._getFilesByCurrentUser(recordFilter={"folderId": folderId}) - - if not recursive and (childFolders or childFiles): - raise ValueError( - f"Folder '{folder.get('name')}' is not empty " - f"({len(childFiles)} files, {len(childFolders)} subfolders). " - f"Use recursive=true to delete contents." - ) - - deletedFiles = 0 - deletedFolders = 0 - - if recursive: - for subFolder in childFolders: - subResult = self.deleteFolder(subFolder["id"], recursive=True) - deletedFiles += subResult.get("deletedFiles", 0) - deletedFolders += subResult.get("deletedFolders", 0) - for childFile in childFiles: - try: - self.deleteFile(childFile["id"]) - deletedFiles += 1 - except Exception as e: - logger.warning(f"Failed to delete file {childFile['id']} during folder deletion: {e}") - - self.db.recordDelete(FileFolder, folderId) - deletedFolders += 1 - - return {"deletedFiles": deletedFiles, "deletedFolders": deletedFolders} - - def deleteFoldersBatch(self, folderIds: List[str], recursive: bool = True) -> Dict[str, Any]: - """Delete multiple folders and their content in batched SQL calls.""" - uniqueIds = [str(fid) for fid in dict.fromkeys(folderIds or []) if fid] - if not uniqueIds: - return {"deletedFiles": 0, "deletedFolders": 0} - - if not recursive: - deletedFiles = 0 - deletedFolders = 0 - for folderId in uniqueIds: - result = self.deleteFolder(folderId, recursive=False) - deletedFiles += result.get("deletedFiles", 0) - deletedFolders += result.get("deletedFolders", 0) - return {"deletedFiles": deletedFiles, "deletedFolders": deletedFolders} - - try: - self.db._ensure_connection() - with self.db.connection.cursor() as cursor: - cursor.execute( - 'SELECT "id" FROM "FileFolder" WHERE "id" = ANY(%s) AND "sysCreatedBy" = %s', - (uniqueIds, self.userId or ""), - ) - rootAccessibleIds = [row["id"] for row in cursor.fetchall()] - if len(rootAccessibleIds) != len(uniqueIds): - missingIds = sorted(set(uniqueIds) - set(rootAccessibleIds)) - raise FileNotFoundError(f"Folders not found or not accessible: {missingIds}") - - cursor.execute( - """ - WITH RECURSIVE folder_tree AS ( - SELECT "id" - FROM "FileFolder" - WHERE 
"id" = ANY(%s) AND "sysCreatedBy" = %s - UNION ALL - SELECT child."id" - FROM "FileFolder" child - INNER JOIN folder_tree ft ON child."parentId" = ft."id" - WHERE child."sysCreatedBy" = %s - ) - SELECT DISTINCT "id" FROM folder_tree - """, - (rootAccessibleIds, self.userId or "", self.userId or ""), - ) - allFolderIds = [row["id"] for row in cursor.fetchall()] - - cursor.execute( - 'SELECT "id" FROM "FileItem" WHERE "folderId" = ANY(%s) AND "sysCreatedBy" = %s', - (allFolderIds, self.userId or ""), - ) - allFileIds = [row["id"] for row in cursor.fetchall()] - - if allFileIds: - cursor.execute('DELETE FROM "FileData" WHERE "id" = ANY(%s)', (allFileIds,)) - cursor.execute( - 'DELETE FROM "FileItem" WHERE "id" = ANY(%s) AND "sysCreatedBy" = %s', - (allFileIds, self.userId or ""), - ) - deletedFiles = cursor.rowcount - else: - deletedFiles = 0 - - cursor.execute( - 'DELETE FROM "FileFolder" WHERE "id" = ANY(%s) AND "sysCreatedBy" = %s', - (allFolderIds, self.userId or ""), - ) - deletedFolders = cursor.rowcount - - self.db.connection.commit() - return {"deletedFiles": deletedFiles, "deletedFolders": deletedFolders} - except Exception as e: - logger.error(f"Error deleting folders in batch: {e}") - self.db.connection.rollback() - raise FileDeletionError(f"Error deleting folders in batch: {str(e)}") - - def copyFile(self, sourceFileId: str, targetFolderId: Optional[str] = None, newFileName: Optional[str] = None) -> FileItem: + def copyFile(self, sourceFileId: str, newFileName: Optional[str] = None) -> FileItem: """Create a full duplicate of a file (FileItem + FileData).""" sourceFile = self.getFile(sourceFileId) if not sourceFile: @@ -1665,11 +1320,6 @@ class ComponentObjects: fileName = newFileName or sourceFile.fileName copiedFile = self.createFile(fileName, sourceFile.mimeType, sourceData) - if targetFolderId: - self.updateFile(copiedFile.id, {"folderId": targetFolderId}) - elif sourceFile.folderId: - self.updateFile(copiedFile.id, {"folderId": sourceFile.folderId}) - self.createFileData(copiedFile.id, sourceData) return copiedFile @@ -1884,18 +1534,14 @@ class ComponentObjects: logger.error(f"Error getting file content: {str(e)}") return None - def saveUploadedFile(self, fileContent: bytes, fileName: str, folderId: Optional[str] = None) -> tuple[FileItem, str]: - """Saves an uploaded file if user has permission. - - Args: - folderId: Optional parent folder ID. None means root folder. 
- """ + def saveUploadedFile(self, fileContent: bytes, fileName: str) -> tuple[FileItem, str]: + """Saves an uploaded file if user has permission.""" try: # Check file creation permission if not self.checkRbacPermission(FileItem, "create"): raise PermissionError("No permission to upload files") - logger.debug(f"Starting upload process for file: {fileName} (folderId={folderId!r})") + logger.debug(f"Starting upload process for file: {fileName}") if not isinstance(fileContent, bytes): logger.error(f"Invalid fileContent type: {type(fileContent)}") @@ -1921,7 +1567,6 @@ class ComponentObjects: name=fileName, mimeType=mimeType, content=fileContent, - folderId=folderId, ) # Save binary data diff --git a/modules/interfaces/interfaceRbac.py b/modules/interfaces/interfaceRbac.py index ad2ac6b5..8ecc51fd 100644 --- a/modules/interfaces/interfaceRbac.py +++ b/modules/interfaces/interfaceRbac.py @@ -204,7 +204,6 @@ TABLE_NAMESPACE = { # Files - benutzer-eigen "FileItem": "files", "FileData": "files", - "FileFolder": "files", # Automation - benutzer-eigen "AutomationDefinition": "automation", "AutomationTemplate": "automation", @@ -529,8 +528,7 @@ def getRecordsetPaginatedWithRBAC( if val is None: # val=None in pagination.filters means "match empty/null" # (same convention as connectorDbPostgre._buildPaginationClauses). - # Covers both historical empty-string values and true NULLs - # e.g. root-folder files where folderId may be "" or NULL. + # Covers both historical empty-string values and true NULLs. whereConditions.append(f'("{key}" IS NULL OR "{key}"::TEXT = \'\')') continue if isinstance(val, dict): @@ -689,8 +687,7 @@ def getDistinctColumnValuesWithRBAC( if val is None: # val=None in pagination.filters means "match empty/null" # (same convention as connectorDbPostgre._buildPaginationClauses). - # Covers both historical empty-string values and true NULLs - # e.g. root-folder files where folderId may be "" or NULL. + # Covers both historical empty-string values and true NULLs. whereConditions.append(f'("{key}" IS NULL OR "{key}"::TEXT = \'\')') continue if isinstance(val, dict): diff --git a/modules/migrations/__init__.py b/modules/migrations/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/modules/migrations/migrate_folders_to_groups.py b/modules/migrations/migrate_folders_to_groups.py new file mode 100644 index 00000000..870e1e45 --- /dev/null +++ b/modules/migrations/migrate_folders_to_groups.py @@ -0,0 +1,240 @@ +""" +One-time migration: Convert FileFolder tree + FileItem.folderId → table_groupings. + +Run this BEFORE dropping the physical FileFolder table and FileItem.folderId column +from the database (those are separate Alembic/SQL steps). + +Usage: + python -m modules.migrations.migrate_folders_to_groups [--dry-run] [--verbose] + +Steps: + 1. For each distinct (userId, mandateId) combination that has FileFolder records: + a. Build the full folder tree (recursive) + b. Write it as a TableGroupNode tree into table_groupings (contextKey='files/list') + – merges with any existing groups rather than overwriting + c. For each FileItem with a folderId that maps into this tree, + add its id to the matching group's itemIds + 2. Print a summary (rows migrated, groups created, files assigned) + 3. If not --dry-run: commits the inserts/updates + NOTE: Schema changes (ALTER TABLE DROP COLUMN, DROP TABLE) are intentionally + NOT performed by this script. Run the corresponding Alembic migration + (migrations/versions/xxxx_drop_folder_columns.py) afterwards. 
+""" + +import argparse +import json +import logging +import uuid +from typing import Optional + +logger = logging.getLogger(__name__) + + +# ── Helpers ────────────────────────────────────────────────────────────────── + +def _build_tree(folders: list, parent_id: Optional[str]) -> list: + """Recursively build TableGroupNode-compatible dicts from a flat folder list.""" + children = [f for f in folders if f.get("parentId") == parent_id] + result = [] + for folder in children: + node = { + "id": str(uuid.uuid4()), + "name": folder["name"], + "itemIds": [], + "subGroups": _build_tree(folders, folder["id"]), + "meta": {"migratedFromFolderId": folder["id"]}, + } + result.append(node) + return result + + +def _assign_files_to_nodes(nodes: list, files_by_folder: dict) -> list: + """Recursively assign file IDs to group nodes based on folder mapping.""" + for node in nodes: + folder_id = (node.get("meta") or {}).get("migratedFromFolderId") + if folder_id and folder_id in files_by_folder: + node["itemIds"] = list(files_by_folder[folder_id]) + node["subGroups"] = _assign_files_to_nodes(node.get("subGroups", []), files_by_folder) + return nodes + + +def _count_items(nodes: list) -> int: + total = 0 + for node in nodes: + total += len(node.get("itemIds", [])) + total += _count_items(node.get("subGroups", [])) + return total + + +def _now_ts() -> str: + from modules.shared.timeUtils import getUtcTimestamp + return getUtcTimestamp() + + +# ── Main migration ──────────────────────────────────────────────────────────── + +def run_migration(dry_run: bool = True, verbose: bool = False): + """Main migration entry point.""" + logging.basicConfig(level=logging.DEBUG if verbose else logging.INFO) + logger.info(f"Starting folder→group migration (dry_run={dry_run})") + + from modules.connectors.connectorDbPostgre import getCachedConnector + + connector = getCachedConnector() + if not connector or not connector.connection: + logger.error("Could not obtain a DB connection. Aborting.") + return + + conn = connector.connection + cur = conn.cursor() + + # ── 1. Check that the source tables still exist ─────────────────────────── + cur.execute(""" + SELECT EXISTS ( + SELECT 1 FROM information_schema.tables + WHERE table_name = 'FileFolder' + ) + """) + folder_table_exists = cur.fetchone()[0] + + cur.execute(""" + SELECT EXISTS ( + SELECT 1 FROM information_schema.columns + WHERE table_name = 'FileItem' AND column_name = 'folderId' + ) + """) + folder_column_exists = cur.fetchone()[0] + + if not folder_table_exists and not folder_column_exists: + logger.info("FileFolder table and FileItem.folderId column not found — migration already applied or not needed.") + return + + if not folder_table_exists: + logger.warning("FileFolder table missing but FileItem.folderId column still present. Only file assignments will be migrated.") + if not folder_column_exists: + logger.warning("FileItem.folderId column missing but FileFolder table still present. Only group tree structure will be migrated.") + + # ── 2. 
Load all folders ─────────────────────────────────────────────────── + folders_by_user: dict = {} + if folder_table_exists: + cur.execute('SELECT "id", "name", "parentId", "sysCreatedBy", "mandateId" FROM "FileFolder"') + for row in cur.fetchall(): + fid, fname, parent_id, user_id, mandate_id = row + key = (str(user_id), str(mandate_id) if mandate_id else "") + folders_by_user.setdefault(key, []).append({ + "id": fid, "name": fname, "parentId": parent_id, + }) + logger.info(f"Loaded folders for {len(folders_by_user)} (user, mandate) combinations") + + # ── 3. Load file→folder assignments ────────────────────────────────────── + files_by_key: dict = {} + if folder_column_exists: + cur.execute( + 'SELECT "id", "folderId", "sysCreatedBy", "mandateId" FROM "FileItem" WHERE "folderId" IS NOT NULL AND "folderId" != \'\'' + ) + for row in cur.fetchall(): + file_id, folder_id, user_id, mandate_id = row + key = (str(user_id), str(mandate_id) if mandate_id else "") + files_by_key.setdefault(key, {}).setdefault(folder_id, []).append(file_id) + total_files = sum( + sum(len(v) for v in d.values()) for d in files_by_key.values() + ) + logger.info(f"Found {total_files} file→folder assignments across {len(files_by_key)} (user, mandate) combos") + + # ── 4. Combine and upsert groupings ────────────────────────────────────── + all_keys = set(folders_by_user.keys()) | set(files_by_key.keys()) + stats = {"groups_created": 0, "groupings_upserted": 0, "files_assigned": 0} + + for key in all_keys: + user_id, mandate_id = key + folders = folders_by_user.get(key, []) + files_by_folder = files_by_key.get(key, {}) + + # Build tree + roots = _build_tree(folders, None) + roots = _assign_files_to_nodes(roots, files_by_folder) + + # Handle files in unknown folders (folder no longer in tree) + known_folder_ids = {f["id"] for f in folders} + for folder_id, file_ids in files_by_folder.items(): + if folder_id not in known_folder_ids: + # Orphaned files: put them in an "Orphaned" group + roots.append({ + "id": str(uuid.uuid4()), + "name": f"Orphaned (folder {folder_id[:8]}…)", + "itemIds": file_ids, + "subGroups": [], + "meta": {"migratedFromFolderId": folder_id, "orphaned": True}, + }) + + if not roots: + continue + + n_items = _count_items(roots) + stats["groups_created"] += len(roots) + stats["files_assigned"] += n_items + + context_key = "files/list" + if verbose: + logger.debug(f" user={user_id} mandate={mandate_id}: {len(roots)} root groups, {n_items} files") + + if not dry_run: + # Check for existing grouping + cur.execute( + 'SELECT "id", "rootGroups" FROM "TableGrouping" WHERE "userId" = %s AND "contextKey" = %s', + (user_id, context_key), + ) + existing_row = cur.fetchone() + + if existing_row: + existing_id, existing_raw = existing_row + existing_roots = json.loads(existing_raw) if isinstance(existing_raw, str) else (existing_raw or []) + # Merge: append migrated groups (avoid duplicates by migratedFromFolderId) + existing_meta_ids = { + (n.get("meta") or {}).get("migratedFromFolderId") + for n in existing_roots + if (n.get("meta") or {}).get("migratedFromFolderId") + } + new_roots = existing_roots + [ + r for r in roots + if (r.get("meta") or {}).get("migratedFromFolderId") not in existing_meta_ids + ] + cur.execute( + 'UPDATE "TableGrouping" SET "rootGroups" = %s, "updatedAt" = %s WHERE "id" = %s', + (json.dumps(new_roots), _now_ts(), existing_id), + ) + else: + new_id = str(uuid.uuid4()) + cur.execute( + 'INSERT INTO "TableGrouping" ("id", "userId", "contextKey", "rootGroups", "updatedAt") VALUES (%s, %s, %s, 
%s, %s)', + (new_id, user_id, context_key, json.dumps(roots), _now_ts()), + ) + stats["groupings_upserted"] += 1 + + # ── 5. Summary ──────────────────────────────────────────────────────────── + if not dry_run: + conn.commit() + logger.info("Migration committed.") + else: + logger.info("DRY RUN — no changes written.") + + logger.info( + f"Summary: groupings_upserted={stats['groupings_upserted']}, " + f"groups_created={stats['groups_created']}, " + f"files_assigned={stats['files_assigned']}" + ) + logger.info( + "Next steps (run after verifying data):\n" + " 1. Run Alembic migration to DROP COLUMN FileItem.folderId\n" + " 2. Run Alembic migration to DROP TABLE FileFolder" + ) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Migrate FileFolder tree to table_groupings") + parser.add_argument("--dry-run", action="store_true", default=True, help="Preview only, no DB writes (default)") + parser.add_argument("--execute", action="store_true", help="Actually write to DB (disables dry-run)") + parser.add_argument("--verbose", action="store_true", help="Show per-user details") + args = parser.parse_args() + dry_run = not args.execute + run_migration(dry_run=dry_run, verbose=args.verbose) diff --git a/modules/routes/routeDataConnections.py b/modules/routes/routeDataConnections.py index 8e7a730d..124d2fb4 100644 --- a/modules/routes/routeDataConnections.py +++ b/modules/routes/routeDataConnections.py @@ -152,10 +152,28 @@ async def get_connections( - GET /api/connections/?mode=filterValues&column=status - GET /api/connections/?mode=ids """ - from modules.routes.routeHelpers import handleFilterValuesInMemory, handleIdsInMemory, enrichRowsWithFkLabels + from modules.routes.routeHelpers import ( + handleFilterValuesInMemory, handleIdsInMemory, enrichRowsWithFkLabels, + handleGroupingInRequest, applyGroupScopeFilter, + ) + + CONTEXT_KEY = "connections" + + # Parse pagination params early — needed for grouping in all modes + paginationParams = None + if pagination: + try: + paginationDict = json.loads(pagination) + if paginationDict: + paginationDict = normalize_pagination_dict(paginationDict) + paginationParams = PaginationParams(**paginationDict) + except (json.JSONDecodeError, ValueError) as e: + raise HTTPException(status_code=400, detail=f"Invalid pagination parameter: {str(e)}") + + interface = getInterface(currentUser) + groupCtx = handleGroupingInRequest(paginationParams, interface, CONTEXT_KEY) def _buildEnhancedItems(): - interface = getInterface(currentUser) connections = interface.getUserConnections(currentUser.id) items = [] for connection in connections: @@ -182,6 +200,7 @@ async def get_connections( try: items = _buildEnhancedItems() enrichRowsWithFkLabels(items, UserConnection) + items = applyGroupScopeFilter(items, groupCtx.itemIds) return handleFilterValuesInMemory(items, column, pagination) except Exception as e: logger.error(f"Error getting filter values for connections: {str(e)}") @@ -189,63 +208,40 @@ async def get_connections( if mode == "ids": try: - return handleIdsInMemory(_buildEnhancedItems(), pagination) + items = applyGroupScopeFilter(_buildEnhancedItems(), groupCtx.itemIds) + return handleIdsInMemory(items, pagination) except Exception as e: logger.error(f"Error getting IDs for connections: {str(e)}") raise HTTPException(status_code=500, detail=str(e)) try: - interface = getInterface(currentUser) - # NOTE: Cannot use db.getRecordsetPaginated() here because each connection # is enriched with computed tokenStatus/tokenExpiresAt (requires per-row 
DB lookup). # Token refresh also may trigger re-fetch. Connections per user are typically < 10, # so in-memory pagination is acceptable. - - # Parse pagination parameter - paginationParams = None - if pagination: - try: - paginationDict = json.loads(pagination) - if paginationDict: - # Normalize pagination dict (handles top-level "search" field) - paginationDict = normalize_pagination_dict(paginationDict) - paginationParams = PaginationParams(**paginationDict) - except (json.JSONDecodeError, ValueError) as e: - raise HTTPException( - status_code=400, - detail=f"Invalid pagination parameter: {str(e)}" - ) - + # SECURITY FIX: All users (including admins) can only see their own connections - # This prevents admin from seeing other users' connections and causing confusion connections = interface.getUserConnections(currentUser.id) - + # Perform silent token refresh for expired OAuth connections try: refresh_result = await token_refresh_service.refresh_expired_tokens(currentUser.id) if refresh_result.get("refreshed", 0) > 0: logger.info(f"Silently refreshed {refresh_result['refreshed']} tokens for user {currentUser.id}") - # Re-fetch connections to get updated token status connections = interface.getUserConnections(currentUser.id) except Exception as e: logger.warning(f"Silent token refresh failed for user {currentUser.id}: {str(e)}") - # Continue with original connections even if refresh fails - - # Enhance each connection with token status information and convert to dict + enhanced_connections_dict = [] for connection in connections: - # Get token status for this connection tokenStatus, tokenExpiresAt = getTokenStatusForConnection(interface, connection.id) - - # Convert to dict for filtering/sorting connection_dict = { "id": connection.id, "userId": connection.userId, "authority": connection.authority.value if hasattr(connection.authority, 'value') else str(connection.authority), "externalId": connection.externalId, "externalUsername": connection.externalUsername or "", - "externalEmail": connection.externalEmail, # Keep None instead of converting to empty string + "externalEmail": connection.externalEmail, "status": connection.status.value if hasattr(connection.status, 'value') else str(connection.status), "connectedAt": connection.connectedAt, "lastChecked": connection.lastChecked, @@ -254,24 +250,26 @@ async def get_connections( "tokenExpiresAt": tokenExpiresAt } enhanced_connections_dict.append(connection_dict) - + enrichRowsWithFkLabels(enhanced_connections_dict, UserConnection) + enhanced_connections_dict = applyGroupScopeFilter(enhanced_connections_dict, groupCtx.itemIds) if paginationParams is None: return { "items": enhanced_connections_dict, "pagination": None, + "groupTree": groupCtx.groupTree, } - + # Apply filtering if provided if paginationParams.filters: component_interface = ComponentObjects() component_interface.setUserContext(currentUser) enhanced_connections_dict = component_interface._applyFilters( - enhanced_connections_dict, + enhanced_connections_dict, paginationParams.filters ) - + # Apply sorting if provided if paginationParams.sort: component_interface = ComponentObjects() @@ -280,14 +278,14 @@ async def get_connections( enhanced_connections_dict, paginationParams.sort ) - + totalItems = len(enhanced_connections_dict) totalPages = math.ceil(totalItems / paginationParams.pageSize) if totalItems > 0 else 0 - + startIdx = (paginationParams.page - 1) * paginationParams.pageSize endIdx = startIdx + paginationParams.pageSize paged_connections = 
enhanced_connections_dict[startIdx:endIdx] - + return { "items": paged_connections, "pagination": PaginationMetadata( @@ -298,6 +296,7 @@ async def get_connections( sort=paginationParams.sort, filters=paginationParams.filters ).model_dump(), + "groupTree": groupCtx.groupTree, } except HTTPException: @@ -351,11 +350,18 @@ def create_connection( externalUsername="", # Will be set after OAuth status=ConnectionStatus.PENDING # Start with PENDING status ) - + + # Apply knowledge consent + preferences from request body before persisting + knowledge_enabled = connection_data.get("knowledgeIngestionEnabled") + if isinstance(knowledge_enabled, bool): + connection.knowledgeIngestionEnabled = knowledge_enabled + knowledge_prefs = connection_data.get("knowledgePreferences") + if isinstance(knowledge_prefs, dict): + connection.knowledgePreferences = knowledge_prefs + # Save connection record - models now handle timestamp serialization automatically interface.db.recordModify(UserConnection, connection.id, connection.model_dump()) - - + return connection except HTTPException: @@ -586,8 +592,25 @@ def disconnect_service( detail=routeApiMsg("Connection not found") ) - # Update connection status - connection.status = ConnectionStatus.INACTIVE + # Fire revoked event BEFORE DB status change so knowledge purge and + # status mutation form one logical step; subscribers see the + # connection as it was. INACTIVE does not exist on the enum — REVOKED + # is the correct terminal-but-retained state (deleted rows are + # handled in DELETE /{id}). + try: + from modules.shared.callbackRegistry import callbackRegistry + + callbackRegistry.trigger( + "connection.revoked", + connectionId=connectionId, + authority=str(getattr(connection.authority, "value", connection.authority) or ""), + userId=str(currentUser.id), + reason="disconnected", + ) + except Exception as _cbErr: + logger.warning("connection.revoked callback failed for %s: %s", connectionId, _cbErr) + + connection.status = ConnectionStatus.REVOKED connection.lastChecked = getUtcTimestamp() # Update connection record - models now handle timestamp serialization automatically @@ -636,6 +659,23 @@ def delete_connection( detail=routeApiMsg("Connection not found") ) + # Fire revoked event BEFORE the row disappears so consumers still + # have authority/connection context for observability; purge itself + # targets FileContentIndex rows by connectionId which are unaffected + # by the UserConnection delete. 
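Since the disconnect and delete paths now emit the same "connection.revoked" event and only the reason differs, a single subscriber can cover both. The sketch below is illustrative only: this patch shows callbackRegistry.trigger() being called, so the register() method and the purgeKnowledgeForConnection() helper are assumptions, not APIs confirmed by the diff.

import logging

from modules.shared.callbackRegistry import callbackRegistry

logger = logging.getLogger(__name__)


def purgeKnowledgeForConnection(connectionId: str) -> None:
    """Placeholder: delete FileContentIndex rows keyed by connectionId (the real purge lives in the knowledge service)."""
    ...


def _onConnectionRevoked(connectionId: str, authority: str, userId: str, reason: str, **_extra) -> None:
    # Keyword names mirror the trigger() calls in this route: reason is
    # "disconnected" when the status flips to REVOKED and "deleted" when the row is removed.
    logger.info("Purging knowledge for connection %s (%s, reason=%s)", connectionId, authority, reason)
    purgeKnowledgeForConnection(connectionId)


# Assumed registration call; only trigger() appears in this patch.
callbackRegistry.register("connection.revoked", _onConnectionRevoked)
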
+ try: + from modules.shared.callbackRegistry import callbackRegistry + + callbackRegistry.trigger( + "connection.revoked", + connectionId=connectionId, + authority=str(getattr(connection.authority, "value", connection.authority) or ""), + userId=str(currentUser.id), + reason="deleted", + ) + except Exception as _cbErr: + logger.warning("connection.revoked callback failed for %s: %s", connectionId, _cbErr) + # Remove the connection - only need connectionId since permissions are verified interface.removeUserConnection(connectionId) diff --git a/modules/routes/routeDataFiles.py b/modules/routes/routeDataFiles.py index 90431ba2..c20f3f3a 100644 --- a/modules/routes/routeDataFiles.py +++ b/modules/routes/routeDataFiles.py @@ -12,7 +12,6 @@ from modules.auth import limiter, getCurrentUser, getRequestContext, RequestCont # Import interfaces import modules.interfaces.interfaceDbManagement as interfaceDbManagement from modules.datamodels.datamodelFiles import FileItem, FilePreview -from modules.datamodels.datamodelFileFolder import FileFolder from modules.shared.attributeUtils import getModelAttributeDefinitions from modules.datamodels.datamodelUam import User from modules.datamodels.datamodelPagination import PaginationParams, PaginatedResponse, PaginationMetadata, normalize_pagination_dict @@ -77,7 +76,7 @@ async def _autoIndexFile(fileId: str, fileName: str, mimeType: str, user): """Background task: pre-scan + extraction + knowledge indexing. Step 1: Structure Pre-Scan (AI-free) -> FileContentIndex (persisted) Step 2: Content extraction via runExtraction -> ContentParts - Step 3: KnowledgeService.indexFile -> chunking + embedding -> Knowledge Store""" + Step 3: KnowledgeService.requestIngestion -> idempotent chunking + embedding -> Knowledge Store""" userId = user.id if hasattr(user, "id") else str(user) try: mgmtInterface = interfaceDbManagement.getInterface(user) @@ -122,9 +121,30 @@ async def _autoIndexFile(fileId: str, fileName: str, mimeType: str, user): f"{contentIndex.totalObjects} objects" ) - # Persist FileContentIndex immediately + # Persist FileContentIndex immediately. + # IMPORTANT: preserve `_ingestion` metadata and `status="indexed"` from any + # prior successful run — otherwise this upsert wipes the idempotency cache + # and requestIngestion cannot detect duplicates (AC4 breaks). from modules.interfaces.interfaceDbKnowledge import getInterface as getKnowledgeInterface knowledgeDb = getKnowledgeInterface() + try: + _existing = knowledgeDb.getFileContentIndex(fileId) + except Exception: + _existing = None + if _existing: + _existingStruct = ( + _existing.get("structure") if isinstance(_existing, dict) + else getattr(_existing, "structure", {}) + ) or {} + _existingStatus = ( + _existing.get("status") if isinstance(_existing, dict) + else getattr(_existing, "status", "") + ) or "" + if "_ingestion" in _existingStruct: + contentIndex.structure = dict(contentIndex.structure or {}) + contentIndex.structure["_ingestion"] = _existingStruct["_ingestion"] + if _existingStatus == "indexed": + contentIndex.status = "indexed" knowledgeDb.upsertFileContentIndex(contentIndex) # Step 2: Content extraction (AI-free, produces ContentParts) @@ -134,7 +154,10 @@ async def _autoIndexFile(fileId: str, fileName: str, mimeType: str, user): extractorRegistry = ExtractorRegistry() chunkerRegistry = ChunkerRegistry() - options = ExtractionOptions() + # mergeStrategy=None: keep per-page / per-section granularity for RAG ingestion. 
+ # The default MergeStrategy concatenates all text parts into a single blob, which + # collapses a 500-page PDF into one ContentChunk and destroys semantic retrieval. + options = ExtractionOptions(mergeStrategy=None) extracted = runExtraction( extractorRegistry, chunkerRegistry, @@ -181,15 +204,21 @@ async def _autoIndexFile(fileId: str, fileName: str, mimeType: str, user): ) knowledgeService = getService("knowledge", ctx) - await knowledgeService.indexFile( - fileId=fileId, - fileName=fileName, - mimeType=mimeType, - userId=userId, - featureInstanceId=str(feature_instance_id) if feature_instance_id else "", - mandateId=str(mandate_id) if mandate_id else "", - contentObjects=contentObjects, - structure=contentIndex.structure, + from modules.serviceCenter.services.serviceKnowledge.mainServiceKnowledge import IngestionJob + + await knowledgeService.requestIngestion( + IngestionJob( + sourceKind="file", + sourceId=fileId, + fileName=fileName, + mimeType=mimeType, + userId=userId, + featureInstanceId=str(feature_instance_id) if feature_instance_id else "", + mandateId=str(mandate_id) if mandate_id else "", + contentObjects=contentObjects, + structure=contentIndex.structure, + provenance={"lane": "upload", "route": "routeDataFiles._autoIndexFile"}, + ) ) # Re-acquire interface after await to avoid stale user context from the singleton @@ -249,7 +278,6 @@ def get_files( try: paginationDict = json.loads(pagination) if paginationDict: - # Normalize pagination dict (handles top-level "search" field) paginationDict = normalize_pagination_dict(paginationDict) paginationParams = PaginationParams(**paginationDict) except (json.JSONDecodeError, ValueError) as e: @@ -257,51 +285,43 @@ def get_files( status_code=400, detail=f"Invalid pagination parameter: {str(e)}" ) - + from modules.routes.routeHelpers import ( handleIdsMode, handleFilterValuesInMemory, + handleGroupingInRequest, applyGroupScopeFilter, ) + import modules.interfaces.interfaceDbApp as _appIface managementInterface = interfaceDbManagement.getInterface( currentUser, mandateId=str(context.mandateId) if context.mandateId else None, featureInstanceId=str(context.featureInstanceId) if context.featureInstanceId else None ) + appInterface = _appIface.getInterface(currentUser) + groupCtx = handleGroupingInRequest(paginationParams, appInterface, "files/list") + + def _filesToDicts(fileItems): + return [f.model_dump() if hasattr(f, "model_dump") else (dict(f) if not isinstance(f, dict) else f) for f in fileItems] if mode == "filterValues": if not column: raise HTTPException(status_code=400, detail="column parameter required for mode=filterValues") allFiles = managementInterface.getAllFiles() items = allFiles if isinstance(allFiles, list) else (allFiles.items if hasattr(allFiles, "items") else []) - itemDicts = [f.model_dump() if hasattr(f, "model_dump") else (dict(f) if not isinstance(f, dict) else f) for f in items] + itemDicts = _filesToDicts(items) enrichRowsWithFkLabels(itemDicts, FileItem) + itemDicts = applyGroupScopeFilter(itemDicts, groupCtx.itemIds) return handleFilterValuesInMemory(itemDicts, column, pagination) if mode == "ids": recordFilter = {"sysCreatedBy": managementInterface.userId} return handleIdsMode(managementInterface.db, FileItem, pagination, recordFilter) - recordFilter = None - if paginationParams and paginationParams.filters and "folderId" in paginationParams.filters: - fVal = paginationParams.filters.get("folderId") - # For a concrete folderId we use recordFilter (exact equality). 
- # For null / empty (= "root") we keep it in pagination.filters so the - # connector applies `IS NULL OR = ''` – files predating the folderId - # fix were stored with an empty string instead of NULL. - if fVal is None or (isinstance(fVal, str) and fVal.strip() == ""): - paginationParams.filters["folderId"] = None - else: - paginationParams.filters.pop("folderId") - recordFilter = {"folderId": fVal} - - result = managementInterface.getAllFiles(pagination=paginationParams, recordFilter=recordFilter) - - def _filesToDicts(items): - return [f.model_dump() if hasattr(f, "model_dump") else (dict(f) if not isinstance(f, dict) else f) for f in items] + result = managementInterface.getAllFiles(pagination=paginationParams) if paginationParams: - enriched = enrichRowsWithFkLabels(_filesToDicts(result.items), FileItem) + enriched = applyGroupScopeFilter(enrichRowsWithFkLabels(_filesToDicts(result.items), FileItem), groupCtx.itemIds) return { "items": enriched, "pagination": PaginationMetadata( @@ -312,11 +332,12 @@ def get_files( sort=paginationParams.sort, filters=paginationParams.filters ).model_dump(), + "groupTree": groupCtx.groupTree, } else: items = result if isinstance(result, list) else (result.items if hasattr(result, "items") else [result]) - enriched = enrichRowsWithFkLabels(_filesToDicts(items), FileItem) - return {"items": enriched, "pagination": None} + enriched = applyGroupScopeFilter(enrichRowsWithFkLabels(_filesToDicts(items), FileItem), groupCtx.itemIds) + return {"items": enriched, "pagination": None, "groupTree": groupCtx.groupTree} except HTTPException: raise except Exception as e: @@ -327,6 +348,36 @@ def get_files( ) +def _addFileToGroup(appInterface, fileId: str, groupId: str, contextKey: str = "files/list"): + """Add a file to a group in the persisted groupTree (upsert).""" + from modules.routes.routeHelpers import _collectItemIds + try: + existing = appInterface.getTableGrouping(contextKey) + if not existing: + return + nodes = [n.model_dump() if hasattr(n, 'model_dump') else n for n in existing.rootGroups] + def _add(nds): + for nd in nds: + nid = nd.get("id") if isinstance(nd, dict) else getattr(nd, "id", None) + if nid == groupId: + itemIds = list(nd.get("itemIds", []) if isinstance(nd, dict) else getattr(nd, "itemIds", [])) + if fileId not in itemIds: + itemIds.append(fileId) + if isinstance(nd, dict): + nd["itemIds"] = itemIds + else: + nd.itemIds = itemIds + return True + subs = nd.get("subGroups", []) if isinstance(nd, dict) else getattr(nd, "subGroups", []) + if _add(subs): + return True + return False + _add(nodes) + appInterface.upsertTableGrouping(contextKey, nodes) + except Exception as e: + logger.warning(f"_addFileToGroup failed: {e}") + + @router.post("/upload", status_code=status.HTTP_201_CREATED) @limiter.limit("10/minute") async def upload_file( @@ -334,7 +385,7 @@ async def upload_file( file: UploadFile = File(...), workflowId: Optional[str] = Form(None), featureInstanceId: Optional[str] = Form(None), - folderId: Optional[str] = Form(None), + groupId: Optional[str] = Form(None), currentUser: User = Depends(getCurrentUser), context: RequestContext = Depends(getRequestContext), ) -> JSONResponse: @@ -358,31 +409,22 @@ async def upload_file( status_code=status.HTTP_413_REQUEST_ENTITY_TOO_LARGE, detail=f"File too large. 
Maximum size: {interfaceDbManagement.APP_CONFIG.get('File_Management_MAX_UPLOAD_SIZE_MB')}MB" ) - - # Normalize folderId: empty string / "null" / "root" → None (root folder) - normalizedFolderId: Optional[str] = folderId - if isinstance(normalizedFolderId, str): - trimmed = normalizedFolderId.strip() - if not trimmed or trimmed.lower() in {"null", "none", "root"}: - normalizedFolderId = None - else: - normalizedFolderId = trimmed # Save file via LucyDOM interface in the database fileItem, duplicateType = managementInterface.saveUploadedFile( - fileContent, file.filename, folderId=normalizedFolderId + fileContent, file.filename ) if featureInstanceId and not fileItem.featureInstanceId: managementInterface.updateFile(fileItem.id, {"featureInstanceId": featureInstanceId}) fileItem.featureInstanceId = featureInstanceId - # For exact duplicates we keep the existing record, but move it into the - # target folder so the user actually sees their upload land where they expect. - if duplicateType == "exact_duplicate" and normalizedFolderId != getattr(fileItem, "folderId", None): - managementInterface.updateFile(fileItem.id, {"folderId": normalizedFolderId}) - fileItem.folderId = normalizedFolderId - + # Add to group if groupId was provided + if groupId: + import modules.interfaces.interfaceDbApp as _appIface + appInterface = _appIface.getInterface(currentUser) + _addFileToGroup(appInterface, fileItem.id, groupId) + # Determine response message based on duplicate type if duplicateType == "exact_duplicate": message = f"File '{file.filename}' already exists with identical content. Reusing existing file." @@ -447,347 +489,6 @@ async def upload_file( detail=f"Error during file upload: {str(e)}" ) -# ── Folder endpoints (MUST be before /{fileId} catch-all) ───────────────────── - -@router.get("/folders", response_model=List[Dict[str, Any]]) -@limiter.limit("30/minute") -def list_folders( - request: Request, - parentId: Optional[str] = Query(None, description="Parent folder ID (omit for all folders)"), - currentUser: User = Depends(getCurrentUser), - context: RequestContext = Depends(getRequestContext) -) -> List[Dict[str, Any]]: - """List folders for the current user.""" - try: - mgmt = interfaceDbManagement.getInterface( - currentUser, - mandateId=str(context.mandateId) if context.mandateId else None, - featureInstanceId=str(context.featureInstanceId) if context.featureInstanceId else None, - ) - if parentId is not None: - return mgmt.listFolders(parentId=parentId) - return mgmt.listFolders() - except Exception as e: - logger.error(f"Error listing folders: {e}") - raise HTTPException(status_code=500, detail=str(e)) - - -@router.post("/folders", status_code=status.HTTP_201_CREATED) -@limiter.limit("10/minute") -def create_folder( - request: Request, - body: Dict[str, Any] = Body(...), - currentUser: User = Depends(getCurrentUser), - context: RequestContext = Depends(getRequestContext) -) -> Dict[str, Any]: - """Create a new folder.""" - name = body.get("name", "") - parentId = body.get("parentId") - if not name: - raise HTTPException(status_code=400, detail=routeApiMsg("name is required")) - try: - mgmt = interfaceDbManagement.getInterface( - currentUser, - mandateId=str(context.mandateId) if context.mandateId else None, - featureInstanceId=str(context.featureInstanceId) if context.featureInstanceId else None, - ) - return mgmt.createFolder(name=name, parentId=parentId) - except ValueError as e: - raise HTTPException(status_code=400, detail=str(e)) - except Exception as e: - logger.error(f"Error creating 
folder: {e}") - raise HTTPException(status_code=500, detail=str(e)) - - -@router.put("/folders/{folderId}") -@limiter.limit("10/minute") -def rename_folder( - request: Request, - folderId: str = Path(...), - body: Dict[str, Any] = Body(...), - currentUser: User = Depends(getCurrentUser), - context: RequestContext = Depends(getRequestContext) -) -> Dict[str, Any]: - """Rename a folder.""" - newName = body.get("name", "") - if not newName: - raise HTTPException(status_code=400, detail=routeApiMsg("name is required")) - try: - mgmt = interfaceDbManagement.getInterface( - currentUser, - mandateId=str(context.mandateId) if context.mandateId else None, - featureInstanceId=str(context.featureInstanceId) if context.featureInstanceId else None, - ) - mgmt.renameFolder(folderId, newName) - return {"success": True, "folderId": folderId, "name": newName} - except ValueError as e: - raise HTTPException(status_code=400, detail=str(e)) - except Exception as e: - logger.error(f"Error renaming folder: {e}") - raise HTTPException(status_code=500, detail=str(e)) - - -@router.delete("/folders/{folderId}") -@limiter.limit("10/minute") -def delete_folder( - request: Request, - folderId: str = Path(...), - recursive: bool = Query(False, description="Delete folder contents recursively"), - currentUser: User = Depends(getCurrentUser), - context: RequestContext = Depends(getRequestContext) -) -> Dict[str, Any]: - """Delete a folder. Use recursive=true to delete non-empty folders.""" - try: - mgmt = interfaceDbManagement.getInterface( - currentUser, - mandateId=str(context.mandateId) if context.mandateId else None, - featureInstanceId=str(context.featureInstanceId) if context.featureInstanceId else None, - ) - return mgmt.deleteFolder(folderId, recursive=recursive) - except ValueError as e: - raise HTTPException(status_code=400, detail=str(e)) - except Exception as e: - logger.error(f"Error deleting folder: {e}") - raise HTTPException(status_code=500, detail=str(e)) - - -@router.post("/folders/{folderId}/move") -@limiter.limit("10/minute") -def move_folder( - request: Request, - folderId: str = Path(...), - body: Dict[str, Any] = Body(...), - currentUser: User = Depends(getCurrentUser), - context: RequestContext = Depends(getRequestContext) -) -> Dict[str, Any]: - """Move a folder to a new parent.""" - targetParentId = body.get("targetParentId") - try: - mgmt = interfaceDbManagement.getInterface( - currentUser, - mandateId=str(context.mandateId) if context.mandateId else None, - featureInstanceId=str(context.featureInstanceId) if context.featureInstanceId else None, - ) - mgmt.moveFolder(folderId, targetParentId) - return {"success": True, "folderId": folderId, "parentId": targetParentId} - except ValueError as e: - raise HTTPException(status_code=400, detail=str(e)) - except Exception as e: - logger.error(f"Error moving folder: {e}") - raise HTTPException(status_code=500, detail=str(e)) - - -@router.patch("/folders/{folderId}/scope") -@limiter.limit("10/minute") -def _updateFolderScope( - request: Request, - folderId: str = Path(..., description="ID of the folder"), - scope: str = Body(..., embed=True), - context: RequestContext = Depends(getRequestContext), -) -> Dict[str, Any]: - """Update the scope of a folder. Propagates to all files inside (recursively). Global scope requires sysAdmin.""" - validScopes = {"personal", "featureInstance", "mandate", "global"} - if scope not in validScopes: - raise HTTPException(status_code=400, detail=f"Invalid scope: {scope}. 
Must be one of {validScopes}") - if scope == "global" and not context.isSysAdmin: - raise HTTPException(status_code=403, detail=routeApiMsg("Only sysadmins can set global scope")) - try: - mgmt = interfaceDbManagement.getInterface( - context.user, - mandateId=str(context.mandateId) if context.mandateId else None, - featureInstanceId=str(context.featureInstanceId) if context.featureInstanceId else None, - ) - folder = mgmt.getFolder(folderId) - if not folder: - raise HTTPException(status_code=404, detail=routeApiMsg("Folder not found")) - mgmt.updateFolder(folderId, {"scope": scope}) - fileIds = _collectFolderFileIds(mgmt, folderId) - for fid in fileIds: - try: - mgmt.updateFile(fid, {"scope": scope}) - except Exception as e: - logger.error("Folder scope propagation: failed to update file %s: %s", fid, e) - logger.info("Updated scope=%s for folder %s: %d files affected", scope, folderId, len(fileIds)) - return {"folderId": folderId, "scope": scope, "filesUpdated": len(fileIds)} - except HTTPException: - raise - except Exception as e: - logger.error(f"Error updating folder scope: {e}") - raise HTTPException(status_code=500, detail=str(e)) - - -@router.patch("/folders/{folderId}/neutralize") -@limiter.limit("10/minute") -def updateFolderNeutralize( - request: Request, - background_tasks: BackgroundTasks, - folderId: str = Path(..., description="ID of the folder"), - neutralize: bool = Body(..., embed=True), - context: RequestContext = Depends(getRequestContext), -) -> Dict[str, Any]: - """Toggle neutralization on a folder. Propagates to all files inside (recursively). - - When turning ON: all files in the folder get ``neutralize=True``, their - knowledge indexes are purged synchronously, and background re-indexing - is triggered. - When turning OFF: files revert to ``neutralize=False`` unless they were - individually marked (not implemented yet -- all are reverted). 
- """ - try: - mgmt = interfaceDbManagement.getInterface( - context.user, - mandateId=str(context.mandateId) if context.mandateId else None, - featureInstanceId=str(context.featureInstanceId) if context.featureInstanceId else None, - ) - - folder = mgmt.getFolder(folderId) - if not folder: - raise HTTPException(status_code=404, detail=routeApiMsg("Folder not found")) - - mgmt.updateFolder(folderId, {"neutralize": neutralize}) - - fileIds = _collectFolderFileIds(mgmt, folderId) - logger.info("Folder neutralize toggle %s for folder %s: %d files affected", neutralize, folderId, len(fileIds)) - - from modules.interfaces.interfaceDbKnowledge import getInterface as getKnowledgeInterface - knowledgeDb = getKnowledgeInterface() - - for fid in fileIds: - try: - mgmt.updateFile(fid, {"neutralize": neutralize}) - if neutralize: - try: - knowledgeDb.deleteFileContentIndex(fid) - except Exception as e: - logger.warning("Folder neutralize: failed to purge index for file %s: %s", fid, e) - else: - try: - from modules.datamodels.datamodelKnowledge import FileContentIndex - indices = knowledgeDb.db.getRecordset(FileContentIndex, recordFilter={"id": fid}) - for idx in indices: - idxId = idx.get("id") if isinstance(idx, dict) else getattr(idx, "id", None) - if idxId: - knowledgeDb.db.recordModify(FileContentIndex, idxId, { - "neutralizationStatus": "original", - "isNeutralized": False, - }) - except Exception as e: - logger.warning("Folder neutralize OFF: metadata update failed for %s: %s", fid, e) - except Exception as e: - logger.error("Folder neutralize: failed to update file %s: %s", fid, e) - - for fid in fileIds: - fileMeta = mgmt.getFile(fid) - if fileMeta: - fn = fileMeta.fileName if hasattr(fileMeta, "fileName") else fileMeta.get("fileName", "") - mt = fileMeta.mimeType if hasattr(fileMeta, "mimeType") else fileMeta.get("mimeType", "") - - async def _reindex(fileId=fid, fileName=fn, mimeType=mt): - try: - await _autoIndexFile(fileId=fileId, fileName=fileName, mimeType=mimeType, user=context.user) - except Exception as ex: - logger.error("Folder neutralize re-index failed for %s: %s", fileId, ex) - - background_tasks.add_task(_reindex) - - return {"folderId": folderId, "neutralize": neutralize, "filesUpdated": len(fileIds)} - except HTTPException: - raise - except Exception as e: - logger.error(f"Error updating folder neutralize flag: {e}") - raise HTTPException(status_code=500, detail=str(e)) - - -def _collectFolderFileIds(mgmt, folderId: str) -> List[str]: - """Recursively collect all file IDs in a folder and its sub-folders.""" - fileIds = [] - try: - files = mgmt.listFiles(folderId=folderId) - if isinstance(files, dict): - files = files.get("files", []) - for f in (files or []): - fid = f.get("id") if isinstance(f, dict) else getattr(f, "id", None) - if fid: - fileIds.append(fid) - except Exception as e: - logger.warning("_collectFolderFileIds: listFiles failed for folder %s: %s", folderId, e) - - try: - subFolders = mgmt.listFolders(parentId=folderId) - for sf in (subFolders or []): - sfId = sf.get("id") if isinstance(sf, dict) else getattr(sf, "id", None) - if sfId: - fileIds.extend(_collectFolderFileIds(mgmt, sfId)) - except Exception as e: - logger.warning("_collectFolderFileIds: listFolders failed for folder %s: %s", folderId, e) - - return fileIds - - -@router.get("/folders/{folderId}/download") -@limiter.limit("10/minute") -def download_folder( - request: Request, - folderId: str = Path(..., description="ID of the folder to download as ZIP"), - currentUser: User = Depends(getCurrentUser), 
- context: RequestContext = Depends(getRequestContext) -) -> Response: - """Download a folder (including subfolders) as a ZIP archive.""" - import io - import zipfile - import urllib.parse - - try: - mgmt = interfaceDbManagement.getInterface( - currentUser, - mandateId=str(context.mandateId) if context.mandateId else None, - featureInstanceId=str(context.featureInstanceId) if context.featureInstanceId else None, - ) - - folder = mgmt.getFolder(folderId) - if not folder: - raise HTTPException(status_code=404, detail=f"Folder {folderId} not found") - - folderName = folder.get("name", "download") - - def _collectFiles(parentId: str, pathPrefix: str): - """Recursively collect (zipPath, fileId) tuples.""" - entries = [] - for f in mgmt._getFilesByCurrentUser(recordFilter={"folderId": parentId}): - fname = f.get("fileName") or f.get("name") or f.get("id", "file") - entries.append((f"{pathPrefix}{fname}", f["id"])) - for sub in mgmt.listFolders(parentId=parentId): - subName = sub.get("name", sub["id"]) - entries.extend(_collectFiles(sub["id"], f"{pathPrefix}{subName}/")) - return entries - - fileEntries = _collectFiles(folderId, "") - if not fileEntries: - raise HTTPException(status_code=404, detail=routeApiMsg("Folder is empty")) - - buf = io.BytesIO() - with zipfile.ZipFile(buf, "w", zipfile.ZIP_DEFLATED) as zf: - for zipPath, fileId in fileEntries: - data = mgmt.getFileData(fileId) - if data: - zf.writestr(zipPath, data) - - buf.seek(0) - zipBytes = buf.getvalue() - encodedName = urllib.parse.quote(f"{folderName}.zip") - - return Response( - content=zipBytes, - media_type="application/zip", - headers={ - "Content-Disposition": f"attachment; filename*=UTF-8''{encodedName}" - } - ) - except HTTPException: - raise - except Exception as e: - logger.error(f"Error downloading folder as ZIP: {e}") - raise HTTPException(status_code=500, detail=f"Error downloading folder: {str(e)}") @router.post("/batch-delete") @@ -798,13 +499,11 @@ def batch_delete_items( currentUser: User = Depends(getCurrentUser), context: RequestContext = Depends(getRequestContext) ) -> Dict[str, Any]: - """Batch delete files/folders with a single SQL-backed operation per type.""" + """Batch delete files.""" fileIds = body.get("fileIds") or [] - folderIds = body.get("folderIds") or [] - recursiveFolders = bool(body.get("recursiveFolders", True)) - if not isinstance(fileIds, list) or not isinstance(folderIds, list): - raise HTTPException(status_code=400, detail=routeApiMsg("fileIds and folderIds must be arrays")) + if not isinstance(fileIds, list): + raise HTTPException(status_code=400, detail=routeApiMsg("fileIds must be an array")) try: mgmt = interfaceDbManagement.getInterface( @@ -813,17 +512,12 @@ def batch_delete_items( featureInstanceId=str(context.featureInstanceId) if context.featureInstanceId else None, ) - result = {"deletedFiles": 0, "deletedFolders": 0} + result = {"deletedFiles": 0} if fileIds: fileResult = mgmt.deleteFilesBatch(fileIds) result["deletedFiles"] += fileResult.get("deletedFiles", 0) - if folderIds: - folderResult = mgmt.deleteFoldersBatch(folderIds, recursive=recursiveFolders) - result["deletedFiles"] += folderResult.get("deletedFiles", 0) - result["deletedFolders"] += folderResult.get("deletedFolders", 0) - return result except ValueError as e: raise HTTPException(status_code=400, detail=str(e)) @@ -832,45 +526,189 @@ def batch_delete_items( raise HTTPException(status_code=500, detail=str(e)) -@router.post("/batch-move") -@limiter.limit("10/minute") -def batch_move_items( - request: Request, - body: 
Dict[str, Any] = Body(...), - currentUser: User = Depends(getCurrentUser), - context: RequestContext = Depends(getRequestContext) -) -> Dict[str, Any]: - """Batch move files/folders with a single SQL-backed operation per type.""" - fileIds = body.get("fileIds") or [] - folderIds = body.get("folderIds") or [] - targetFolderId = body.get("targetFolderId") - targetParentId = body.get("targetParentId") - - if not isinstance(fileIds, list) or not isinstance(folderIds, list): - raise HTTPException(status_code=400, detail=routeApiMsg("fileIds and folderIds must be arrays")) +# ── Group bulk endpoints ────────────────────────────────────────────────────── +def _get_group_item_ids(contextKey: str, groupId: str, appInterface) -> set: + """Collect all file IDs in a group and its sub-groups from the stored groupTree.""" + from modules.routes.routeHelpers import _collectItemIds try: - mgmt = interfaceDbManagement.getInterface( + existing = appInterface.getTableGrouping(contextKey) + if not existing: + return set() + nodes = [n.model_dump() if hasattr(n, 'model_dump') else n for n in existing.rootGroups] + result = _collectItemIds(nodes, groupId) + return result or set() + except Exception as e: + logger.error(f"_get_group_item_ids failed for groupId={groupId}: {e}") + return set() + + +@router.patch("/groups/{groupId}/scope") +@limiter.limit("60/minute") +def patch_group_scope( + request: Request, + groupId: str = Path(..., description="Group ID"), + body: dict = Body(...), + currentUser: User = Depends(getCurrentUser), + context: RequestContext = Depends(getRequestContext), +): + """Set scope for all files in a group (recursive).""" + scope = body.get("scope") + if not scope: + raise HTTPException(status_code=400, detail="scope is required") + try: + import modules.interfaces.interfaceDbApp as _appIface + managementInterface = interfaceDbManagement.getInterface( currentUser, mandateId=str(context.mandateId) if context.mandateId else None, featureInstanceId=str(context.featureInstanceId) if context.featureInstanceId else None, ) - - result = {"movedFiles": 0, "movedFolders": 0} - - if fileIds: - fileResult = mgmt.moveFilesBatch(fileIds, targetFolderId=targetFolderId) - result["movedFiles"] += fileResult.get("movedFiles", 0) - - if folderIds: - folderResult = mgmt.moveFoldersBatch(folderIds, targetParentId=targetParentId) - result["movedFolders"] += folderResult.get("movedFolders", 0) - - return result - except ValueError as e: - raise HTTPException(status_code=400, detail=str(e)) + appInterface = _appIface.getInterface(currentUser) + fileIds = _get_group_item_ids("files/list", groupId, appInterface) + updated = 0 + for fid in fileIds: + try: + managementInterface.updateFile(fid, {"scope": scope}) + updated += 1 + except Exception as e: + logger.error(f"patch_group_scope: failed to update file {fid}: {e}") + return {"groupId": groupId, "scope": scope, "filesUpdated": updated} + except HTTPException: + raise except Exception as e: - logger.error(f"Error in batch move: {e}") + logger.error(f"patch_group_scope error: {e}") + raise HTTPException(status_code=500, detail=str(e)) + + +@router.patch("/groups/{groupId}/neutralize") +@limiter.limit("60/minute") +def patch_group_neutralize( + request: Request, + groupId: str = Path(..., description="Group ID"), + body: dict = Body(...), + currentUser: User = Depends(getCurrentUser), + context: RequestContext = Depends(getRequestContext), +): + """Toggle neutralize for all files in a group (recursive, incl. 
knowledge purge/reindex).""" + neutralize = body.get("neutralize") + if neutralize is None: + raise HTTPException(status_code=400, detail="neutralize is required") + try: + import modules.interfaces.interfaceDbApp as _appIface + managementInterface = interfaceDbManagement.getInterface( + currentUser, + mandateId=str(context.mandateId) if context.mandateId else None, + featureInstanceId=str(context.featureInstanceId) if context.featureInstanceId else None, + ) + appInterface = _appIface.getInterface(currentUser) + fileIds = _get_group_item_ids("files/list", groupId, appInterface) + updated = 0 + for fid in fileIds: + try: + managementInterface.updateFile(fid, {"neutralize": neutralize}) + if not neutralize: + try: + from modules.interfaces import interfaceDbKnowledge + kIface = interfaceDbKnowledge.getInterface(currentUser) + kIface.purgeFileKnowledge(fid) + except Exception as ke: + logger.warning(f"patch_group_neutralize: knowledge purge failed for {fid}: {ke}") + updated += 1 + except Exception as e: + logger.error(f"patch_group_neutralize: failed for file {fid}: {e}") + return {"groupId": groupId, "neutralize": neutralize, "filesUpdated": updated} + except HTTPException: + raise + except Exception as e: + logger.error(f"patch_group_neutralize error: {e}") + raise HTTPException(status_code=500, detail=str(e)) + + +@router.get("/groups/{groupId}/download") +@limiter.limit("20/minute") +async def download_group_zip( + request: Request, + groupId: str = Path(..., description="Group ID"), + currentUser: User = Depends(getCurrentUser), + context: RequestContext = Depends(getRequestContext), +): + """Download all files in a group as a ZIP archive.""" + import io, zipfile + try: + import modules.interfaces.interfaceDbApp as _appIface + managementInterface = interfaceDbManagement.getInterface( + currentUser, + mandateId=str(context.mandateId) if context.mandateId else None, + featureInstanceId=str(context.featureInstanceId) if context.featureInstanceId else None, + ) + appInterface = _appIface.getInterface(currentUser) + fileIds = _get_group_item_ids("files/list", groupId, appInterface) + if not fileIds: + raise HTTPException(status_code=404, detail="Group not found or empty") + buf = io.BytesIO() + with zipfile.ZipFile(buf, "w", zipfile.ZIP_DEFLATED) as zf: + for fid in fileIds: + try: + fileMeta = managementInterface.getFile(fid) + fileData = managementInterface.getFileData(fid) + if fileMeta and fileData: + name = (fileMeta.get("fileName") if isinstance(fileMeta, dict) else getattr(fileMeta, "fileName", fid)) or fid + zf.writestr(name, fileData) + except Exception as fe: + logger.warning(f"download_group_zip: skipping file {fid}: {fe}") + buf.seek(0) + from fastapi.responses import StreamingResponse + return StreamingResponse( + buf, + media_type="application/zip", + headers={"Content-Disposition": f'attachment; filename="group-{groupId}.zip"'}, + ) + except HTTPException: + raise + except Exception as e: + logger.error(f"download_group_zip error: {e}") + raise HTTPException(status_code=500, detail=str(e)) + + +@router.delete("/groups/{groupId}") +@limiter.limit("30/minute") +def delete_group( + request: Request, + groupId: str = Path(..., description="Group ID"), + deleteItems: bool = Query(False, description="If true, also delete all files in the group"), + currentUser: User = Depends(getCurrentUser), + context: RequestContext = Depends(getRequestContext), +): + """Remove a group from the groupTree. 
Optionally delete all its files.""" + try: + import modules.interfaces.interfaceDbApp as _appIface + appInterface = _appIface.getInterface(currentUser) + fileIds = _get_group_item_ids("files/list", groupId, appInterface) + # Remove group from tree + existing = appInterface.getTableGrouping("files/list") + if existing: + from modules.routes.routeHelpers import _removeGroupFromTree + newRoots = _removeGroupFromTree([n.model_dump() if hasattr(n, 'model_dump') else n for n in existing.rootGroups], groupId) + appInterface.upsertTableGrouping("files/list", newRoots) + # Optionally delete files + deletedFiles = 0 + if deleteItems: + managementInterface = interfaceDbManagement.getInterface( + currentUser, + mandateId=str(context.mandateId) if context.mandateId else None, + featureInstanceId=str(context.featureInstanceId) if context.featureInstanceId else None, + ) + for fid in fileIds: + try: + managementInterface.deleteFile(fid) + deletedFiles += 1 + except Exception as e: + logger.error(f"delete_group: failed to delete file {fid}: {e}") + return {"groupId": groupId, "deletedFiles": deletedFiles} + except HTTPException: + raise + except Exception as e: + logger.error(f"delete_group error: {e}") raise HTTPException(status_code=500, detail=str(e)) @@ -1071,7 +909,7 @@ def update_file( ) -> FileItem: """Update file info""" try: - _EDITABLE_FIELDS = {"fileName", "scope", "tags", "description", "folderId", "neutralize"} + _EDITABLE_FIELDS = {"fileName", "scope", "tags", "description", "neutralize"} safeData = {k: v for k, v in file_info.items() if k in _EDITABLE_FIELDS} if not safeData: raise HTTPException(status_code=400, detail=routeApiMsg("No editable fields provided")) @@ -1226,37 +1064,3 @@ def preview_file( ) -@router.post("/{fileId}/move") -@limiter.limit("10/minute") -def move_file( - request: Request, - fileId: str = Path(...), - body: Dict[str, Any] = Body(...), - currentUser: User = Depends(getCurrentUser), - context: RequestContext = Depends(getRequestContext) -) -> Dict[str, Any]: - """Move a file to a different folder.""" - targetFolderId = body.get("targetFolderId") - try: - mgmt = interfaceDbManagement.getInterface( - currentUser, - mandateId=str(context.mandateId) if context.mandateId else None, - featureInstanceId=str(context.featureInstanceId) if context.featureInstanceId else None, - ) - mgmt.updateFile(fileId, {"folderId": targetFolderId}) - - if targetFolderId: - try: - targetFolder = mgmt.getFolder(targetFolderId) - folderNeut = (targetFolder.get("neutralize") if isinstance(targetFolder, dict) - else getattr(targetFolder, "neutralize", False)) if targetFolder else False - if folderNeut: - mgmt.updateFile(fileId, {"neutralize": True}) - logger.info("File %s moved to neutralized folder %s — inherited neutralize=True", fileId, targetFolderId) - except Exception as e: - logger.warning("File move: folder neutralize inheritance check failed for %s: %s", fileId, e) - - return {"success": True, "fileId": fileId, "folderId": targetFolderId} - except Exception as e: - logger.error(f"Error moving file: {e}") - raise HTTPException(status_code=500, detail=str(e)) diff --git a/modules/routes/routeDataMandates.py b/modules/routes/routeDataMandates.py index ef058ed9..47eaee02 100644 --- a/modules/routes/routeDataMandates.py +++ b/modules/routes/routeDataMandates.py @@ -112,8 +112,8 @@ def get_mandates( status_code=status.HTTP_403_FORBIDDEN, detail=routeApiMsg("Admin role required") ) - - # Parse pagination parameter + + # Parse pagination parameter early — needed for grouping in all modes 
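Every list route touched by this patch (connections, files, prompts, users, mandates) now follows the same three-step grouping flow: parse pagination, resolve the group context, scope the items. Condensed into one helper it looks like the sketch below; the function and field names come from the routeHelpers additions later in this diff, while the wrapper itself and the sample tree are illustrative only — the real routes additionally enrich FK labels and paginate.

from typing import Any, Dict, List


def listWithGrouping(items: List[Dict[str, Any]], paginationParams, interface, contextKey: str) -> Dict[str, Any]:
    """Condensed sketch of the grouping flow each list endpoint now runs."""
    from modules.routes.routeHelpers import handleGroupingInRequest, applyGroupScopeFilter

    # Persists pagination.saveGroupTree if present and resolves pagination.groupId
    # into a set of item IDs (the group plus all of its sub-groups).
    groupCtx = handleGroupingInRequest(paginationParams, interface, contextKey)
    # No-op when itemIds is None, i.e. when no groupId was sent.
    scoped = applyGroupScopeFilter(items, groupCtx.itemIds)
    return {"items": scoped, "pagination": None, "groupTree": groupCtx.groupTree}


# Shape of a stored group tree (same node layout the migration script writes):
sampleTree = [{
    "id": "g1", "name": "Clients", "itemIds": ["f1"],
    "subGroups": [{"id": "g2", "name": "Acme", "itemIds": ["f2"], "subGroups": []}],
}]
# With pagination.groupId == "g1", handleGroupingInRequest resolves itemIds to
# {"f1", "f2"} — the group's own items plus those of every descendant group.
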
paginationParams = None if pagination: try: @@ -126,14 +126,24 @@ def get_mandates( status_code=400, detail=f"Invalid pagination parameter: {str(e)}" ) - + from modules.routes.routeHelpers import ( handleFilterValuesInMemory, handleIdsInMemory, handleFilterValuesMode, handleIdsMode, parseCrossFilterPagination, + handleGroupingInRequest, applyGroupScopeFilter, ) appInterface = interfaceDbApp.getRootInterface() + groupCtx = handleGroupingInRequest(paginationParams, appInterface, "mandates") + + def _mandateItemsForAdmin(): + items = [] + for mid in adminMandateIds: + m = appInterface.getMandate(mid) + if m and getattr(m, "enabled", True): + items.append(m.model_dump() if hasattr(m, 'model_dump') else m if isinstance(m, dict) else vars(m)) + return items if mode == "filterValues": if not column: @@ -144,54 +154,42 @@ def get_mandates( values = appInterface.db.getDistinctColumnValues(Mandate, column, crossPagination) return JSONResponse(content=sorted(values, key=lambda v: str(v).lower())) else: - mandateItems = [] - for mid in adminMandateIds: - m = appInterface.getMandate(mid) - if m and getattr(m, "enabled", True): - mandateItems.append(m.model_dump() if hasattr(m, 'model_dump') else m if isinstance(m, dict) else vars(m)) + mandateItems = applyGroupScopeFilter(_mandateItemsForAdmin(), groupCtx.itemIds) return handleFilterValuesInMemory(mandateItems, column, pagination) if mode == "ids": if isPlatformAdmin: return handleIdsMode(appInterface.db, Mandate, pagination) else: - mandateItems = [] - for mid in adminMandateIds: - m = appInterface.getMandate(mid) - if m and getattr(m, "enabled", True): - mandateItems.append(m.model_dump() if hasattr(m, 'model_dump') else m if isinstance(m, dict) else vars(m)) + mandateItems = applyGroupScopeFilter(_mandateItemsForAdmin(), groupCtx.itemIds) return handleIdsInMemory(mandateItems, pagination) if isPlatformAdmin: result = appInterface.getAllMandates(pagination=paginationParams) - else: - allMandates = [] - for mandateId in adminMandateIds: - mandate = appInterface.getMandate(mandateId) - if mandate and getattr(mandate, "enabled", True): - mandateDict = mandate if isinstance(mandate, dict) else mandate.model_dump() if hasattr(mandate, 'model_dump') else vars(mandate) - allMandates.append(mandateDict) - result = allMandates - paginationParams = None - - if paginationParams and hasattr(result, 'items'): - return PaginatedResponse( - items=result.items, - pagination=PaginationMetadata( - currentPage=paginationParams.page, - pageSize=paginationParams.pageSize, - totalItems=result.totalItems, - totalPages=result.totalPages, - sort=paginationParams.sort, - filters=paginationParams.filters + items = result.items if hasattr(result, 'items') else (result if isinstance(result, list) else []) + items = applyGroupScopeFilter( + [i.model_dump() if hasattr(i, 'model_dump') else (i if isinstance(i, dict) else vars(i)) for i in items], + groupCtx.itemIds, + ) + if paginationParams and hasattr(result, 'items'): + return PaginatedResponse( + items=items, + pagination=PaginationMetadata( + currentPage=paginationParams.page, + pageSize=paginationParams.pageSize, + totalItems=result.totalItems, + totalPages=result.totalPages, + sort=paginationParams.sort, + filters=paginationParams.filters + ), + groupTree=groupCtx.groupTree, ) - ) + else: + return PaginatedResponse(items=items, pagination=None, groupTree=groupCtx.groupTree) else: - items = result if isinstance(result, list) else (result.items if hasattr(result, 'items') else result) - return PaginatedResponse( - items=items, 
- pagination=None - ) + mandateItems = applyGroupScopeFilter(_mandateItemsForAdmin(), groupCtx.itemIds) + return PaginatedResponse(items=mandateItems, pagination=None, groupTree=groupCtx.groupTree) + except HTTPException: raise except Exception as e: diff --git a/modules/routes/routeDataPrompts.py b/modules/routes/routeDataPrompts.py index ee99b912..84559ebb 100644 --- a/modules/routes/routeDataPrompts.py +++ b/modules/routes/routeDataPrompts.py @@ -44,27 +44,15 @@ def get_prompts( - filterValues: distinct values for a column (cross-filtered) - ids: all IDs matching current filters """ - from modules.routes.routeHelpers import handleFilterValuesInMemory, handleIdsInMemory, enrichRowsWithFkLabels + from modules.routes.routeHelpers import ( + handleFilterValuesInMemory, handleIdsInMemory, enrichRowsWithFkLabels, + handleGroupingInRequest, applyGroupScopeFilter, + ) + from modules.interfaces.interfaceDbApp import getInterface as getAppInterface - def _promptsToEnrichedDicts(promptItems): - dicts = [r.model_dump() if hasattr(r, 'model_dump') else (dict(r) if not isinstance(r, dict) else r) for r in promptItems] - enrichRowsWithFkLabels(dicts, Prompt) - return dicts - - if mode == "filterValues": - if not column: - raise HTTPException(status_code=400, detail="column parameter required for mode=filterValues") - managementInterface = interfaceDbManagement.getInterface(currentUser) - result = managementInterface.getAllPrompts(pagination=None) - items = _promptsToEnrichedDicts(result) - return handleFilterValuesInMemory(items, column, pagination) - - if mode == "ids": - managementInterface = interfaceDbManagement.getInterface(currentUser) - result = managementInterface.getAllPrompts(pagination=None) - items = _promptsToEnrichedDicts(result) - return handleIdsInMemory(items, pagination) + CONTEXT_KEY = "prompts" + # Parse pagination params early — needed for grouping in all modes paginationParams = None if pagination: try: @@ -74,12 +62,35 @@ def get_prompts( paginationParams = PaginationParams(**paginationDict) except (json.JSONDecodeError, ValueError) as e: raise HTTPException(status_code=400, detail=f"Invalid pagination parameter: {str(e)}") - + + appInterface = getAppInterface(currentUser) + groupCtx = handleGroupingInRequest(paginationParams, appInterface, CONTEXT_KEY) + + def _promptsToEnrichedDicts(promptItems): + dicts = [r.model_dump() if hasattr(r, 'model_dump') else (dict(r) if not isinstance(r, dict) else r) for r in promptItems] + enrichRowsWithFkLabels(dicts, Prompt) + return dicts + managementInterface = interfaceDbManagement.getInterface(currentUser) + + if mode == "filterValues": + if not column: + raise HTTPException(status_code=400, detail="column parameter required for mode=filterValues") + result = managementInterface.getAllPrompts(pagination=None) + items = _promptsToEnrichedDicts(result) + items = applyGroupScopeFilter(items, groupCtx.itemIds) + return handleFilterValuesInMemory(items, column, pagination) + + if mode == "ids": + result = managementInterface.getAllPrompts(pagination=None) + items = _promptsToEnrichedDicts(result) + items = applyGroupScopeFilter(items, groupCtx.itemIds) + return handleIdsInMemory(items, pagination) + result = managementInterface.getAllPrompts(pagination=paginationParams) - + if paginationParams: - items = _promptsToEnrichedDicts(result.items) + items = applyGroupScopeFilter(_promptsToEnrichedDicts(result.items), groupCtx.itemIds) return { "items": items, "pagination": PaginationMetadata( @@ -90,12 +101,14 @@ def get_prompts( 
sort=paginationParams.sort, filters=paginationParams.filters ).model_dump(), + "groupTree": groupCtx.groupTree, } else: - items = _promptsToEnrichedDicts(result) + items = applyGroupScopeFilter(_promptsToEnrichedDicts(result), groupCtx.itemIds) return { "items": items, "pagination": None, + "groupTree": groupCtx.groupTree, } diff --git a/modules/routes/routeDataUsers.py b/modules/routes/routeDataUsers.py index 6d72b763..25d20c39 100644 --- a/modules/routes/routeDataUsers.py +++ b/modules/routes/routeDataUsers.py @@ -208,6 +208,21 @@ def get_users( - GET /api/users/ (no pagination - returns all users in mandate) - GET /api/users/?pagination={"page":1,"pageSize":10,"sort":[]} """ + # Parse pagination early — needed for grouping in all modes + _paginationParams = None + if pagination: + try: + _pd = json.loads(pagination) + if _pd: + _pd = normalize_pagination_dict(_pd) + _paginationParams = PaginationParams(**_pd) + except (json.JSONDecodeError, ValueError) as e: + raise HTTPException(status_code=400, detail=f"Invalid pagination parameter: {str(e)}") + + from modules.routes.routeHelpers import handleGroupingInRequest as _handleGrouping, applyGroupScopeFilter as _applyGroupScope + _appInterfaceForGrouping = interfaceDbApp.getInterface(context.user, mandateId=context.mandateId) + _groupCtx = _handleGrouping(_paginationParams, _appInterfaceForGrouping, "users") + if mode == "filterValues": if not column: raise HTTPException(status_code=400, detail="column parameter required for mode=filterValues") @@ -217,27 +232,15 @@ def get_users( return _getUserFilterOrIds(context, pagination, idsMode=True) try: - paginationParams = None - if pagination: - try: - paginationDict = json.loads(pagination) - if paginationDict: - paginationDict = normalize_pagination_dict(paginationDict) - paginationParams = PaginationParams(**paginationDict) - except (json.JSONDecodeError, ValueError) as e: - raise HTTPException( - status_code=400, - detail=f"Invalid pagination parameter: {str(e)}" - ) - - appInterface = interfaceDbApp.getInterface(context.user, mandateId=context.mandateId) + paginationParams = _paginationParams + appInterface = _appInterfaceForGrouping if context.mandateId: # Get users for specific mandate using getUsersByMandate result = appInterface.getUsersByMandate(str(context.mandateId), paginationParams) - + if paginationParams and hasattr(result, 'items'): - enriched = enrichRowsWithFkLabels(_usersToDicts(result.items), User) + enriched = _applyGroupScope(enrichRowsWithFkLabels(_usersToDicts(result.items), User), _groupCtx.itemIds) return { "items": enriched, "pagination": PaginationMetadata( @@ -248,17 +251,18 @@ def get_users( sort=paginationParams.sort, filters=paginationParams.filters ).model_dump(), + "groupTree": _groupCtx.groupTree, } else: users = result if isinstance(result, list) else result.items if hasattr(result, 'items') else [] - enriched = enrichRowsWithFkLabels(_usersToDicts(users), User) - return {"items": enriched, "pagination": None} + enriched = _applyGroupScope(enrichRowsWithFkLabels(_usersToDicts(users), User), _groupCtx.itemIds) + return {"items": enriched, "pagination": None, "groupTree": _groupCtx.groupTree} elif context.isPlatformAdmin: # PlatformAdmin without mandateId — DB-level pagination via interface result = appInterface.getAllUsers(paginationParams) - + if paginationParams and hasattr(result, 'items'): - enriched = enrichRowsWithFkLabels(_usersToDicts(result.items), User) + enriched = _applyGroupScope(enrichRowsWithFkLabels(_usersToDicts(result.items), User), 
_groupCtx.itemIds) return { "items": enriched, "pagination": PaginationMetadata( @@ -269,11 +273,12 @@ def get_users( sort=paginationParams.sort, filters=paginationParams.filters ).model_dump(), + "groupTree": _groupCtx.groupTree, } else: users = result if isinstance(result, list) else (result.items if hasattr(result, 'items') else []) - enriched = enrichRowsWithFkLabels(_usersToDicts(users), User) - return {"items": enriched, "pagination": None} + enriched = _applyGroupScope(enrichRowsWithFkLabels(_usersToDicts(users), User), _groupCtx.itemIds) + return {"items": enriched, "pagination": None, "groupTree": _groupCtx.groupTree} else: # Non-SysAdmin without mandateId: aggregate users across all admin mandates rootInterface = getRootInterface() @@ -313,16 +318,16 @@ def get_users( ] from modules.routes.routeHelpers import applyFiltersAndSort as _applyFiltersAndSortHelper - filteredUsers = _applyFiltersAndSortHelper(allUsers, paginationParams) + filteredUsers = _applyGroupScope(_applyFiltersAndSortHelper(allUsers, paginationParams), _groupCtx.itemIds) enriched = enrichRowsWithFkLabels(filteredUsers, User) - + if paginationParams: import math totalItems = len(enriched) totalPages = math.ceil(totalItems / paginationParams.pageSize) if totalItems > 0 else 0 startIdx = (paginationParams.page - 1) * paginationParams.pageSize endIdx = startIdx + paginationParams.pageSize - + return { "items": enriched[startIdx:endIdx], "pagination": PaginationMetadata( @@ -333,9 +338,10 @@ def get_users( sort=paginationParams.sort, filters=paginationParams.filters ).model_dump(), + "groupTree": _groupCtx.groupTree, } else: - return {"items": enriched, "pagination": None} + return {"items": enriched, "pagination": None, "groupTree": _groupCtx.groupTree} except HTTPException: raise except Exception as e: diff --git a/modules/routes/routeHelpers.py b/modules/routes/routeHelpers.py index 37bfa3b2..9e8644ca 100644 --- a/modules/routes/routeHelpers.py +++ b/modules/routes/routeHelpers.py @@ -701,3 +701,157 @@ def paginateInMemory( offset = (paginationParams.page - 1) * paginationParams.pageSize pageItems = items[offset:offset + paginationParams.pageSize] return pageItems, totalItems + + +# --------------------------------------------------------------------------- +# Table Grouping helpers +# --------------------------------------------------------------------------- + +from dataclasses import dataclass, field as dc_field + + +@dataclass +class GroupingContext: + """ + Result of handleGroupingInRequest. + Carries the group tree for the response and the resolved item-ID set for + group-scope filtering (None = no active group scope). + """ + groupTree: Optional[list] # List[TableGroupNode] serialised as dicts — for response + itemIds: Optional[set] # Set[str] when groupId was set, else None + + +def _collectItemIds(nodes: list, groupId: str) -> Optional[set]: + """ + Recursively search *nodes* for a node whose id == groupId and collect + all itemIds from it and all its descendant subGroups. + Returns None if the group is not found. 
+ """ + for node in nodes: + nodeId = node.get("id") if isinstance(node, dict) else getattr(node, "id", None) + if nodeId == groupId: + ids: set = set() + _collectAllIds(node, ids) + return ids + subGroups = node.get("subGroups", []) if isinstance(node, dict) else getattr(node, "subGroups", []) + result = _collectItemIds(subGroups, groupId) + if result is not None: + return result + return None + + +def _collectAllIds(node, ids: set) -> None: + """Collect itemIds from a node and all its descendants into ids.""" + nodeItemIds = node.get("itemIds", []) if isinstance(node, dict) else getattr(node, "itemIds", []) + for iid in nodeItemIds: + ids.add(str(iid)) + subGroups = node.get("subGroups", []) if isinstance(node, dict) else getattr(node, "subGroups", []) + for child in subGroups: + _collectAllIds(child, ids) + + +def _removeGroupFromTree(nodes: list, groupId: str) -> list: + """Remove a group node (and all descendants) from the tree by id.""" + result = [] + for node in nodes: + nodeId = node.get("id") if isinstance(node, dict) else getattr(node, "id", None) + if nodeId == groupId: + continue # skip this node (remove it) + subGroups = node.get("subGroups", []) if isinstance(node, dict) else getattr(node, "subGroups", []) + filtered_sub = _removeGroupFromTree(subGroups, groupId) + if isinstance(node, dict): + node = {**node, "subGroups": filtered_sub} + result.append(node) + return result + + +def handleGroupingInRequest( + paginationParams: Optional[PaginationParams], + interface, + contextKey: str, +) -> GroupingContext: + """ + Central grouping handler — call at the start of every list route that + supports table grouping. + + Steps (in order): + 1. If paginationParams.saveGroupTree is set: + persist the new tree via interface.upsertTableGrouping, then clear + saveGroupTree from paginationParams so it is not treated as a filter. + 2. Load the current group tree from the DB (used in step 3 and response). + 3. If paginationParams.groupId is set: + resolve it to a Set[str] of itemIds (including all sub-groups), + then clear groupId from paginationParams so it is not treated as a + normal filter field. + 4. Return a GroupingContext with groupTree (for the response) and itemIds + (for applyGroupScopeFilter). + + The caller does NOT need to handle any grouping logic itself — just call + applyGroupScopeFilter(items, groupCtx.itemIds) and embed groupCtx.groupTree + in the response dict. 
+ """ + from modules.datamodels.datamodelPagination import TableGroupNode + + groupTree = None + itemIds = None + + if paginationParams is None: + try: + existing = interface.getTableGrouping(contextKey) + if existing: + groupTree = [n.model_dump() if hasattr(n, "model_dump") else n for n in existing.rootGroups] + except Exception as e: + logger.warning(f"handleGroupingInRequest: getTableGrouping failed: {e}") + return GroupingContext(groupTree=groupTree, itemIds=None) + + # Step 1: persist saveGroupTree if present + if paginationParams.saveGroupTree is not None: + try: + saved = interface.upsertTableGrouping(contextKey, paginationParams.saveGroupTree) + groupTree = [n.model_dump() if hasattr(n, "model_dump") else n for n in saved.rootGroups] + except Exception as e: + logger.error(f"handleGroupingInRequest: upsertTableGrouping failed: {e}") + paginationParams.saveGroupTree = None + + # Step 2: load current tree (only if not already set from save above) + if groupTree is None: + try: + existing = interface.getTableGrouping(contextKey) + if existing: + groupTree = [n.model_dump() if hasattr(n, "model_dump") else n for n in existing.rootGroups] + except Exception as e: + logger.warning(f"handleGroupingInRequest: getTableGrouping failed: {e}") + + # Step 3: resolve groupId to itemIds set + if paginationParams.groupId is not None: + targetGroupId = paginationParams.groupId + paginationParams.groupId = None # remove so it is not treated as a normal filter + if groupTree: + itemIds = _collectItemIds(groupTree, targetGroupId) + if itemIds is None: + logger.warning( + f"handleGroupingInRequest: groupId={targetGroupId!r} not found in tree " + f"for contextKey={contextKey!r} — returning empty set" + ) + itemIds = set() # unknown group → show nothing rather than everything + else: + # groupId sent but no tree saved yet → return empty (nothing belongs to any group) + logger.warning( + f"handleGroupingInRequest: groupId={targetGroupId!r} set but no tree exists " + f"for contextKey={contextKey!r} — returning empty set" + ) + itemIds = set() + + return GroupingContext(groupTree=groupTree, itemIds=itemIds) + + +def applyGroupScopeFilter(items: List[Dict[str, Any]], itemIds: Optional[set]) -> List[Dict[str, Any]]: + """ + Filter items to those whose "id" field is in itemIds. + Returns items unchanged when itemIds is None (no active group scope). + Works for both normal list items and for mode=ids / mode=filterValues flows + — call it before handleIdsInMemory / handleFilterValuesInMemory. 
+ """ + if itemIds is None: + return items + return [item for item in items if str(item.get("id", "")) in itemIds] diff --git a/modules/routes/routeSecurityClickup.py b/modules/routes/routeSecurityClickup.py index ca787391..d6f71d20 100644 --- a/modules/routes/routeSecurityClickup.py +++ b/modules/routes/routeSecurityClickup.py @@ -241,6 +241,29 @@ async def auth_connect_callback( ) interface.saveConnectionToken(token) + try: + from modules.shared.callbackRegistry import callbackRegistry + + if connection.knowledgeIngestionEnabled: + callbackRegistry.trigger( + "connection.established", + connectionId=connection.id, + authority=str(getattr(connection.authority, "value", connection.authority) or "clickup"), + userId=str(user.id), + ) + else: + logger.info( + "ingestion.connection.bootstrap.skipped — knowledge ingestion disabled by user", + extra={ + "event": "ingestion.connection.bootstrap.skipped", + "connectionId": connection.id, + "authority": "clickup", + "reason": "consent_disabled", + }, + ) + except Exception as _cbErr: + logger.warning("connection.established callback failed for %s: %s", connection.id, _cbErr) + return HTMLResponse( content=f""" diff --git a/modules/routes/routeSecurityGoogle.py b/modules/routes/routeSecurityGoogle.py index 523523ee..7b6c1c64 100644 --- a/modules/routes/routeSecurityGoogle.py +++ b/modules/routes/routeSecurityGoogle.py @@ -479,6 +479,29 @@ async def auth_connect_callback( ) interface.saveConnectionToken(token) + try: + from modules.shared.callbackRegistry import callbackRegistry + + if connection.knowledgeIngestionEnabled: + callbackRegistry.trigger( + "connection.established", + connectionId=connection.id, + authority=str(getattr(connection.authority, "value", connection.authority) or "google"), + userId=str(user.id), + ) + else: + logger.info( + "ingestion.connection.bootstrap.skipped — knowledge ingestion disabled by user", + extra={ + "event": "ingestion.connection.bootstrap.skipped", + "connectionId": connection.id, + "authority": "google", + "reason": "consent_disabled", + }, + ) + except Exception as _cbErr: + logger.warning("connection.established callback failed for %s: %s", connection.id, _cbErr) + return HTMLResponse( content=f""" diff --git a/modules/routes/routeSecurityMsft.py b/modules/routes/routeSecurityMsft.py index cc4cb87b..a2768a2b 100644 --- a/modules/routes/routeSecurityMsft.py +++ b/modules/routes/routeSecurityMsft.py @@ -420,6 +420,29 @@ async def auth_connect_callback( ) interface.saveConnectionToken(token) + try: + from modules.shared.callbackRegistry import callbackRegistry + + if connection.knowledgeIngestionEnabled: + callbackRegistry.trigger( + "connection.established", + connectionId=connection.id, + authority=str(getattr(connection.authority, "value", connection.authority) or "msft"), + userId=str(user.id), + ) + else: + logger.info( + "ingestion.connection.bootstrap.skipped — knowledge ingestion disabled by user", + extra={ + "event": "ingestion.connection.bootstrap.skipped", + "connectionId": connection.id, + "authority": "msft", + "reason": "consent_disabled", + }, + ) + except Exception as _cbErr: + logger.warning("connection.established callback failed for %s: %s", connection.id, _cbErr) + return HTMLResponse( content=f""" diff --git a/modules/serviceCenter/services/serviceAgent/coreTools/_documentTools.py b/modules/serviceCenter/services/serviceAgent/coreTools/_documentTools.py index a48e53b3..62413103 100644 --- a/modules/serviceCenter/services/serviceAgent/coreTools/_documentTools.py +++ 
b/modules/serviceCenter/services/serviceAgent/coreTools/_documentTools.py @@ -11,8 +11,6 @@ from modules.serviceCenter.services.serviceAgent.toolRegistry import ToolRegistr from modules.serviceCenter.services.serviceAgent.coreTools._helpers import ( _getOrCreateTempFolder, - _looksLikeBinary, - _resolveFileScope, _MAX_TOOL_RESULT_CHARS, ) @@ -392,65 +390,7 @@ def _registerDocumentTools(registry: ToolRegistry, services): if chunkMime: mimeType = chunkMime - # 2) File not yet indexed -> trigger extraction via ExtractionService, then retry - if not imageData and knowledgeService and not knowledgeService.isFileIndexed(fileId): - try: - chatService = services.chat - fileInfo = chatService.getFileInfo(fileId) - fileContent = chatService.getFileContent(fileId) - if fileContent and fileInfo: - rawData = fileContent.get("data", "") - if isinstance(rawData, str) and len(rawData) > 100: - rawBytes = _b64.b64decode(rawData) - elif isinstance(rawData, bytes): - rawBytes = rawData - else: - rawBytes = None - - if rawBytes: - from modules.serviceCenter.services.serviceExtraction.subRegistry import ExtractorRegistry - from modules.serviceCenter.services.serviceExtraction.subPipeline import runExtraction - from modules.datamodels.datamodelExtraction import ExtractionOptions - - fileMime = fileInfo.get("mimeType", "application/octet-stream") - fileName = fileInfo.get("fileName", fileId) - extracted = runExtraction( - ExtractorRegistry(), None, - rawBytes, fileName, fileMime, ExtractionOptions(), - ) - - contentObjects = [] - for part in extracted.parts: - tg = (part.typeGroup or "").lower() - ct = "image" if tg == "image" else "text" - if not part.data or not part.data.strip(): - continue - contentObjects.append({ - "contentObjectId": part.id, - "contentType": ct, - "data": part.data, - "contextRef": {"containerPath": fileName, "location": part.label, **(part.metadata or {})}, - }) - - if contentObjects: - _diFiId, _diMId = _resolveFileScope(fileId, context) - await knowledgeService.indexFile( - fileId=fileId, fileName=fileName, mimeType=fileMime, - userId=context.get("userId", ""), contentObjects=contentObjects, - featureInstanceId=_diFiId, - mandateId=_diMId, - ) - - chunks = knowledgeService._knowledgeDb.getContentChunks(fileId) - imageChunks = [c for c in (chunks or []) if c.get("contentType") == "image"] - if pageIndex is not None: - imageChunks = [c for c in imageChunks if c.get("contextRef", {}).get("pageIndex") == pageIndex] - if imageChunks: - imageData = imageChunks[0].get("data", "") - except Exception as extractErr: - logger.warning(f"describeImage: on-demand extraction failed: {extractErr}") - - # 3) Direct image file (not a container) - use raw file data + # 2) Direct image file (not a container) - use raw file data if not imageData: chatService = services.chat fileContent = chatService.getFileContent(fileId) @@ -460,7 +400,7 @@ def _registerDocumentTools(registry: ToolRegistry, services): imageData = fileContent.get("data", "") mimeType = fileMimeType - # 4) PDF page rendering: render the requested page as an image via PyMuPDF + # 3) PDF page rendering: render the requested page as an image via PyMuPDF if not imageData: chatService = services.chat fileInfo = chatService.getFileInfo(fileId) if hasattr(chatService, "getFileInfo") else None diff --git a/modules/serviceCenter/services/serviceAgent/coreTools/_helpers.py b/modules/serviceCenter/services/serviceAgent/coreTools/_helpers.py index 129de517..37116ee5 100644 --- a/modules/serviceCenter/services/serviceAgent/coreTools/_helpers.py +++ 
b/modules/serviceCenter/services/serviceAgent/coreTools/_helpers.py @@ -1,6 +1,6 @@ # Copyright (c) 2025 Patrick Motsch # All rights reserved. -"""Shared helpers for core agent tools (file scope, binary detection, temp folder).""" +"""Shared helpers for core agent tools (file scope, binary detection, group helpers).""" import logging import uuid @@ -46,39 +46,60 @@ def _looksLikeBinary(data: bytes, sampleSize: int = 1024) -> bool: return nonPrintable / len(sample) > 0.10 -def _getOrCreateInstanceFolder(chatService, featureInstanceId: str, mandateId: str = "") -> Optional[str]: - """Return the folder ID for a feature instance, creating it on first use. - - Delegates to interfaceDbManagement._ensureFeatureInstanceFolder. - AI tools call this when saving a file without an explicit folderId - so that instance-produced files land in a named folder automatically. - """ - try: - dbMgmt = chatService.interfaceDbComponent - return dbMgmt._ensureFeatureInstanceFolder(featureInstanceId, mandateId) - except Exception as e: - logger.warning(f"Could not get/create instance folder for {featureInstanceId}: {e}") - return None - - def _getOrCreateTempFolder(chatService) -> Optional[str]: - """Return the ID of the root-level 'Temp' folder, creating it if it doesn't exist.""" + """Deprecated stub: folder-based organisation has been replaced by grouping. + + Returns None unconditionally so callers skip the (now removed) folderId + assignment. Remove callers incrementally and delete this stub afterwards. + """ + logger.debug("_getOrCreateTempFolder called – folder support removed, returning None") + return None + + +async def _getOrCreateInstanceGroup( + appInterface, + featureInstanceId: str, + contextKey: str = "files/list", +) -> Optional[str]: + """Return groupId of the default group for a feature instance; create if needed.""" try: - allFolders = chatService.interfaceDbComponent.listFolders() - tempFolder = next( - (f for f in allFolders - if f.get("name") == "Temp" and not f.get("parentId")), - None, - ) - if tempFolder: - return tempFolder.get("id") - newFolder = chatService.interfaceDbComponent.createFolder("Temp", parentId=None) - return newFolder.get("id") if newFolder else None + existing = appInterface.getTableGrouping(contextKey) + nodes = [ + n.model_dump() if hasattr(n, "model_dump") else (n if isinstance(n, dict) else vars(n)) + for n in (existing.rootGroups if existing else []) + ] + + def _find(nds): + for nd in nds: + meta = nd.get("meta", {}) if isinstance(nd, dict) else getattr(nd, "meta", {}) + if (meta or {}).get("featureInstanceId") == featureInstanceId: + return nd.get("id") if isinstance(nd, dict) else getattr(nd, "id", None) + found = _find(nd.get("subGroups", []) if isinstance(nd, dict) else getattr(nd, "subGroups", [])) + if found: + return found + return None + + found = _find(nodes) + if found: + return found + newId = str(uuid.uuid4()) + nodes.append({"id": newId, "name": featureInstanceId, "itemIds": [], "subGroups": [], "meta": {"featureInstanceId": featureInstanceId}}) + appInterface.upsertTableGrouping(contextKey, nodes) + return newId except Exception as e: - logger.warning(f"Could not get/create Temp folder: {e}") + logger.error(f"_getOrCreateInstanceGroup: {e}") return None +async def _getOrCreateTempGroup( + appInterface, + sessionId: str, + contextKey: str = "files/list", +) -> Optional[str]: + """Return groupId of a temporary group for a session; create if needed.""" + return await _getOrCreateInstanceGroup(appInterface, f"_temp_{sessionId}", contextKey) + + def 
_attachFileAsChatDocument( services: Any, fileItem: Any, diff --git a/modules/serviceCenter/services/serviceAgent/coreTools/_workspaceTools.py b/modules/serviceCenter/services/serviceAgent/coreTools/_workspaceTools.py index 9a6af658..3b9f5945 100644 --- a/modules/serviceCenter/services/serviceAgent/coreTools/_workspaceTools.py +++ b/modules/serviceCenter/services/serviceAgent/coreTools/_workspaceTools.py @@ -11,10 +11,9 @@ from modules.serviceCenter.services.serviceAgent.toolRegistry import ToolRegistr from modules.serviceCenter.services.serviceAgent.coreTools._helpers import ( _attachFileAsChatDocument, _formatToolFileResult, - _getOrCreateInstanceFolder, - _getOrCreateTempFolder, + _getOrCreateInstanceGroup, + _getOrCreateTempGroup, _looksLikeBinary, - _resolveFileScope, _MAX_TOOL_RESULT_CHARS, ) @@ -50,6 +49,7 @@ def _registerWorkspaceTools(registry: ToolRegistry, services): return ToolResult(toolCallId="", toolName="readFile", success=False, error="fileId is required") try: knowledgeService = services.getService("knowledge") if hasattr(services, "getService") else None + fileStatus = None # 1) Knowledge Store: return already-extracted text chunks if knowledgeService: @@ -77,7 +77,8 @@ def _registerWorkspaceTools(registry: ToolRegistry, services): data=f"[File {fileId} is currently being processed (status: {fileStatus}). Try again shortly.]", ) - # 2) Not indexed yet: try on-demand extraction + # 2) Not indexed yet: inspect file type to decide how to serve the agent + # (binary -> instruct agent to wait / re-upload; text -> decode raw bytes inline) chatService = services.chat fileInfo = chatService.getFileInfo(fileId) if not fileInfo: @@ -100,83 +101,14 @@ def _registerWorkspaceTools(registry: ToolRegistry, services): isBinary = _looksLikeBinary(rawBytes) if isBinary: - try: - from modules.serviceCenter.services.serviceExtraction.subRegistry import ExtractorRegistry, ChunkerRegistry - from modules.serviceCenter.services.serviceExtraction.subPipeline import runExtraction - from modules.datamodels.datamodelExtraction import ExtractionOptions - - extracted = runExtraction( - ExtractorRegistry(), ChunkerRegistry(), - rawBytes, fileName, mimeType, ExtractionOptions(), - ) - - contentObjects = [] - for part in extracted.parts: - tg = (part.typeGroup or "").lower() - ct = "image" if tg == "image" else "text" - if not part.data or not part.data.strip(): - continue - contentObjects.append({ - "contentObjectId": part.id, - "contentType": ct, - "data": part.data, - "contextRef": { - "containerPath": fileName, - "location": part.label or "file", - **(part.metadata or {}), - }, - }) - - if contentObjects: - if knowledgeService: - try: - userId = context.get("userId", "") - _fiId, _mId = _resolveFileScope(fileId, context) - await knowledgeService.indexFile( - fileId=fileId, fileName=fileName, mimeType=mimeType, - userId=userId, contentObjects=contentObjects, - featureInstanceId=_fiId, - mandateId=_mId, - ) - except Exception as e: - logger.warning(f"readFile: knowledge indexing failed for {fileId}: {e}") - - joined = "" - if knowledgeService: - _chunks = knowledgeService._knowledgeDb.getContentChunks(fileId) - _textChunks = [ - c for c in (_chunks or []) - if c.get("contentType") != "image" and c.get("data") - ] - if _textChunks: - joined = "\n\n".join(c["data"] for c in _textChunks) - if not joined: - textParts = [o["data"] for o in contentObjects if o["contentType"] != "image"] - joined = "\n\n".join(textParts) if textParts else "" - if joined: - chunked = _applyOffsetLimit(joined, offset, limit) - 
if chunked is not None: - return ToolResult(toolCallId="", toolName="readFile", success=True, data=chunked) - if len(joined) > _MAX_TOOL_RESULT_CHARS: - joined = joined[:_MAX_TOOL_RESULT_CHARS] + f"\n\n[Truncated – showing first {_MAX_TOOL_RESULT_CHARS} chars of {len(joined)}. Use offset/limit to read specific sections.]" - return ToolResult( - toolCallId="", toolName="readFile", success=True, - data=joined, - ) - imgCount = sum(1 for o in contentObjects if o["contentType"] == "image") - return ToolResult( - toolCallId="", toolName="readFile", success=True, - data=f"[Extracted {len(contentObjects)} content objects from '{fileName}' " - f"({imgCount} images, no readable text). " - f"Use describeImage(fileId='{fileId}') to analyze visual content.]", - ) - except Exception as extractErr: - logger.warning(f"readFile extraction failed for {fileId} ({fileName}): {extractErr}") - return ToolResult( toolCallId="", toolName="readFile", success=True, - data=f"[Binary file: '{fileName}', type={mimeType}, size={len(rawBytes)} bytes. " - f"Text extraction not available. Use describeImage for images.]", + data=( + f"[File '{fileName}' ({mimeType}) is not yet indexed " + f"(status: {fileStatus or 'unknown'}). Indexing runs automatically " + f"on upload. Please wait a few seconds and retry, or re-upload the file. " + f"For visual content use describeImage(fileId='{fileId}').]" + ), ) # 3) Text file: decode raw bytes @@ -237,7 +169,6 @@ def _registerWorkspaceTools(registry: ToolRegistry, services): try: chatService = services.chat files = chatService.listFiles( - folderId=args.get("folderId"), tags=args.get("tags"), search=args.get("search"), ) @@ -290,18 +221,6 @@ def _registerWorkspaceTools(registry: ToolRegistry, services): except Exception as e: return ToolResult(toolCallId="", toolName="searchInFileContent", success=False, error=str(e)) - async def _listFolders(args: Dict[str, Any], context: Dict[str, Any]): - try: - chatService = services.chat - folders = chatService.listFolders(parentId=args.get("parentId")) - folderList = "\n".join( - f"- {f.get('name', 'unnamed')} (id: {f.get('id', '?')})" - for f in folders - ) if folders else "No folders found." 
- return ToolResult(toolCallId="", toolName="listFolders", success=True, data=folderList) - except Exception as e: - return ToolResult(toolCallId="", toolName="listFolders", success=False, error=str(e)) - async def _webSearch(args: Dict[str, Any], context: Dict[str, Any]): query = args.get("query", "") if not query: @@ -339,35 +258,6 @@ def _registerWorkspaceTools(registry: ToolRegistry, services): except Exception as e: return ToolResult(toolCallId="", toolName="tagFile", success=False, error=str(e)) - async def _moveFile(args: Dict[str, Any], context: Dict[str, Any]): - fileId = args.get("fileId", "") - targetFolderId = args.get("targetFolderId") - if not fileId: - return ToolResult(toolCallId="", toolName="moveFile", success=False, error="fileId is required") - try: - chatService = services.chat - chatService.interfaceDbComponent.updateFile(fileId, {"folderId": targetFolderId}) - return ToolResult( - toolCallId="", toolName="moveFile", success=True, - data=f"File {fileId} moved to folder {targetFolderId or 'root'}" - ) - except Exception as e: - return ToolResult(toolCallId="", toolName="moveFile", success=False, error=str(e)) - - async def _createFolder(args: Dict[str, Any], context: Dict[str, Any]): - name = args.get("name", "") - if not name: - return ToolResult(toolCallId="", toolName="createFolder", success=False, error="name is required") - try: - chatService = services.chat - folder = chatService.createFolder(name=name, parentId=args.get("parentId")) - return ToolResult( - toolCallId="", toolName="createFolder", success=True, - data=f"Folder '{name}' created (id: {folder.get('id', '?')})" - ) - except Exception as e: - return ToolResult(toolCallId="", toolName="createFolder", success=False, error=str(e)) - async def _writeFile(args: Dict[str, Any], context: Dict[str, Any]): content = args.get("content", "") mode = args.get("mode", "create") @@ -422,12 +312,52 @@ def _registerWorkspaceTools(registry: ToolRegistry, services): fiId = context.get("featureInstanceId") or (services.featureInstanceId if services else "") if fiId: dbMgmt.updateFile(fileItem.id, {"featureInstanceId": fiId}) - if args.get("folderId"): - dbMgmt.updateFile(fileItem.id, {"folderId": args["folderId"]}) + if args.get("groupId"): + try: + appIface = chatService.interfaceDbApp + existing = appIface.getTableGrouping("files/list") + nodes = [n.model_dump() if hasattr(n, "model_dump") else (n if isinstance(n, dict) else vars(n)) for n in (existing.rootGroups if existing else [])] + def _addToGroup(nds, gid, fid): + for nd in nds: + nid = nd.get("id") if isinstance(nd, dict) else getattr(nd, "id", None) + if nid == gid: + ids = list(nd.get("itemIds", []) if isinstance(nd, dict) else getattr(nd, "itemIds", [])) + if fid not in ids: + ids.append(fid) + if isinstance(nd, dict): + nd["itemIds"] = ids + return True + if _addToGroup(nd.get("subGroups", []) if isinstance(nd, dict) else getattr(nd, "subGroups", []), gid, fid): + return True + return False + _addToGroup(nodes, args["groupId"], fileItem.id) + appIface.upsertTableGrouping("files/list", nodes) + except Exception as _ge: + logger.warning(f"writeFile: failed to add file to group {args['groupId']}: {_ge}") elif fiId: - instanceFolderId = _getOrCreateInstanceFolder(chatService, fiId, context.get("mandateId", "")) - if instanceFolderId: - dbMgmt.updateFile(fileItem.id, {"folderId": instanceFolderId}) + try: + appIface = chatService.interfaceDbApp + instanceGroupId = await _getOrCreateInstanceGroup(appIface, fiId) + if instanceGroupId: + existing = 
appIface.getTableGrouping("files/list") + nodes = [n.model_dump() if hasattr(n, "model_dump") else (n if isinstance(n, dict) else vars(n)) for n in (existing.rootGroups if existing else [])] + def _addToGroup2(nds, gid, fid): + for nd in nds: + nid = nd.get("id") if isinstance(nd, dict) else getattr(nd, "id", None) + if nid == gid: + ids = list(nd.get("itemIds", []) if isinstance(nd, dict) else getattr(nd, "itemIds", [])) + if fid not in ids: + ids.append(fid) + if isinstance(nd, dict): + nd["itemIds"] = ids + return True + if _addToGroup2(nd.get("subGroups", []) if isinstance(nd, dict) else getattr(nd, "subGroups", []), gid, fid): + return True + return False + _addToGroup2(nodes, instanceGroupId, fileItem.id) + appIface.upsertTableGrouping("files/list", nodes) + except Exception as _ge: + logger.warning(f"writeFile: failed to add file to instance group for {fiId}: {_ge}") if args.get("tags"): dbMgmt.updateFile(fileItem.id, {"tags": args["tags"]}) @@ -480,13 +410,13 @@ def _registerWorkspaceTools(registry: ToolRegistry, services): registry.register( "listFiles", _listFiles, description=( - "List files in the local workspace. Filter by folder, tags, or search term. " + "List files in the local workspace. Filter by tags or search term. " + "To filter by group, use listItemsInGroup. " "For external data sources, use browseDataSource instead." ), parameters={ "type": "object", "properties": { - "folderId": {"type": "string", "description": "Filter by folder ID"}, "tags": {"type": "array", "items": {"type": "string"}, "description": "Filter by tags (any match)"}, "search": {"type": "string", "description": "Search in file names and descriptions"}, } @@ -513,18 +443,6 @@ def _registerWorkspaceTools(registry: ToolRegistry, services): readOnly=True ) - registry.register( - "listFolders", _listFolders, - description="List folders in the local workspace. For external data sources, use browseDataSource instead.", - parameters={ - "type": "object", - "properties": { - "parentId": {"type": "string", "description": "Parent folder ID (omit for root)"}, - } - }, - readOnly=True - ) - registry.register( "webSearch", _webSearch, description="Search the web for general information. 
Use readUrl to fetch content from a known URL instead.", @@ -550,34 +468,6 @@ def _registerWorkspaceTools(registry: ToolRegistry, services): readOnly=False ) - registry.register( - "moveFile", _moveFile, - description="Move a file to a different folder in the local workspace.", - parameters={ - "type": "object", - "properties": { - "fileId": {"type": "string", "description": "The file ID to move"}, - "targetFolderId": {"type": "string", "description": "Target folder ID (null for root)"}, - }, - "required": ["fileId"] - }, - readOnly=False - ) - - registry.register( - "createFolder", _createFolder, - description="Create a new folder in the local workspace.", - parameters={ - "type": "object", - "properties": { - "name": {"type": "string", "description": "Folder name"}, - "parentId": {"type": "string", "description": "Parent folder ID (omit for root)"}, - }, - "required": ["name"] - }, - readOnly=False - ) - registry.register( "writeFile", _writeFile, description=( @@ -598,7 +488,7 @@ def _registerWorkspaceTools(registry: ToolRegistry, services): "content": {"type": "string", "description": "Content to write/append"}, "mode": {"type": "string", "enum": ["create", "append", "overwrite"], "description": "Write mode (default: create)"}, "fileId": {"type": "string", "description": "File ID (required for mode=append/overwrite)"}, - "folderId": {"type": "string", "description": "Target folder ID (mode=create only)"}, + "groupId": {"type": "string", "description": "Group ID to place the file in (mode=create only). Omit to use the instance default group."}, "tags": {"type": "array", "items": {"type": "string"}, "description": "Tags (mode=create only)"}, }, "required": ["content"] @@ -758,55 +648,7 @@ def _registerWorkspaceTools(registry: ToolRegistry, services): readOnly=True ) - # ---- Phase 2: deleteFolder, renameFolder, moveFolder, copyFile, editFile ---- - - async def _deleteFolder(args: Dict[str, Any], context: Dict[str, Any]): - folderId = args.get("folderId", "") - recursive = args.get("recursive", False) - if not folderId: - return ToolResult(toolCallId="", toolName="deleteFolder", success=False, error="folderId is required") - try: - chatService = services.chat - result = chatService.interfaceDbComponent.deleteFolder(folderId, recursive=recursive) - summary = f"Deleted {result.get('deletedFolders', 1)} folder(s) and {result.get('deletedFiles', 0)} file(s)" - return ToolResult( - toolCallId="", toolName="deleteFolder", success=True, data=summary, - sideEvents=[{"type": "folderDeleted", "data": {"folderId": folderId, **result}}], - ) - except Exception as e: - return ToolResult(toolCallId="", toolName="deleteFolder", success=False, error=str(e)) - - async def _renameFolder(args: Dict[str, Any], context: Dict[str, Any]): - folderId = args.get("folderId", "") - newName = args.get("newName", "") - if not folderId or not newName: - return ToolResult(toolCallId="", toolName="renameFolder", success=False, error="folderId and newName are required") - try: - chatService = services.chat - chatService.interfaceDbComponent.renameFolder(folderId, newName) - return ToolResult( - toolCallId="", toolName="renameFolder", success=True, - data=f"Folder {folderId} renamed to '{newName}'", - sideEvents=[{"type": "folderUpdated", "data": {"folderId": folderId, "name": newName}}], - ) - except Exception as e: - return ToolResult(toolCallId="", toolName="renameFolder", success=False, error=str(e)) - - async def _moveFolder(args: Dict[str, Any], context: Dict[str, Any]): - folderId = args.get("folderId", "") - 
targetParentId = args.get("targetParentId") - if not folderId: - return ToolResult(toolCallId="", toolName="moveFolder", success=False, error="folderId is required") - try: - chatService = services.chat - chatService.interfaceDbComponent.moveFolder(folderId, targetParentId) - return ToolResult( - toolCallId="", toolName="moveFolder", success=True, - data=f"Folder {folderId} moved to {targetParentId or 'root'}", - sideEvents=[{"type": "folderUpdated", "data": {"folderId": folderId, "parentId": targetParentId}}], - ) - except Exception as e: - return ToolResult(toolCallId="", toolName="moveFolder", success=False, error=str(e)) + # ---- Phase 2: copyFile, editFile ---- async def _copyFile(args: Dict[str, Any], context: Dict[str, Any]): fileId = args.get("fileId", "") @@ -816,7 +658,6 @@ def _registerWorkspaceTools(registry: ToolRegistry, services): chatService = services.chat copiedFile = chatService.interfaceDbComponent.copyFile( fileId, - targetFolderId=args.get("targetFolderId"), newFileName=args.get("newFileName"), ) return ToolResult( @@ -891,48 +732,6 @@ def _registerWorkspaceTools(registry: ToolRegistry, services): except Exception as e: return ToolResult(toolCallId="", toolName="replaceInFile", success=False, error=str(e)) - registry.register( - "deleteFolder", _deleteFolder, - description="Delete a folder from the local workspace. Set recursive=true to delete all contents.", - parameters={ - "type": "object", - "properties": { - "folderId": {"type": "string", "description": "The folder ID to delete"}, - "recursive": {"type": "boolean", "description": "If true, delete folder and all contents (files and subfolders). Default: false"}, - }, - "required": ["folderId"] - }, - readOnly=False - ) - - registry.register( - "renameFolder", _renameFolder, - description="Rename a folder in the local workspace.", - parameters={ - "type": "object", - "properties": { - "folderId": {"type": "string", "description": "The folder ID to rename"}, - "newName": {"type": "string", "description": "New folder name"}, - }, - "required": ["folderId", "newName"] - }, - readOnly=False - ) - - registry.register( - "moveFolder", _moveFolder, - description="Move a folder to a different parent in the local workspace.", - parameters={ - "type": "object", - "properties": { - "folderId": {"type": "string", "description": "The folder ID to move"}, - "targetParentId": {"type": "string", "description": "Target parent folder ID (null/omit for root)"}, - }, - "required": ["folderId"] - }, - readOnly=False - ) - registry.register( "copyFile", _copyFile, description="Create an independent copy of a file in the local workspace.", @@ -940,7 +739,6 @@ def _registerWorkspaceTools(registry: ToolRegistry, services): "type": "object", "properties": { "fileId": {"type": "string", "description": "The file ID to copy"}, - "targetFolderId": {"type": "string", "description": "Target folder for the copy (default: same folder)"}, "newFileName": {"type": "string", "description": "New file name (default: same name, auto-numbered if duplicate)"}, }, "required": ["fileId"] @@ -948,6 +746,137 @@ def _registerWorkspaceTools(registry: ToolRegistry, services): readOnly=False ) + # ---- Group tools (replaces folder-based tools) ---- + + async def _listGroups(args: Dict[str, Any], context: Dict[str, Any]): + contextKey = args.get("contextKey", "files/list") + try: + chatService = services.chat + appInterface = chatService.interfaceDbApp + existing = appInterface.getTableGrouping(contextKey) + if not existing: + return ToolResult(toolCallId="", 
toolName="listGroups", success=True, data="No groups found.") + + def _flatten(nodes, depth=0): + result = [] + for n in nodes: + nd = n.model_dump() if hasattr(n, "model_dump") else (n if isinstance(n, dict) else vars(n)) + result.append({"id": nd.get("id"), "name": nd.get("name"), "depth": depth, "itemCount": len(nd.get("itemIds", []))}) + result.extend(_flatten(nd.get("subGroups", []), depth + 1)) + return result + + groups = _flatten(existing.rootGroups) + lines = "\n".join( + f"{' ' * g['depth']}- {g['name']} (id: {g['id']}, items: {g['itemCount']})" + for g in groups + ) if groups else "No groups found." + return ToolResult(toolCallId="", toolName="listGroups", success=True, data=lines) + except Exception as e: + return ToolResult(toolCallId="", toolName="listGroups", success=False, error=str(e)) + + async def _listItemsInGroup(args: Dict[str, Any], context: Dict[str, Any]): + groupId = args.get("groupId", "") + contextKey = args.get("contextKey", "files/list") + if not groupId: + return ToolResult(toolCallId="", toolName="listItemsInGroup", success=False, error="groupId is required") + try: + from modules.routes.routeHelpers import _collectItemIds + chatService = services.chat + appInterface = chatService.interfaceDbApp + existing = appInterface.getTableGrouping(contextKey) + if not existing: + return ToolResult(toolCallId="", toolName="listItemsInGroup", success=True, data="No groups found.") + nodes = [n.model_dump() if hasattr(n, "model_dump") else (n if isinstance(n, dict) else vars(n)) for n in existing.rootGroups] + ids = _collectItemIds(nodes, groupId) + itemList = list(ids) if ids else [] + return ToolResult( + toolCallId="", toolName="listItemsInGroup", success=True, + data="\n".join(f"- {fid}" for fid in itemList) if itemList else "No items in group.", + ) + except Exception as e: + return ToolResult(toolCallId="", toolName="listItemsInGroup", success=False, error=str(e)) + + async def _addItemsToGroup(args: Dict[str, Any], context: Dict[str, Any]): + groupId = args.get("groupId", "") + itemIds = args.get("itemIds", []) + contextKey = args.get("contextKey", "files/list") + if not groupId: + return ToolResult(toolCallId="", toolName="addItemsToGroup", success=False, error="groupId is required") + if not itemIds: + return ToolResult(toolCallId="", toolName="addItemsToGroup", success=False, error="itemIds is required") + try: + chatService = services.chat + appInterface = chatService.interfaceDbApp + existing = appInterface.getTableGrouping(contextKey) + nodes = [n.model_dump() if hasattr(n, "model_dump") else (n if isinstance(n, dict) else vars(n)) for n in (existing.rootGroups if existing else [])] + + def _add(nds): + for nd in nds: + nid = nd.get("id") if isinstance(nd, dict) else getattr(nd, "id", None) + if nid == groupId: + existing_ids = list(nd.get("itemIds", []) if isinstance(nd, dict) else getattr(nd, "itemIds", [])) + for fid in itemIds: + if fid not in existing_ids: + existing_ids.append(fid) + if isinstance(nd, dict): + nd["itemIds"] = existing_ids + return True + if _add(nd.get("subGroups", []) if isinstance(nd, dict) else getattr(nd, "subGroups", [])): + return True + return False + + found = _add(nodes) + if not found: + return ToolResult(toolCallId="", toolName="addItemsToGroup", success=False, error=f"Group {groupId} not found") + appInterface.upsertTableGrouping(contextKey, nodes) + return ToolResult( + toolCallId="", toolName="addItemsToGroup", success=True, + data=f"Added {len(itemIds)} item(s) to group {groupId}", + ) + except Exception as e: + return 
ToolResult(toolCallId="", toolName="addItemsToGroup", success=False, error=str(e)) + + registry.register( + "listGroups", _listGroups, + description="List all groups in the file grouping tree. Groups replace folders for organising files.", + parameters={ + "type": "object", + "properties": { + "contextKey": {"type": "string", "description": "Grouping context key (default: 'files/list')"}, + } + }, + readOnly=True + ) + + registry.register( + "listItemsInGroup", _listItemsInGroup, + description="List all file IDs assigned to a specific group (includes sub-groups recursively).", + parameters={ + "type": "object", + "properties": { + "groupId": {"type": "string", "description": "The group ID to inspect"}, + "contextKey": {"type": "string", "description": "Grouping context key (default: 'files/list')"}, + }, + "required": ["groupId"] + }, + readOnly=True + ) + + registry.register( + "addItemsToGroup", _addItemsToGroup, + description="Add one or more file IDs to an existing group.", + parameters={ + "type": "object", + "properties": { + "groupId": {"type": "string", "description": "The group ID to add files to"}, + "itemIds": {"type": "array", "items": {"type": "string"}, "description": "List of file IDs to add"}, + "contextKey": {"type": "string", "description": "Grouping context key (default: 'files/list')"}, + }, + "required": ["groupId", "itemIds"] + }, + readOnly=False + ) + registry.register( "replaceInFile", _replaceInFile, description=( diff --git a/modules/serviceCenter/services/serviceAgent/mainServiceAgent.py b/modules/serviceCenter/services/serviceAgent/mainServiceAgent.py index fdf172aa..372ec5b2 100644 --- a/modules/serviceCenter/services/serviceAgent/mainServiceAgent.py +++ b/modules/serviceCenter/services/serviceAgent/mainServiceAgent.py @@ -268,24 +268,19 @@ class AgentService: info = chatService.getFileInfo(fid) if not info: - folderInfo = chatService.interfaceDbComponent.getFolder(fid) - if folderInfo: - folderName = folderInfo.get("name", fid) - folderFiles = chatService.listFiles(folderId=fid) - desc = f"### Folder: {folderName}\n - id: {fid}\n - type: folder\n - contains: {len(folderFiles)} file(s)" - if folderFiles: - desc += "\n - files:" - for ff in folderFiles[:30]: - ffName = ff.get("fileName", "?") - ffId = ff.get("id", "?") - ffMime = ff.get("mimeType", "?") - ffSize = ff.get("fileSize", ff.get("size", "?")) - desc += f"\n * {ffName} (id: {ffId}, type: {ffMime}, size: {ffSize} bytes)" - if len(folderFiles) > 30: - desc += f"\n ... and {len(folderFiles) - 30} more files" - desc += f'\nUse `listFiles(folderId="{fid}")` to get the full file list, then `readFile(fileId)` to read individual files.' - fileDescriptions.append(desc) - continue + # Check if fid is a group ID + try: + groupFileIds = chatService.listFilesInGroup(fid) + if groupFileIds: + allGroups = chatService.listGroups() + groupInfo = next((g for g in allGroups if g.get("id") == fid), None) + groupName = groupInfo.get("name", fid) if groupInfo else fid + desc = f"### Group: {groupName}\n - id: {fid}\n - type: group\n - contains: {len(groupFileIds)} file(s)" + desc += f'\nUse `listItemsInGroup(groupId="{fid}")` to get file IDs, then `readFile(fileId)` to read each.' 
+ fileDescriptions.append(desc) + continue + except Exception: + pass fileDescriptions.append(f"### File id: {fid}") continue @@ -333,7 +328,7 @@ class AgentService: "These files/folders have been uploaded and processed through the extraction pipeline.\n" "Use `readFile(fileId)` to read text content, `readContentObjects(fileId)` for structured access, " "or `describeImage(fileId)` for image analysis.\n" - "For folders, use `listFiles(folderId)` to get the files inside, then `readFile(fileId)` for each.\n" + "For groups, use `listItemsInGroup(groupId)` to get the file IDs inside, then `readFile(fileId)` for each.\n" "For large PDFs/DOCX, avoid huge `renderDocument` tool JSON: build markdown with " "`writeFile` (create + append), then `renderDocument(sourceFileId=that file id, outputFormat=...)`.\n" "For small docs you may pass `content` inline. Embed images with `![alt](file:fileId)` in markdown.\n\n" diff --git a/modules/serviceCenter/services/serviceAi/mainServiceAi.py b/modules/serviceCenter/services/serviceAi/mainServiceAi.py index 3b800fb5..bcdb9552 100644 --- a/modules/serviceCenter/services/serviceAi/mainServiceAi.py +++ b/modules/serviceCenter/services/serviceAi/mainServiceAi.py @@ -168,12 +168,29 @@ class AiService: # SPEECH_TEAMS: Dedicated pipeline, bypasses standard model selection if request.options and request.options.operationType == OperationTypeEnum.SPEECH_TEAMS: return await self._handleSpeechTeams(request) - - # FAIL-SAFE: Pre-flight billing validation (like 0 CHF credit card check) - self._preflightBillingCheck() - - # Balance & provider permission checks - await self._checkBillingBeforeAiCall() + + _opType = request.options.operationType if request.options else None + _isNeutralizationCall = _opType in ( + OperationTypeEnum.NEUTRALIZATION_TEXT, + OperationTypeEnum.NEUTRALIZATION_IMAGE, + ) + + if not _isNeutralizationCall: + # FAIL-SAFE: Pre-flight billing validation (like 0 CHF credit card check) + self._preflightBillingCheck() + # Balance & provider permission checks + await self._checkBillingBeforeAiCall() + else: + # Neutralization calls are system-level operations (connector anonymization). + # They run without a mandate context (e.g. personal-scope connections) and + # are billed the same way as embedding calls: best-effort, skipped when no + # billing settings exist for an empty mandate. + logger.debug( + "callAi: skipping billing preflight for neutralization call " + "(operationType=%s, user=%s)", + _opType, + getattr(getattr(self.services, 'user', None), 'id', 'unknown'), + ) # Calculate effective allowedProviders: RBAC ∩ Workflow effectiveProviders = self._calculateEffectiveProviders() @@ -227,8 +244,15 @@ class AiService: Rehydration happens on the final AiCallResponse (not on individual str deltas). 
""" await self.ensureAiObjectsInitialized() - self._preflightBillingCheck() - await self._checkBillingBeforeAiCall() + + _streamOpType = request.options.operationType if request.options else None + _isNeutralizationStream = _streamOpType in ( + OperationTypeEnum.NEUTRALIZATION_TEXT, + OperationTypeEnum.NEUTRALIZATION_IMAGE, + ) + if not _isNeutralizationStream: + self._preflightBillingCheck() + await self._checkBillingBeforeAiCall() effectiveProviders = self._calculateEffectiveProviders() if effectiveProviders and request.options: diff --git a/modules/serviceCenter/services/serviceChat/mainServiceChat.py b/modules/serviceCenter/services/serviceChat/mainServiceChat.py index 077596b8..0e69344a 100644 --- a/modules/serviceCenter/services/serviceChat/mainServiceChat.py +++ b/modules/serviceCenter/services/serviceChat/mainServiceChat.py @@ -413,7 +413,7 @@ class ChatService: return None def getFileInfo(self, fileId: str) -> Dict[str, Any]: - """Get file information including new fields (tags, folderId, description, status).""" + """Get file information including new fields (tags, description, status).""" fileItem = self.interfaceDbComponent.getFile(fileId) if fileItem: return { @@ -424,7 +424,6 @@ class ChatService: "fileHash": fileItem.fileHash, "creationDate": fileItem.sysCreatedAt, "tags": getattr(fileItem, "tags", None), - "folderId": getattr(fileItem, "folderId", None), "description": getattr(fileItem, "description", None), "status": getattr(fileItem, "status", None), } @@ -443,14 +442,12 @@ class ChatService: def listFiles( self, - folderId: str = None, tags: List[str] = None, search: str = None, ) -> List[Dict[str, Any]]: """List files for the current user with optional filters. Args: - folderId: Filter by folder (None = root / all). tags: Filter by tags (any match). search: Search in fileName and description. @@ -463,10 +460,6 @@ class ChatService: allFiles = self.interfaceDbComponent.getAllFiles() results = [] for fileItem in allFiles: - if folderId is not None: - if fileItem.get("folderId") != folderId: - continue - if tags: itemTags = fileItem.get("tags") or [] if not any(t in itemTags for t in tags): @@ -486,27 +479,40 @@ class ChatService: "fileSize": fileItem.get("fileSize"), "creationDate": fileItem.get("sysCreatedAt"), "tags": fileItem.get("tags"), - "folderId": fileItem.get("folderId"), "description": fileItem.get("description"), "status": fileItem.get("status"), }) return results - def listFolders(self, parentId: str = None) -> List[Dict[str, Any]]: - """List file folders for the current user. + def listGroups(self, contextKey: str = "files/list") -> list: + """List all groups in the groupTree for the current context.""" + try: + existing = self.interfaceDbApp.getTableGrouping(contextKey) + if not existing: + return [] + def _flatten(nodes, depth=0): + result = [] + for n in nodes: + nd = n.model_dump() if hasattr(n, "model_dump") else (n if isinstance(n, dict) else vars(n)) + result.append({"id": nd.get("id"), "name": nd.get("name"), "depth": depth, "itemCount": len(nd.get("itemIds", []))}) + result.extend(_flatten(nd.get("subGroups", []), depth + 1)) + return result + return _flatten(existing.rootGroups) + except Exception as e: + return [] - Args: - parentId: Optional parent folder ID to filter by. - None = return ALL folders (for tree building). - - Returns: - List of folder dicts. 
- """ - return self.interfaceDbComponent.listFolders(parentId=parentId) - - def createFolder(self, name: str, parentId: str = None) -> Dict[str, Any]: - """Create a new file folder with unique name validation.""" - return self.interfaceDbComponent.createFolder(name=name, parentId=parentId) + def listFilesInGroup(self, groupId: str, contextKey: str = "files/list") -> list: + """List file IDs in a specific group (recursive).""" + try: + from modules.routes.routeHelpers import _collectItemIds + existing = self.interfaceDbApp.getTableGrouping(contextKey) + if not existing: + return [] + nodes = [n.model_dump() if hasattr(n, "model_dump") else (n if isinstance(n, dict) else vars(n)) for n in existing.rootGroups] + ids = _collectItemIds(nodes, groupId) + return list(ids) if ids else [] + except Exception: + return [] # ---- DataSource CRUD ---- diff --git a/modules/serviceCenter/services/serviceKnowledge/mainServiceKnowledge.py b/modules/serviceCenter/services/serviceKnowledge/mainServiceKnowledge.py index dab8cc25..6698e164 100644 --- a/modules/serviceCenter/services/serviceKnowledge/mainServiceKnowledge.py +++ b/modules/serviceCenter/services/serviceKnowledge/mainServiceKnowledge.py @@ -2,9 +2,13 @@ # All rights reserved. """Knowledge service: 3-tier RAG with indexing, semantic search, and context building.""" +import hashlib +import json import logging import re -from typing import Any, Callable, Dict, List, Optional +import time +from dataclasses import dataclass, field +from typing import Any, Callable, Dict, List, Optional, Union from modules.datamodels.datamodelKnowledge import ( FileContentIndex, ContentChunk, WorkflowMemory, @@ -20,6 +24,68 @@ DEFAULT_CHUNK_TOKENS = 400 DEFAULT_CONTEXT_BUDGET = 12000 +# ============================================================================= +# Ingestion façade (P0 of unified-knowledge-indexing concept) +# ============================================================================= + +@dataclass +class IngestionJob: + """One request to add or refresh content in the unified knowledge store. + + Callers from any lane (routes, feature hooks, agent tools, connector sync) + describe the work they want done via this object; idempotency, scope + resolution, and embedding are handled by KnowledgeService.requestIngestion. + """ + sourceKind: str + sourceId: str + fileName: str + mimeType: str + userId: str + contentObjects: List[Dict[str, Any]] = field(default_factory=list) + featureInstanceId: str = "" + mandateId: str = "" + structure: Optional[Dict[str, Any]] = None + containerPath: Optional[str] = None + contentVersion: Optional[str] = None + provenance: Optional[Dict[str, Any]] = None + # Connector-driven neutralization: True when the user opted in via §2.6 preferences. + # For sourceKind == "file", _indexFileInternal resolves this from FileItem.neutralize instead. + neutralize: bool = False + + +@dataclass +class IngestionHandle: + """Result of requestIngestion. Stable across in-process and future queue impls.""" + jobId: str + status: str + contentHash: str + fileId: str + index: Optional[FileContentIndex] = None + error: Optional[str] = None + + +def _computeIngestionHash(contentObjects: List[Dict[str, Any]]) -> str: + """Deterministic SHA256 over (contentType, data) tuples in extractor order. + + `contentObjectId` is intentionally excluded because extractors generate + fresh UUIDs per run (`uuid.uuid4()`), which would make the hash unstable + across re-extractions of the same source — defeating idempotency. 
+ Order is preserved (no sort) because two different documents can share the + same multiset of parts but differ in arrangement (e.g. swapped pages). + Text whitespace is preserved intentionally because chunk boundaries + depend on it. + """ + normalized = [ + ( + str(o.get("contentType", "text") or "text"), + o.get("data", "") or "", + ) + for o in (contentObjects or []) + ] + payload = json.dumps(normalized, ensure_ascii=False, separators=(",", ":")) + return hashlib.sha256(payload.encode("utf-8")).hexdigest() + + class KnowledgeService: """Service for Knowledge Store operations: indexing, retrieval, and context building.""" @@ -46,6 +112,224 @@ class KnowledgeService: results = await self._embed([text]) return results[0] if results else [] + # ========================================================================= + # Ingestion façade (single entry point for all lanes) + # ========================================================================= + + async def requestIngestion(self, job: IngestionJob) -> IngestionHandle: + """Unified entry point for filling the knowledge corpus. + + Applies idempotency based on a content hash (or caller-supplied + `contentVersion`) persisted in `FileContentIndex.structure._ingestion`. + Re-runs indexing only when the hash differs or the previous run did + not reach `indexed` state. Runs embedding synchronously for now + (callers already schedule background tasks where needed). + """ + jobId = f"{job.sourceKind}:{job.sourceId}" + startMs = time.time() + contentHash = job.contentVersion or _computeIngestionHash(job.contentObjects) + + # 1. Check for duplicate via existing FileContentIndex row. + existing = None + try: + existing = self._knowledgeDb.getFileContentIndex(job.sourceId) + except Exception: + existing = None + + if existing: + existingStructure = ( + existing.get("structure") if isinstance(existing, dict) + else getattr(existing, "structure", {}) + ) or {} + existingMeta = existingStructure.get("_ingestion", {}) or {} + existingStatus = ( + existing.get("status") if isinstance(existing, dict) + else getattr(existing, "status", "") + ) or "" + if existingMeta.get("hash") == contentHash and existingStatus == "indexed": + logger.info( + "ingestion.skipped.duplicate sourceKind=%s sourceId=%s hash=%s", + job.sourceKind, job.sourceId, contentHash[:12], + extra={ + "event": "ingestion.skipped.duplicate", + "jobId": jobId, + "sourceKind": job.sourceKind, + "sourceId": job.sourceId, + "hash": contentHash, + "durationMs": int((time.time() - startMs) * 1000), + }, + ) + return IngestionHandle( + jobId=jobId, + status="duplicate", + contentHash=contentHash, + fileId=job.sourceId, + index=None, + ) + + # 2. Prepare ingestion metadata; stays in structure._ingestion so + # later connector revoke/purge can filter chunks by sourceKind / + # provenance.connectionId without a schema migration. + ingestionMeta = { + "hash": contentHash, + "sourceKind": job.sourceKind, + "sourceId": job.sourceId, + "contentVersion": job.contentVersion, + "indexedAt": getUtcTimestamp(), + "provenance": dict(job.provenance or {}), + } + structure = dict(job.structure or {}) + structure["_ingestion"] = ingestionMeta + + logger.info( + "ingestion.queued sourceKind=%s sourceId=%s objects=%d hash=%s", + job.sourceKind, job.sourceId, len(job.contentObjects or []), contentHash[:12], + extra={ + "event": "ingestion.queued", + "jobId": jobId, + "sourceKind": job.sourceKind, + "sourceId": job.sourceId, + "hash": contentHash, + "objectCount": len(job.contentObjects or []), + }, + ) + + # 3. 
Run real indexing. + try: + index = await self._indexFileInternal( + fileId=job.sourceId, + fileName=job.fileName, + mimeType=job.mimeType, + userId=job.userId, + featureInstanceId=job.featureInstanceId, + mandateId=job.mandateId, + contentObjects=job.contentObjects or [], + structure=structure, + containerPath=job.containerPath, + sourceKind=job.sourceKind, + connectionId=(job.provenance or {}).get("connectionId"), + neutralize=job.neutralize, + ) + except Exception as exc: + logger.error( + "ingestion.failed sourceKind=%s sourceId=%s error=%s", + job.sourceKind, job.sourceId, exc, + exc_info=True, + extra={ + "event": "ingestion.failed", + "jobId": jobId, + "sourceKind": job.sourceKind, + "sourceId": job.sourceId, + "hash": contentHash, + "error": str(exc), + "durationMs": int((time.time() - startMs) * 1000), + }, + ) + try: + self._knowledgeDb.updateFileStatus(job.sourceId, "failed") + except Exception: + pass + return IngestionHandle( + jobId=jobId, + status="failed", + contentHash=contentHash, + fileId=job.sourceId, + index=None, + error=str(exc), + ) + + logger.info( + "ingestion.indexed sourceKind=%s sourceId=%s objects=%d durationMs=%d", + job.sourceKind, job.sourceId, len(job.contentObjects or []), + int((time.time() - startMs) * 1000), + extra={ + "event": "ingestion.indexed", + "jobId": jobId, + "sourceKind": job.sourceKind, + "sourceId": job.sourceId, + "hash": contentHash, + "objectCount": len(job.contentObjects or []), + "durationMs": int((time.time() - startMs) * 1000), + }, + ) + return IngestionHandle( + jobId=jobId, + status="indexed", + contentHash=contentHash, + fileId=job.sourceId, + index=index, + ) + + def purgeConnection(self, connectionId: str) -> Dict[str, int]: + """Delete every FileContentIndex + ContentChunk linked to a UserConnection. + + Called on `connection.revoked` events so the knowledge corpus never + holds chunks the user has withdrawn access to. Returns deletion counts + for observability. 
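+
+        Illustrative result shape (counts invented for the example):
+
+            knowledgeService.purgeConnection("<connectionId>")
+            # -> {"indexRows": 3, "chunks": 42}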
+ """ + if not connectionId: + return {"indexRows": 0, "chunks": 0} + startMs = time.time() + result = self._knowledgeDb.deleteFileContentIndexByConnectionId(connectionId) + logger.info( + "ingestion.connection.purged connectionId=%s rows=%d chunks=%d durationMs=%d", + connectionId, result["indexRows"], result["chunks"], + int((time.time() - startMs) * 1000), + extra={ + "event": "ingestion.connection.purged", + "connectionId": connectionId, + "indexRows": result["indexRows"], + "chunks": result["chunks"], + "durationMs": int((time.time() - startMs) * 1000), + }, + ) + return result + + def getIngestionStatus( + self, handleOrJobId: Union[IngestionHandle, str] + ) -> Dict[str, Any]: + """Map a handle or `sourceKind:sourceId` jobId to a status snapshot.""" + if isinstance(handleOrJobId, IngestionHandle): + sourceId = handleOrJobId.fileId + jobId = handleOrJobId.jobId + elif isinstance(handleOrJobId, str) and ":" in handleOrJobId: + jobId = handleOrJobId + sourceId = handleOrJobId.split(":", 1)[1] + else: + jobId = str(handleOrJobId) + sourceId = str(handleOrJobId) + + row = None + try: + row = self._knowledgeDb.getFileContentIndex(sourceId) + except Exception: + row = None + if not row: + return { + "jobId": jobId, + "sourceId": sourceId, + "status": "unknown", + "contentHash": None, + } + + structure = ( + row.get("structure") if isinstance(row, dict) + else getattr(row, "structure", {}) + ) or {} + meta = structure.get("_ingestion", {}) or {} + status = ( + row.get("status") if isinstance(row, dict) + else getattr(row, "status", "") + ) or "unknown" + return { + "jobId": jobId, + "sourceId": sourceId, + "status": status, + "contentHash": meta.get("hash"), + "sourceKind": meta.get("sourceKind"), + "indexedAt": meta.get("indexedAt"), + } + # ========================================================================= # File Indexing (called after extraction, before embedding) # ========================================================================= @@ -61,6 +345,57 @@ class KnowledgeService: contentObjects: List[Dict[str, Any]] = None, structure: Dict[str, Any] = None, containerPath: str = None, + ) -> Optional[FileContentIndex]: + """Backward-compatible wrapper delegating to requestIngestion. + + Existing callers that still invoke `indexFile` directly automatically + participate in the idempotency/metrics layer. New callers should + prefer `requestIngestion` so they can pass `sourceKind` and + `provenance` for connector revoke/purge later. 
+ """ + job = IngestionJob( + sourceKind="file", + sourceId=fileId, + fileName=fileName, + mimeType=mimeType, + userId=userId, + featureInstanceId=featureInstanceId, + mandateId=mandateId, + contentObjects=list(contentObjects or []), + structure=structure, + containerPath=containerPath, + ) + handle = await self.requestIngestion(job) + if handle.index is not None: + return handle.index + if handle.status == "duplicate": + row = None + try: + row = self._knowledgeDb.getFileContentIndex(fileId) + except Exception: + row = None + if isinstance(row, dict): + try: + return FileContentIndex(**row) + except Exception: + return None + return row + return None + + async def _indexFileInternal( + self, + fileId: str, + fileName: str, + mimeType: str, + userId: str, + featureInstanceId: str = "", + mandateId: str = "", + contentObjects: List[Dict[str, Any]] = None, + structure: Dict[str, Any] = None, + containerPath: str = None, + sourceKind: str = "file", + connectionId: Optional[str] = None, + neutralize: bool = False, ) -> FileContentIndex: """Index a file's content objects and create embeddings for text chunks. @@ -83,39 +418,41 @@ class KnowledgeService: """ contentObjects = contentObjects or [] - # 1. Resolve scope fields from FileItem (Single Source of Truth) - # FileItem lives in poweron_management; its scope/mandateId/featureInstanceId - # are authoritative and must be mirrored onto the FileContentIndex. + # 1. Resolve scope fields from FileItem (Single Source of Truth) for + # uploaded files. Connector-sourced ingestion (sharepoint_item, + # outlook_message, ...) has no FileItem row — trust the caller's + # scope + ids directly. resolvedScope = "personal" resolvedMandateId = mandateId resolvedFeatureInstanceId = featureInstanceId resolvedUserId = userId - _shouldNeutralize = False - try: - from modules.datamodels.datamodelFiles import FileItem as _FileItem - _dbComponent = getattr(self._context, "interfaceDbComponent", None) - _fileRecords = _dbComponent.getRecordset(_FileItem, recordFilter={"id": fileId}) if _dbComponent else [] - if not _fileRecords: - from modules.interfaces.interfaceDbManagement import ComponentObjects - _row = ComponentObjects().db._loadRecord(_FileItem, fileId) - if _row: - _fileRecords = [_row] - if _fileRecords: - _fileRecord = _fileRecords[0] - _get = (lambda k, d=None: _fileRecord.get(k, d)) if isinstance(_fileRecord, dict) else (lambda k, d=None: getattr(_fileRecord, k, d)) - _shouldNeutralize = bool(_get("neutralize", False)) - _fileScope = _get("scope") - if _fileScope: - resolvedScope = _fileScope - if not resolvedMandateId: - resolvedMandateId = str(_get("mandateId", "") or "") - if not resolvedFeatureInstanceId: - resolvedFeatureInstanceId = str(_get("featureInstanceId", "") or "") - _fileCreatedBy = _get("sysCreatedBy") - if _fileCreatedBy: - resolvedUserId = str(_fileCreatedBy) - except Exception: - pass + _shouldNeutralize = neutralize # caller-supplied flag (connector prefs / IngestionJob) + if sourceKind == "file": + try: + from modules.datamodels.datamodelFiles import FileItem as _FileItem + _dbComponent = getattr(self._context, "interfaceDbComponent", None) + _fileRecords = _dbComponent.getRecordset(_FileItem, recordFilter={"id": fileId}) if _dbComponent else [] + if not _fileRecords: + from modules.interfaces.interfaceDbManagement import ComponentObjects + _row = ComponentObjects().db._loadRecord(_FileItem, fileId) + if _row: + _fileRecords = [_row] + if _fileRecords: + _fileRecord = _fileRecords[0] + _get = (lambda k, d=None: _fileRecord.get(k, d)) 
if isinstance(_fileRecord, dict) else (lambda k, d=None: getattr(_fileRecord, k, d)) + _shouldNeutralize = bool(_get("neutralize", False)) # FileItem is authoritative for uploads + _fileScope = _get("scope") + if _fileScope: + resolvedScope = _fileScope + if not resolvedMandateId: + resolvedMandateId = str(_get("mandateId", "") or "") + if not resolvedFeatureInstanceId: + resolvedFeatureInstanceId = str(_get("featureInstanceId", "") or "") + _fileCreatedBy = _get("sysCreatedBy") + if _fileCreatedBy: + resolvedUserId = str(_fileCreatedBy) + except Exception: + pass # 2. Create FileContentIndex with correct scope from the start index = FileContentIndex( @@ -124,6 +461,8 @@ class KnowledgeService: featureInstanceId=resolvedFeatureInstanceId, mandateId=resolvedMandateId, scope=resolvedScope, + sourceKind=sourceKind, + connectionId=connectionId, fileName=fileName, mimeType=mimeType, containerPath=containerPath, @@ -300,7 +639,12 @@ class KnowledgeService: Formatted context string for injection into the agent's system prompt. """ queryVector = await self._embedSingle(currentPrompt) + logger.debug( + "buildAgentContext.start userId=%s featureInstanceId=%s mandateId=%s isSysAdmin=%s prompt=%r", + userId, featureInstanceId, mandateId, isSysAdmin, (currentPrompt or "")[:120], + ) if not queryVector: + logger.debug("buildAgentContext.abort reason=no_query_vector") return "" builder = _ContextBuilder(budget=contextBudget) @@ -327,9 +671,14 @@ class KnowledgeService: featureInstanceId=featureInstanceId, mandateId=mandateId, limit=15, - minScore=0.65, + minScore=0.35, isSysAdmin=isSysAdmin, ) + logger.debug( + "buildAgentContext.layer1 instanceChunks=%d top_scores=%s", + len(instanceChunks), + [round(float(c.get("_score", 0) or 0), 3) for c in (instanceChunks or [])[:3]], + ) if instanceChunks: builder.add(priority=1, label="Relevant Documents", items=instanceChunks, maxChars=4000) @@ -338,7 +687,7 @@ class KnowledgeService: queryVector=queryVector, workflowId=workflowId, limit=10, - minScore=0.55, + minScore=0.35, ) if roundMemories: memItems = [] @@ -376,7 +725,7 @@ class KnowledgeService: scope="mandate", mandateId=mandateId, limit=10, - minScore=0.7, + minScore=0.35, isSysAdmin=isSysAdmin, ) if mandateChunks: @@ -392,7 +741,12 @@ class KnowledgeService: maxChars=500, ) - return builder.build() + _result = builder.build() + logger.debug( + "buildAgentContext.done totalChars=%d userId=%s", + len(_result), userId, + ) + return _result # ========================================================================= # Workflow Memory diff --git a/modules/serviceCenter/services/serviceKnowledge/subConnectorIngestConsumer.py b/modules/serviceCenter/services/serviceKnowledge/subConnectorIngestConsumer.py new file mode 100644 index 00000000..97ac61d5 --- /dev/null +++ b/modules/serviceCenter/services/serviceKnowledge/subConnectorIngestConsumer.py @@ -0,0 +1,334 @@ +# Copyright (c) 2025 Patrick Motsch +# All rights reserved. +"""Connection-lifecycle consumer bridging OAuth events to ingestion jobs. + +Subscribes to `connection.established` and `connection.revoked` callbacks +emitted by the OAuth callbacks / connection management routes and dispatches: + +- `connection.established` -> enqueue a `connection.bootstrap` BackgroundJob + that walks the connector and ingests all reachable items via + KnowledgeService.requestIngestion (file-like or virtual documents). 
+- `connection.revoked` -> run `KnowledgeService.purgeConnection` synchronously + so the knowledge corpus releases the data before the UI confirms the revoke. + +The consumer is registered once at process boot (see `app.py` lifespan). +It intentionally does NOT hold a per-user service context; each callback +creates whatever context it needs from the UserConnection row itself. +""" + +from __future__ import annotations + +import asyncio +import logging +from typing import Any, Dict, Optional + +from modules.interfaces.interfaceDbKnowledge import getInterface as getKnowledgeInterface +from modules.shared.callbackRegistry import callbackRegistry +from modules.serviceCenter.services.serviceBackgroundJobs import ( + registerJobHandler, + startJob, +) + +logger = logging.getLogger(__name__) + +BOOTSTRAP_JOB_TYPE = "connection.bootstrap" + +_registered = False + + +def _onConnectionEstablished( + *, + connectionId: str, + authority: str, + userId: Optional[str] = None, + **kwargs: Any, +) -> None: + """Fire-and-forget bootstrap enqueue for a freshly connected UserConnection.""" + if not connectionId: + logger.warning("connection.established without connectionId; ignoring") + return + payload: Dict[str, Any] = { + "connectionId": connectionId, + "authority": (authority or "").lower(), + "userId": userId, + } + logger.info( + "ingestion.connection.bootstrap.queued connectionId=%s authority=%s", + connectionId, authority, + extra={ + "event": "ingestion.connection.bootstrap.queued", + "connectionId": connectionId, + "authority": authority, + }, + ) + + async def _enqueue() -> None: + try: + await startJob( + BOOTSTRAP_JOB_TYPE, + payload, + triggeredBy=userId, + ) + except Exception as exc: + logger.error( + "ingestion.connection.bootstrap.enqueue_failed connectionId=%s error=%s", + connectionId, exc, exc_info=True, + ) + + try: + loop = asyncio.get_event_loop() + if loop.is_running(): + loop.create_task(_enqueue()) + else: + loop.run_until_complete(_enqueue()) + except RuntimeError: + asyncio.run(_enqueue()) + + +def _onConnectionRevoked( + *, + connectionId: str, + authority: Optional[str] = None, + userId: Optional[str] = None, + reason: Optional[str] = None, + **kwargs: Any, +) -> None: + """Run the knowledge purge synchronously so UI feedback is authoritative.""" + if not connectionId: + logger.warning("connection.revoked without connectionId; ignoring") + return + try: + # Purge lives on the DB interface to avoid ServiceCenter/user-context + # plumbing here; the service method is a thin wrapper on top of this. + result = getKnowledgeInterface(None).deleteFileContentIndexByConnectionId(connectionId) + except Exception as exc: + logger.error( + "ingestion.connection.purged.failed connectionId=%s error=%s", + connectionId, exc, exc_info=True, + ) + return + logger.info( + "ingestion.connection.purged connectionId=%s authority=%s reason=%s rows=%d chunks=%d", + connectionId, authority, reason, + result.get("indexRows", 0), result.get("chunks", 0), + extra={ + "event": "ingestion.connection.purged", + "connectionId": connectionId, + "authority": authority, + "reason": reason, + "indexRows": result.get("indexRows", 0), + "chunks": result.get("chunks", 0), + }, + ) + + +async def _bootstrapJobHandler( + job: Dict[str, Any], + progressCb, +) -> Dict[str, Any]: + """Dispatch bootstrap by authority. 
Each authority runs its own sub-bootstraps.""" + payload = job.get("payload") or {} + connectionId = payload.get("connectionId") + authority = (payload.get("authority") or "").lower() + if not connectionId: + raise ValueError("connection.bootstrap requires payload.connectionId") + + progressCb(5, f"resolving {authority} connection") + + # Defensive consent check: if the connection has since disabled knowledge ingestion + # (e.g. user toggled setting after the job was enqueued), skip all walkers. + try: + from modules.interfaces.interfaceDbApp import getRootInterface + _root = getRootInterface() + _conn = _root.getUserConnectionById(connectionId) + if _conn and not getattr(_conn, "knowledgeIngestionEnabled", True): + logger.info( + "ingestion.connection.bootstrap.skipped — consent disabled connectionId=%s", + connectionId, + extra={ + "event": "ingestion.connection.bootstrap.skipped", + "connectionId": connectionId, + "authority": authority, + "reason": "consent_disabled", + }, + ) + return {"connectionId": connectionId, "authority": authority, "skipped": True, "reason": "consent_disabled"} + except Exception as _guardErr: + logger.debug("Could not load connection for consent guard: %s", _guardErr) + + def _normalize(res: Any, label: str) -> Dict[str, Any]: + if isinstance(res, Exception): + logger.error( + "ingestion.connection.bootstrap.failed part=%s connectionId=%s error=%s", + label, connectionId, res, exc_info=res, + ) + return {"error": str(res)} + return res or {} + + if authority == "msft": + from modules.serviceCenter.services.serviceKnowledge.subConnectorSyncSharepoint import ( + bootstrapSharepoint, + ) + from modules.serviceCenter.services.serviceKnowledge.subConnectorSyncOutlook import ( + bootstrapOutlook, + ) + + progressCb(10, "sharepoint + outlook") + spResult, olResult = await asyncio.gather( + bootstrapSharepoint(connectionId=connectionId, progressCb=progressCb), + bootstrapOutlook(connectionId=connectionId, progressCb=progressCb), + return_exceptions=True, + ) + return { + "connectionId": connectionId, + "authority": authority, + "sharepoint": _normalize(spResult, "sharepoint"), + "outlook": _normalize(olResult, "outlook"), + } + + if authority == "google": + from modules.serviceCenter.services.serviceKnowledge.subConnectorSyncGdrive import ( + bootstrapGdrive, + ) + from modules.serviceCenter.services.serviceKnowledge.subConnectorSyncGmail import ( + bootstrapGmail, + ) + + progressCb(10, "drive + gmail") + gdResult, gmResult = await asyncio.gather( + bootstrapGdrive(connectionId=connectionId, progressCb=progressCb), + bootstrapGmail(connectionId=connectionId, progressCb=progressCb), + return_exceptions=True, + ) + return { + "connectionId": connectionId, + "authority": authority, + "drive": _normalize(gdResult, "gdrive"), + "gmail": _normalize(gmResult, "gmail"), + } + + if authority == "clickup": + from modules.serviceCenter.services.serviceKnowledge.subConnectorSyncClickup import ( + bootstrapClickup, + ) + + progressCb(10, "clickup tasks") + cuResult = await bootstrapClickup(connectionId=connectionId, progressCb=progressCb) + return { + "connectionId": connectionId, + "authority": authority, + "clickup": _normalize(cuResult, "clickup"), + } + + logger.info( + "ingestion.connection.bootstrap.skipped reason=unsupported_authority authority=%s connectionId=%s", + authority, connectionId, + extra={ + "event": "ingestion.connection.bootstrap.skipped", + "authority": authority, + "connectionId": connectionId, + "reason": "unsupported_authority", + }, + ) + return { + 
"connectionId": connectionId, + "authority": authority, + "skipped": True, + "reason": "unsupported_authority", + } + + +async def _scheduledDailyResync() -> None: + """Enqueue a connection.bootstrap job for every active knowledge connection. + + Runs once per day (default 2 AM Europe/Zurich). Each job re-walks the + connector and hands new / changed items to KnowledgeService.requestIngestion. + Unchanged items are deduplicated by content-hash and skipped automatically. + """ + try: + from modules.interfaces.interfaceDbApp import getRootInterface + rootInterface = getRootInterface() + connections = rootInterface.getActiveKnowledgeConnections() + except Exception as exc: + logger.error("knowledge.daily_resync: could not load connections: %s", exc, exc_info=True) + return + + if not connections: + logger.info("knowledge.daily_resync: no active knowledge connections — nothing to do") + return + + logger.info( + "knowledge.daily_resync: enqueuing bootstrap for %d connection(s)", + len(connections), + extra={"event": "knowledge.daily_resync.started", "count": len(connections)}, + ) + + enqueued = 0 + skipped = 0 + for conn in connections: + connectionId = str(conn.id) + authority = conn.authority.value if hasattr(conn.authority, "value") else str(conn.authority) + userId = str(conn.userId) + payload: Dict[str, Any] = { + "connectionId": connectionId, + "authority": authority.lower(), + "userId": userId, + } + try: + await startJob( + BOOTSTRAP_JOB_TYPE, + payload, + triggeredBy="scheduler.daily_resync", + ) + enqueued += 1 + logger.debug( + "knowledge.daily_resync: queued connectionId=%s authority=%s", + connectionId, authority, + ) + except Exception as exc: + skipped += 1 + logger.error( + "knowledge.daily_resync: failed to enqueue connectionId=%s: %s", + connectionId, exc, + ) + + logger.info( + "knowledge.daily_resync: done — enqueued=%d skipped=%d", + enqueued, skipped, + extra={"event": "knowledge.daily_resync.done", "enqueued": enqueued, "skipped": skipped}, + ) + + +def registerDailyResyncScheduler(*, hour: int = 2, minute: int = 0) -> None: + """Register the daily knowledge re-sync cron job. Idempotent. + + Args: + hour: Hour of day to run (0–23, default 2 → 2 AM Europe/Zurich). + minute: Minute within the hour (default 0). + """ + try: + from modules.shared.eventManagement import eventManager + eventManager.registerCron( + jobId="knowledge.daily_resync", + func=_scheduledDailyResync, + cronKwargs={"hour": str(hour), "minute": str(minute)}, + ) + logger.info( + "knowledge.daily_resync scheduler registered (daily %02d:%02d Europe/Zurich)", + hour, minute, + ) + except Exception as exc: + logger.warning("knowledge.daily_resync scheduler registration failed (non-critical): %s", exc) + + +def registerKnowledgeIngestionConsumer() -> None: + """Register callback subscribers + background job handler. 
Idempotent.""" + global _registered + if _registered: + return + callbackRegistry.register("connection.established", _onConnectionEstablished) + callbackRegistry.register("connection.revoked", _onConnectionRevoked) + registerJobHandler(BOOTSTRAP_JOB_TYPE, _bootstrapJobHandler) + registerDailyResyncScheduler() + _registered = True + logger.info("KnowledgeIngestionConsumer registered (established/revoked + %s handler + daily resync)", BOOTSTRAP_JOB_TYPE) diff --git a/modules/serviceCenter/services/serviceKnowledge/subConnectorPrefs.py b/modules/serviceCenter/services/serviceKnowledge/subConnectorPrefs.py new file mode 100644 index 00000000..950400ce --- /dev/null +++ b/modules/serviceCenter/services/serviceKnowledge/subConnectorPrefs.py @@ -0,0 +1,101 @@ +"""Per-connection knowledge ingestion preference helpers. + +Walkers call `loadConnectionPrefs(connectionId)` once at bootstrap start and +receive a `ConnectionIngestionPrefs` dataclass they can pass down into their +inner loops. All fields have safe defaults so walkers stay backward-compatible +with connections that predate the §2.6 preference schema (knowledgePreferences +is None). +""" +from __future__ import annotations + +import logging +from dataclasses import dataclass, field +from typing import Any, Dict, List, Optional + +logger = logging.getLogger(__name__) + +_DEFAULT_MAX_AGE_DAYS = 90 +_DEFAULT_MAIL_DEPTH = "full" +_DEFAULT_CLICKUP_SCOPE = "title_description" + + +@dataclass +class ConnectionIngestionPrefs: + """Parsed per-connection preferences for knowledge ingestion walkers.""" + + # PII + neutralizeBeforeEmbed: bool = False + + # Mail (Outlook + Gmail) + mailContentDepth: str = _DEFAULT_MAIL_DEPTH # "metadata" | "snippet" | "full" + mailIndexAttachments: bool = False + + # Files (Drive / SharePoint / OneDrive) + filesIndexBinaries: bool = True + mimeAllowlist: List[str] = field(default_factory=list) # empty = all allowed + + # ClickUp + clickupScope: str = _DEFAULT_CLICKUP_SCOPE # "titles" | "title_description" | "with_comments" + clickupIndexAttachments: bool = False + + # Per-authority surface toggles (default everything on) + gmailEnabled: bool = True + driveEnabled: bool = True + sharepointEnabled: bool = True + outlookEnabled: bool = True + + # Time window + maxAgeDays: int = _DEFAULT_MAX_AGE_DAYS # 0 = no limit + + +def loadConnectionPrefs(connectionId: str) -> ConnectionIngestionPrefs: + """Load and parse per-connection preferences from the database. + + Returns safe defaults for any missing or unparseable values so walkers + never fail due to missing preference data. 
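+
+ A minimal sketch of the stored ``knowledgePreferences`` dict this parser
+ understands (all keys optional; shape inferred from the field mapping below):
+
+ {
+ "neutralizeBeforeEmbed": True,
+ "mailContentDepth": "snippet",
+ "mimeAllowlist": ["application/pdf"],
+ "maxAgeDays": 30,
+ "surfaceToggles": {"google": {"gmail": False}, "msft": {"sharepoint": True}},
+ }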
+ """ + try: + from modules.interfaces.interfaceDbApp import getRootInterface + root = getRootInterface() + conn = root.getUserConnectionById(connectionId) + if not conn: + logger.debug("loadConnectionPrefs: connection %s not found, using defaults", connectionId) + return ConnectionIngestionPrefs() + + raw: Optional[Dict[str, Any]] = getattr(conn, "knowledgePreferences", None) + if not raw or not isinstance(raw, dict): + return ConnectionIngestionPrefs() + + def _bool(key: str, default: bool) -> bool: + v = raw.get(key) + return bool(v) if isinstance(v, bool) else default + + def _str(key: str, allowed: List[str], default: str) -> str: + v = raw.get(key) + return v if v in allowed else default + + def _int(key: str, default: int) -> int: + v = raw.get(key) + return int(v) if isinstance(v, int) else default + + surface = raw.get("surfaceToggles") or {} + google_surf = surface.get("google") or {} + msft_surf = surface.get("msft") or {} + + return ConnectionIngestionPrefs( + neutralizeBeforeEmbed=_bool("neutralizeBeforeEmbed", False), + mailContentDepth=_str("mailContentDepth", ["metadata", "snippet", "full"], _DEFAULT_MAIL_DEPTH), + mailIndexAttachments=_bool("mailIndexAttachments", False), + filesIndexBinaries=_bool("filesIndexBinaries", True), + mimeAllowlist=list(raw.get("mimeAllowlist") or []), + clickupScope=_str("clickupScope", ["titles", "title_description", "with_comments"], _DEFAULT_CLICKUP_SCOPE), + clickupIndexAttachments=_bool("clickupIndexAttachments", False), + gmailEnabled=bool(google_surf.get("gmail", True)), + driveEnabled=bool(google_surf.get("drive", True)), + sharepointEnabled=bool(msft_surf.get("sharepoint", True)), + outlookEnabled=bool(msft_surf.get("outlook", True)), + maxAgeDays=_int("maxAgeDays", _DEFAULT_MAX_AGE_DAYS), + ) + except Exception as exc: + logger.warning("loadConnectionPrefs failed for %s, using defaults: %s", connectionId, exc) + return ConnectionIngestionPrefs() diff --git a/modules/serviceCenter/services/serviceKnowledge/subConnectorSyncClickup.py b/modules/serviceCenter/services/serviceKnowledge/subConnectorSyncClickup.py new file mode 100644 index 00000000..31ac9687 --- /dev/null +++ b/modules/serviceCenter/services/serviceKnowledge/subConnectorSyncClickup.py @@ -0,0 +1,512 @@ +# Copyright (c) 2025 Patrick Motsch +# All rights reserved. +"""ClickUp bootstrap for the unified knowledge ingestion lane. + +ClickUp tasks are ingested as *virtual documents* — we never download file +bytes. Each task becomes a `sourceKind="clickup_task"` IngestionJob whose +`contentObjects` carry a summary header (name + status + metadata) and the +task description / text content so retrieval finds them without a live API +call. + +Hierarchy traversal: workspace (team) → spaces → folders / folderless lists → +tasks. We cap the fan-out with `maxWorkspaces` / `maxListsPerWorkspace` / +`maxTasks` and skip tasks older than `maxAgeDays` (default 180 d). + +Idempotency: `date_updated` from the ClickUp task payload is a millisecond +timestamp and strictly monotonic per revision — used as `contentVersion`. 
+""" + +from __future__ import annotations + +import hashlib +import logging +import time +from dataclasses import dataclass, field +from datetime import datetime, timedelta, timezone +from typing import Any, Callable, Dict, List, Optional + +logger = logging.getLogger(__name__) + +MAX_TASKS_DEFAULT = 500 +MAX_WORKSPACES_DEFAULT = 3 +MAX_LISTS_PER_WORKSPACE_DEFAULT = 20 +MAX_DESCRIPTION_CHARS_DEFAULT = 8000 +MAX_AGE_DAYS_DEFAULT = 180 + + +@dataclass +class ClickupBootstrapLimits: + maxTasks: int = MAX_TASKS_DEFAULT + maxWorkspaces: int = MAX_WORKSPACES_DEFAULT + maxListsPerWorkspace: int = MAX_LISTS_PER_WORKSPACE_DEFAULT + maxDescriptionChars: int = MAX_DESCRIPTION_CHARS_DEFAULT + # Only ingest tasks updated within the last N days. None disables filter. + maxAgeDays: Optional[int] = MAX_AGE_DAYS_DEFAULT + # Include closed/archived tasks if they still meet the recency filter. + # ClickUp `closed` tasks often carry the most useful RAG context + # ("why was this shipped the way it was?"). + includeClosed: bool = True + # Pass-through to IngestionJob.neutralize + neutralize: bool = False + # Content scope: "titles" | "title_description" | "with_comments" + clickupScope: str = "title_description" + + +@dataclass +class ClickupBootstrapResult: + connectionId: str + indexed: int = 0 + skippedDuplicate: int = 0 + skippedPolicy: int = 0 + failed: int = 0 + workspaces: int = 0 + lists: int = 0 + errors: List[str] = field(default_factory=list) + + +def _syntheticTaskId(connectionId: str, taskId: str) -> str: + token = hashlib.sha256(f"{connectionId}:{taskId}".encode("utf-8")).hexdigest()[:16] + return f"cu:{connectionId[:8]}:{token}" + + +def _truncate(value: Any, limit: int) -> str: + text = str(value or "").strip() + if not text: + return "" + if len(text) <= limit: + return text + return text[:limit].rstrip() + "\n[truncated]" + + +def _isRecent(dateUpdatedMs: Any, maxAgeDays: Optional[int]) -> bool: + if not maxAgeDays: + return True + if not dateUpdatedMs: + return True + try: + ts = datetime.fromtimestamp(int(dateUpdatedMs) / 1000.0, tz=timezone.utc) + except Exception: + return True + cutoff = datetime.now(timezone.utc) - timedelta(days=maxAgeDays) + return ts >= cutoff + + +def _buildContentObjects(task: Dict[str, Any], limits: ClickupBootstrapLimits) -> List[Dict[str, Any]]: + """Header (name/status/metadata) + optional description + text_content. 
+ + `limits.clickupScope` controls how much is embedded: + - "titles": task name + status metadata only + - "title_description": header + description / text_content (default) + - "with_comments": header + description + text_content + (comments themselves are not yet fetched in v1) + """ + name = task.get("name") or f"Task {task.get('id', '')}" + status = ((task.get("status") or {}).get("status")) or "" + assignees = ", ".join( + filter(None, [ + (a.get("username") or a.get("email") or "") + for a in (task.get("assignees") or []) + ]) + ) + tags = ", ".join(filter(None, [t.get("name", "") for t in (task.get("tags") or [])])) + listInfo = task.get("list") or {} + folderInfo = task.get("folder") or {} + spaceInfo = task.get("space") or {} + dueMs = task.get("due_date") + dueIso = "" + if dueMs: + try: + dueIso = datetime.fromtimestamp(int(dueMs) / 1000.0, tz=timezone.utc).strftime("%Y-%m-%d") + except Exception: + dueIso = "" + + headerLines = [ + f"Task: {name}", + f"Status: {status}" if status else "", + f"List: {listInfo.get('name', '')}" if listInfo else "", + f"Folder: {folderInfo.get('name', '')}" if folderInfo else "", + f"Space: {spaceInfo.get('name', '')}" if spaceInfo else "", + f"Assignees: {assignees}" if assignees else "", + f"Tags: {tags}" if tags else "", + f"Due: {dueIso}" if dueIso else "", + f"Url: {task.get('url', '')}" if task.get("url") else "", + ] + header = "\n".join(line for line in headerLines if line) + + parts: List[Dict[str, Any]] = [{ + "contentObjectId": "header", + "contentType": "text", + "data": header, + "contextRef": {"part": "header"}, + }] + + scope = getattr(limits, "clickupScope", "title_description") + if scope in ("title_description", "with_comments"): + description = _truncate(task.get("description"), limits.maxDescriptionChars) + if description: + parts.append({ + "contentObjectId": "description", + "contentType": "text", + "data": description, + "contextRef": {"part": "description"}, + }) + # text_content is ClickUp's rendered-markdown version; include if it adds + # something beyond the plain description (common for bullet lists, checklists). 
+ textContent = _truncate(task.get("text_content"), limits.maxDescriptionChars) + if textContent and textContent != description: + parts.append({ + "contentObjectId": "text_content", + "contentType": "text", + "data": textContent, + "contextRef": {"part": "text_content"}, + }) + return parts + + +async def bootstrapClickup( + connectionId: str, + *, + progressCb: Optional[Callable[[int, Optional[str]], None]] = None, + adapter: Any = None, + connection: Any = None, + knowledgeService: Any = None, + limits: Optional[ClickupBootstrapLimits] = None, +) -> Dict[str, Any]: + """Walk workspaces → lists → tasks and ingest each task as a virtual doc.""" + from modules.serviceCenter.services.serviceKnowledge.subConnectorPrefs import loadConnectionPrefs + prefs = loadConnectionPrefs(connectionId) + + if not limits: + limits = ClickupBootstrapLimits( + maxAgeDays=prefs.maxAgeDays if prefs.maxAgeDays > 0 else None, + neutralize=prefs.neutralizeBeforeEmbed, + clickupScope=prefs.clickupScope, + ) + + startMs = time.time() + result = ClickupBootstrapResult(connectionId=connectionId) + + logger.info( + "ingestion.connection.bootstrap.started part=clickup connectionId=%s", + connectionId, + extra={ + "event": "ingestion.connection.bootstrap.started", + "part": "clickup", + "connectionId": connectionId, + }, + ) + + if adapter is None or knowledgeService is None or connection is None: + adapter, connection, knowledgeService = await _resolveDependencies(connectionId) + + mandateId = str(getattr(connection, "mandateId", "") or "") if connection is not None else "" + userId = str(getattr(connection, "userId", "") or "") if connection is not None else "" + + svc = getattr(adapter, "_svc", None) + if svc is None: + result.errors.append("adapter missing _svc instance") + return _finalizeResult(connectionId, result, startMs) + + try: + teamsResp = await svc.getAuthorizedTeams() + except Exception as exc: + logger.error("clickup team discovery failed for %s: %s", connectionId, exc, exc_info=True) + result.errors.append(f"teams: {exc}") + return _finalizeResult(connectionId, result, startMs) + + teams = (teamsResp or {}).get("teams") or [] + for team in teams[: limits.maxWorkspaces]: + if result.indexed + result.skippedDuplicate >= limits.maxTasks: + break + teamId = str(team.get("id", "") or "") + if not teamId: + continue + result.workspaces += 1 + try: + await _walkTeam( + svc=svc, + knowledgeService=knowledgeService, + connectionId=connectionId, + mandateId=mandateId, + userId=userId, + team=team, + limits=limits, + result=result, + progressCb=progressCb, + ) + except Exception as exc: + logger.error("clickup team %s walk failed: %s", teamId, exc, exc_info=True) + result.errors.append(f"team({teamId}): {exc}") + + return _finalizeResult(connectionId, result, startMs) + + +async def _resolveDependencies(connectionId: str): + from modules.interfaces.interfaceDbApp import getRootInterface + from modules.auth import TokenManager + from modules.connectors.providerClickup.connectorClickup import ClickupConnector + from modules.serviceCenter import getService + from modules.serviceCenter.context import ServiceCenterContext + from modules.security.rootAccess import getRootUser + + rootInterface = getRootInterface() + connection = rootInterface.getUserConnectionById(connectionId) + if connection is None: + raise ValueError(f"UserConnection not found: {connectionId}") + + token = TokenManager().getFreshToken(connectionId) + if not token or not token.tokenAccess: + raise ValueError(f"No valid token for connection 
{connectionId}") + + provider = ClickupConnector(connection, token.tokenAccess) + adapter = provider.getServiceAdapter("clickup") + + rootUser = getRootUser() + ctx = ServiceCenterContext( + user=rootUser, + mandate_id=str(getattr(connection, "mandateId", "") or ""), + ) + knowledgeService = getService("knowledge", ctx) + return adapter, connection, knowledgeService + + +async def _walkTeam( + *, + svc, + knowledgeService, + connectionId: str, + mandateId: str, + userId: str, + team: Dict[str, Any], + limits: ClickupBootstrapLimits, + result: ClickupBootstrapResult, + progressCb: Optional[Callable[[int, Optional[str]], None]], +) -> None: + teamId = str(team.get("id", "") or "") + spacesResp = await svc.getSpaces(teamId) + spaces = (spacesResp or {}).get("spaces") or [] + + listsCollected: List[Dict[str, Any]] = [] + for space in spaces: + if len(listsCollected) >= limits.maxListsPerWorkspace: + break + spaceId = str(space.get("id", "") or "") + if not spaceId: + continue + + # Folderless lists directly under the space + folderless = await svc.getFolderlessLists(spaceId) + for lst in (folderless or {}).get("lists") or []: + if len(listsCollected) >= limits.maxListsPerWorkspace: + break + listsCollected.append({**lst, "_space": space}) + + # Lists inside folders + foldersResp = await svc.getFolders(spaceId) + for folder in (foldersResp or {}).get("folders") or []: + if len(listsCollected) >= limits.maxListsPerWorkspace: + break + folderId = str(folder.get("id", "") or "") + if not folderId: + continue + folderLists = await svc.getListsInFolder(folderId) + for lst in (folderLists or {}).get("lists") or []: + if len(listsCollected) >= limits.maxListsPerWorkspace: + break + listsCollected.append({**lst, "_space": space, "_folder": folder}) + + for lst in listsCollected: + if result.indexed + result.skippedDuplicate >= limits.maxTasks: + return + result.lists += 1 + await _walkList( + svc=svc, + knowledgeService=knowledgeService, + connectionId=connectionId, + mandateId=mandateId, + userId=userId, + teamId=teamId, + lst=lst, + limits=limits, + result=result, + progressCb=progressCb, + ) + + +async def _walkList( + *, + svc, + knowledgeService, + connectionId: str, + mandateId: str, + userId: str, + teamId: str, + lst: Dict[str, Any], + limits: ClickupBootstrapLimits, + result: ClickupBootstrapResult, + progressCb: Optional[Callable[[int, Optional[str]], None]], +) -> None: + listId = str(lst.get("id", "") or "") + if not listId: + return + page = 0 + while result.indexed + result.skippedDuplicate < limits.maxTasks: + resp = await svc.getTasksInList( + listId, + page=page, + include_closed=limits.includeClosed, + subtasks=True, + ) + if isinstance(resp, dict) and resp.get("error"): + logger.warning("clickup tasks list=%s page=%d error: %s", listId, page, resp.get("error")) + result.errors.append(f"list({listId}): {resp.get('error')}") + return + tasks = (resp or {}).get("tasks") or [] + if not tasks: + return + + for task in tasks: + if result.indexed + result.skippedDuplicate >= limits.maxTasks: + return + if not _isRecent(task.get("date_updated"), limits.maxAgeDays): + result.skippedPolicy += 1 + continue + # Inject the list/folder/space metadata we already loaded. 
+ task["list"] = task.get("list") or {"id": listId, "name": lst.get("name")} + task["folder"] = task.get("folder") or lst.get("_folder") or {} + task["space"] = task.get("space") or lst.get("_space") or {} + await _ingestTask( + knowledgeService=knowledgeService, + connectionId=connectionId, + mandateId=mandateId, + userId=userId, + teamId=teamId, + task=task, + limits=limits, + result=result, + progressCb=progressCb, + ) + + if len(tasks) < 100: # ClickUp page-size hint: fewer than 100 => last page + return + page += 1 + + +async def _ingestTask( + *, + knowledgeService, + connectionId: str, + mandateId: str, + userId: str, + teamId: str, + task: Dict[str, Any], + limits: ClickupBootstrapLimits, + result: ClickupBootstrapResult, + progressCb: Optional[Callable[[int, Optional[str]], None]], +) -> None: + from modules.serviceCenter.services.serviceKnowledge.mainServiceKnowledge import IngestionJob + + taskId = str(task.get("id", "") or "") + if not taskId: + result.skippedPolicy += 1 + return + revision = str(task.get("date_updated") or task.get("date_created") or "") + name = task.get("name") or f"Task {taskId}" + syntheticId = _syntheticTaskId(connectionId, taskId) + fileName = f"{name[:80].strip() or taskId}.task.json" + + contentObjects = _buildContentObjects(task, limits) + + try: + handle = await knowledgeService.requestIngestion( + IngestionJob( + sourceKind="clickup_task", + sourceId=syntheticId, + fileName=fileName, + mimeType="application/vnd.clickup.task+json", + userId=userId, + mandateId=mandateId, + contentObjects=contentObjects, + contentVersion=revision or None, + neutralize=limits.neutralize, + provenance={ + "connectionId": connectionId, + "authority": "clickup", + "service": "clickup", + "externalItemId": taskId, + "teamId": teamId, + "listId": ((task.get("list") or {}).get("id")), + "spaceId": ((task.get("space") or {}).get("id")), + "url": task.get("url"), + "status": ((task.get("status") or {}).get("status")), + "tier": limits.clickupScope, + }, + ) + ) + except Exception as exc: + logger.error("clickup ingestion %s failed: %s", taskId, exc, exc_info=True) + result.failed += 1 + result.errors.append(f"ingest({taskId}): {exc}") + return + + if handle.status == "duplicate": + result.skippedDuplicate += 1 + elif handle.status == "indexed": + result.indexed += 1 + else: + result.failed += 1 + + if progressCb is not None and (result.indexed + result.skippedDuplicate) % 50 == 0: + processed = result.indexed + result.skippedDuplicate + try: + progressCb( + min(90, 10 + int(80 * processed / max(1, limits.maxTasks))), + f"clickup processed={processed}", + ) + except Exception: + pass + logger.info( + "ingestion.connection.bootstrap.progress part=clickup processed=%d skippedDup=%d failed=%d", + processed, result.skippedDuplicate, result.failed, + extra={ + "event": "ingestion.connection.bootstrap.progress", + "part": "clickup", + "connectionId": connectionId, + "processed": processed, + "skippedDup": result.skippedDuplicate, + "failed": result.failed, + }, + ) + + +def _finalizeResult(connectionId: str, result: ClickupBootstrapResult, startMs: float) -> Dict[str, Any]: + durationMs = int((time.time() - startMs) * 1000) + logger.info( + "ingestion.connection.bootstrap.done part=clickup connectionId=%s indexed=%d skippedDup=%d skippedPolicy=%d failed=%d workspaces=%d lists=%d durationMs=%d", + connectionId, + result.indexed, result.skippedDuplicate, result.skippedPolicy, + result.failed, result.workspaces, result.lists, durationMs, + extra={ + "event": 
"ingestion.connection.bootstrap.done", + "part": "clickup", + "connectionId": connectionId, + "indexed": result.indexed, + "skippedDup": result.skippedDuplicate, + "skippedPolicy": result.skippedPolicy, + "failed": result.failed, + "workspaces": result.workspaces, + "lists": result.lists, + "durationMs": durationMs, + }, + ) + return { + "connectionId": result.connectionId, + "indexed": result.indexed, + "skippedDuplicate": result.skippedDuplicate, + "skippedPolicy": result.skippedPolicy, + "failed": result.failed, + "workspaces": result.workspaces, + "lists": result.lists, + "durationMs": durationMs, + "errors": result.errors[:20], + } diff --git a/modules/serviceCenter/services/serviceKnowledge/subConnectorSyncGdrive.py b/modules/serviceCenter/services/serviceKnowledge/subConnectorSyncGdrive.py new file mode 100644 index 00000000..5e4e659b --- /dev/null +++ b/modules/serviceCenter/services/serviceKnowledge/subConnectorSyncGdrive.py @@ -0,0 +1,443 @@ +# Copyright (c) 2025 Patrick Motsch +# All rights reserved. +"""Google Drive bootstrap for the unified knowledge ingestion lane. + +Mirrors the SharePoint pilot (see subConnectorSyncSharepoint.py). Walks the +user's *My Drive* tree from the virtual `root` folder, downloads each +file-like item via `DriveAdapter.download` (which handles native Google docs +via export), runs the standard extraction pipeline and routes results through +`KnowledgeService.requestIngestion` with `sourceKind="gdrive_item"` and +`contentVersion = modifiedTime` (monotonic per-revision). +""" + +from __future__ import annotations + +import hashlib +import logging +import time +from dataclasses import dataclass, field +from datetime import datetime, timedelta, timezone +from typing import Any, Callable, Dict, List, Optional + +from modules.datamodels.datamodelExtraction import ExtractionOptions + +logger = logging.getLogger(__name__) + +MAX_ITEMS_DEFAULT = 500 +MAX_BYTES_DEFAULT = 200 * 1024 * 1024 +MAX_FILE_SIZE_DEFAULT = 25 * 1024 * 1024 +SKIP_MIME_PREFIXES_DEFAULT = ("video/", "audio/") +MAX_DEPTH_DEFAULT = 4 +MAX_AGE_DAYS_DEFAULT = 365 + +# Google Drive uses virtual mime-types for folders and non-downloadable assets. +FOLDER_MIME = "application/vnd.google-apps.folder" + + +@dataclass +class GdriveBootstrapLimits: + maxItems: int = MAX_ITEMS_DEFAULT + maxBytes: int = MAX_BYTES_DEFAULT + maxFileSize: int = MAX_FILE_SIZE_DEFAULT + skipMimePrefixes: tuple = SKIP_MIME_PREFIXES_DEFAULT + maxDepth: int = MAX_DEPTH_DEFAULT + # Only ingest files modified within the last N days. None disables filter. 
+ maxAgeDays: Optional[int] = MAX_AGE_DAYS_DEFAULT + # Pass-through to IngestionJob.neutralize + neutralize: bool = False + # Whether to skip binary/non-text files + filesIndexBinaries: bool = True + + +@dataclass +class GdriveBootstrapResult: + connectionId: str + indexed: int = 0 + skippedDuplicate: int = 0 + skippedPolicy: int = 0 + failed: int = 0 + bytesProcessed: int = 0 + errors: List[str] = field(default_factory=list) + + +def _syntheticFileId(connectionId: str, externalItemId: str) -> str: + token = hashlib.sha256(f"{connectionId}:{externalItemId}".encode("utf-8")).hexdigest()[:16] + return f"gd:{connectionId[:8]}:{token}" + + +def _toContentObjects(extracted, fileName: str) -> List[Dict[str, Any]]: + parts = getattr(extracted, "parts", None) or [] + out: List[Dict[str, Any]] = [] + for part in parts: + data = getattr(part, "data", None) or "" + if not data or not str(data).strip(): + continue + typeGroup = getattr(part, "typeGroup", "text") or "text" + contentType = "text" + if typeGroup == "image": + contentType = "image" + elif typeGroup in ("binary", "container"): + contentType = "other" + out.append({ + "contentObjectId": getattr(part, "id", ""), + "contentType": contentType, + "data": data, + "contextRef": { + "containerPath": fileName, + "location": getattr(part, "label", None) or "file", + **(getattr(part, "metadata", None) or {}), + }, + }) + return out + + +def _isRecent(modifiedIso: Optional[str], maxAgeDays: Optional[int]) -> bool: + if not maxAgeDays: + return True + if not modifiedIso: + # No timestamp -> be permissive (Drive native docs sometimes omit it on export). + return True + try: + # Google returns RFC 3339 with `Z` or offset; python 3.11+ parses both. + ts = datetime.fromisoformat(modifiedIso.replace("Z", "+00:00")) + except Exception: + return True + cutoff = datetime.now(timezone.utc) - timedelta(days=maxAgeDays) + if ts.tzinfo is None: + ts = ts.replace(tzinfo=timezone.utc) + return ts >= cutoff + + +async def bootstrapGdrive( + connectionId: str, + *, + progressCb: Optional[Callable[[int, Optional[str]], None]] = None, + adapter: Any = None, + connection: Any = None, + knowledgeService: Any = None, + limits: Optional[GdriveBootstrapLimits] = None, + runExtractionFn: Optional[Callable[..., Any]] = None, +) -> Dict[str, Any]: + """Walk My Drive starting from the virtual root folder.""" + from modules.serviceCenter.services.serviceKnowledge.subConnectorPrefs import loadConnectionPrefs + prefs = loadConnectionPrefs(connectionId) + + if not limits: + limits = GdriveBootstrapLimits( + maxAgeDays=prefs.maxAgeDays if prefs.maxAgeDays > 0 else None, + neutralize=prefs.neutralizeBeforeEmbed, + filesIndexBinaries=prefs.filesIndexBinaries, + ) + + startMs = time.time() + result = GdriveBootstrapResult(connectionId=connectionId) + + logger.info( + "ingestion.connection.bootstrap.started part=gdrive connectionId=%s", + connectionId, + extra={ + "event": "ingestion.connection.bootstrap.started", + "part": "gdrive", + "connectionId": connectionId, + }, + ) + + if adapter is None or knowledgeService is None or connection is None: + adapter, connection, knowledgeService = await _resolveDependencies(connectionId) + if runExtractionFn is None: + from modules.serviceCenter.services.serviceExtraction.subPipeline import runExtraction + from modules.serviceCenter.services.serviceExtraction.subRegistry import ( + ExtractorRegistry, ChunkerRegistry, + ) + extractorRegistry = ExtractorRegistry() + chunkerRegistry = ChunkerRegistry() + + def runExtractionFn(bytesData, name, mime, 
options): # type: ignore[no-redef] + return runExtraction(extractorRegistry, chunkerRegistry, bytesData, name, mime, options) + + mandateId = str(getattr(connection, "mandateId", "") or "") if connection is not None else "" + userId = str(getattr(connection, "userId", "") or "") if connection is not None else "" + + try: + await _walkFolder( + adapter=adapter, + knowledgeService=knowledgeService, + runExtractionFn=runExtractionFn, + connectionId=connectionId, + mandateId=mandateId, + userId=userId, + folderPath="/", # DriveAdapter.browse maps "" / "/" -> "root" + depth=0, + limits=limits, + result=result, + progressCb=progressCb, + ) + except Exception as exc: + logger.error("gdrive walk failed for %s: %s", connectionId, exc, exc_info=True) + result.errors.append(f"walk: {exc}") + + return _finalizeResult(connectionId, result, startMs) + + +async def _resolveDependencies(connectionId: str): + from modules.interfaces.interfaceDbApp import getRootInterface + from modules.auth import TokenManager + from modules.connectors.providerGoogle.connectorGoogle import GoogleConnector + from modules.serviceCenter import getService + from modules.serviceCenter.context import ServiceCenterContext + from modules.security.rootAccess import getRootUser + + rootInterface = getRootInterface() + connection = rootInterface.getUserConnectionById(connectionId) + if connection is None: + raise ValueError(f"UserConnection not found: {connectionId}") + + token = TokenManager().getFreshToken(connectionId) + if not token or not token.tokenAccess: + raise ValueError(f"No valid token for connection {connectionId}") + + provider = GoogleConnector(connection, token.tokenAccess) + adapter = provider.getServiceAdapter("drive") + + rootUser = getRootUser() + ctx = ServiceCenterContext( + user=rootUser, + mandate_id=str(getattr(connection, "mandateId", "") or ""), + ) + knowledgeService = getService("knowledge", ctx) + return adapter, connection, knowledgeService + + +async def _walkFolder( + *, + adapter, + knowledgeService, + runExtractionFn, + connectionId: str, + mandateId: str, + userId: str, + folderPath: str, + depth: int, + limits: GdriveBootstrapLimits, + result: GdriveBootstrapResult, + progressCb: Optional[Callable[[int, Optional[str]], None]], +) -> None: + if depth > limits.maxDepth: + return + try: + entries = await adapter.browse(folderPath) + except Exception as exc: + logger.warning("gdrive browse %s failed: %s", folderPath, exc) + result.errors.append(f"browse({folderPath}): {exc}") + return + + for entry in entries: + if result.indexed + result.skippedDuplicate >= limits.maxItems: + return + if result.bytesProcessed >= limits.maxBytes: + return + + entryPath = getattr(entry, "path", "") or "" + metadata = getattr(entry, "metadata", {}) or {} + mimeType = getattr(entry, "mimeType", None) or metadata.get("mimeType") + + if getattr(entry, "isFolder", False) or mimeType == FOLDER_MIME: + await _walkFolder( + adapter=adapter, + knowledgeService=knowledgeService, + runExtractionFn=runExtractionFn, + connectionId=connectionId, + mandateId=mandateId, + userId=userId, + folderPath=entryPath, + depth=depth + 1, + limits=limits, + result=result, + progressCb=progressCb, + ) + continue + + effectiveMime = mimeType or "application/octet-stream" + if any(effectiveMime.startswith(prefix) for prefix in limits.skipMimePrefixes): + result.skippedPolicy += 1 + continue + size = int(getattr(entry, "size", 0) or 0) + if size and size > limits.maxFileSize: + result.skippedPolicy += 1 + continue + modifiedTime = 
metadata.get("modifiedTime") + if not _isRecent(modifiedTime, limits.maxAgeDays): + result.skippedPolicy += 1 + continue + + externalItemId = metadata.get("id") or entryPath + revision = modifiedTime + + await _ingestOne( + adapter=adapter, + knowledgeService=knowledgeService, + runExtractionFn=runExtractionFn, + connectionId=connectionId, + mandateId=mandateId, + userId=userId, + entry=entry, + entryPath=entryPath, + mimeType=effectiveMime, + externalItemId=externalItemId, + revision=revision, + limits=limits, + result=result, + progressCb=progressCb, + ) + + +async def _ingestOne( + *, + adapter, + knowledgeService, + runExtractionFn, + connectionId: str, + mandateId: str, + userId: str, + entry, + entryPath: str, + mimeType: str, + externalItemId: str, + revision: Optional[str], + limits: GdriveBootstrapLimits, + result: GdriveBootstrapResult, + progressCb: Optional[Callable[[int, Optional[str]], None]], +) -> None: + from modules.serviceCenter.services.serviceKnowledge.mainServiceKnowledge import IngestionJob + + syntheticFileId = _syntheticFileId(connectionId, externalItemId) + fileName = getattr(entry, "name", "") or externalItemId + + try: + downloaded = await adapter.download(entryPath) + except Exception as exc: + logger.warning("gdrive download %s failed: %s", entryPath, exc) + result.failed += 1 + result.errors.append(f"download({entryPath}): {exc}") + return + + # Adapter.download returns raw bytes today; guard DownloadResult shape too. + fileBytes: bytes + if isinstance(downloaded, (bytes, bytearray)): + fileBytes = bytes(downloaded) + else: + fileBytes = bytes(getattr(downloaded, "data", b"") or b"") + if getattr(downloaded, "mimeType", None): + mimeType = downloaded.mimeType # export may have changed the type + if not fileBytes: + result.failed += 1 + return + if len(fileBytes) > limits.maxFileSize: + result.skippedPolicy += 1 + return + + result.bytesProcessed += len(fileBytes) + + try: + extracted = runExtractionFn( + fileBytes, fileName, mimeType, + ExtractionOptions(mergeStrategy=None), + ) + except Exception as exc: + logger.warning("gdrive extraction %s failed: %s", entryPath, exc) + result.failed += 1 + result.errors.append(f"extract({entryPath}): {exc}") + return + + contentObjects = _toContentObjects(extracted, fileName) + if not contentObjects: + result.skippedPolicy += 1 + return + + try: + handle = await knowledgeService.requestIngestion( + IngestionJob( + sourceKind="gdrive_item", + sourceId=syntheticFileId, + fileName=fileName, + mimeType=mimeType, + userId=userId, + mandateId=mandateId, + contentObjects=contentObjects, + contentVersion=revision, + neutralize=limits.neutralize, + provenance={ + "connectionId": connectionId, + "authority": "google", + "service": "drive", + "externalItemId": externalItemId, + "entryPath": entryPath, + "tier": "body", + }, + ) + ) + except Exception as exc: + logger.error("gdrive ingestion %s failed: %s", entryPath, exc, exc_info=True) + result.failed += 1 + result.errors.append(f"ingest({entryPath}): {exc}") + return + + if handle.status == "duplicate": + result.skippedDuplicate += 1 + elif handle.status == "indexed": + result.indexed += 1 + else: + result.failed += 1 + + if progressCb is not None and (result.indexed + result.skippedDuplicate) % 50 == 0: + processed = result.indexed + result.skippedDuplicate + try: + progressCb( + min(90, 10 + int(80 * processed / max(1, limits.maxItems))), + f"gdrive processed={processed}", + ) + except Exception: + pass + logger.info( + "ingestion.connection.bootstrap.progress part=gdrive 
processed=%d skippedDup=%d failed=%d", + processed, result.skippedDuplicate, result.failed, + extra={ + "event": "ingestion.connection.bootstrap.progress", + "part": "gdrive", + "connectionId": connectionId, + "processed": processed, + "skippedDup": result.skippedDuplicate, + "failed": result.failed, + }, + ) + + +def _finalizeResult(connectionId: str, result: GdriveBootstrapResult, startMs: float) -> Dict[str, Any]: + durationMs = int((time.time() - startMs) * 1000) + logger.info( + "ingestion.connection.bootstrap.done part=gdrive connectionId=%s indexed=%d skippedDup=%d skippedPolicy=%d failed=%d bytes=%d durationMs=%d", + connectionId, + result.indexed, result.skippedDuplicate, result.skippedPolicy, + result.failed, result.bytesProcessed, durationMs, + extra={ + "event": "ingestion.connection.bootstrap.done", + "part": "gdrive", + "connectionId": connectionId, + "indexed": result.indexed, + "skippedDup": result.skippedDuplicate, + "skippedPolicy": result.skippedPolicy, + "failed": result.failed, + "bytes": result.bytesProcessed, + "durationMs": durationMs, + }, + ) + return { + "connectionId": result.connectionId, + "indexed": result.indexed, + "skippedDuplicate": result.skippedDuplicate, + "skippedPolicy": result.skippedPolicy, + "failed": result.failed, + "bytesProcessed": result.bytesProcessed, + "durationMs": durationMs, + "errors": result.errors[:20], + } diff --git a/modules/serviceCenter/services/serviceKnowledge/subConnectorSyncGmail.py b/modules/serviceCenter/services/serviceKnowledge/subConnectorSyncGmail.py new file mode 100644 index 00000000..21fec83d --- /dev/null +++ b/modules/serviceCenter/services/serviceKnowledge/subConnectorSyncGmail.py @@ -0,0 +1,606 @@ +# Copyright (c) 2025 Patrick Motsch +# All rights reserved. +"""Gmail bootstrap for the unified knowledge ingestion lane. + +Mirrors the Outlook pilot (see subConnectorSyncOutlook.py) but talks to Google +Mail's REST API. Messages become `sourceKind="gmail_message"` virtual documents +with header / snippet / cleaned body content-objects; attachments are optional +child jobs with `sourceKind="gmail_attachment"`. + +Idempotency: Gmail's stable `historyId` (or `internalDate` as fallback) is +passed as `contentVersion`, so rerunning the bootstrap yields +`ingestion.skipped.duplicate` for unchanged messages. +""" + +from __future__ import annotations + +import asyncio +import base64 +import hashlib +import logging +import time +from dataclasses import dataclass, field +from datetime import datetime, timedelta, timezone +from typing import Any, Callable, Dict, List, Optional + +from modules.serviceCenter.services.serviceKnowledge.subTextClean import cleanEmailBody + +logger = logging.getLogger(__name__) + +MAX_MESSAGES_DEFAULT = 500 +MAX_BODY_CHARS_DEFAULT = 8000 +MAX_ATTACHMENT_BYTES_DEFAULT = 10 * 1024 * 1024 +DEFAULT_LABELS = ("INBOX", "SENT") + + +@dataclass +class GmailBootstrapLimits: + maxMessages: int = MAX_MESSAGES_DEFAULT + labels: tuple = DEFAULT_LABELS + maxBodyChars: int = MAX_BODY_CHARS_DEFAULT + includeAttachments: bool = False + maxAttachmentBytes: int = MAX_ATTACHMENT_BYTES_DEFAULT + # Only fetch messages newer than N days. None disables filter. 
+ maxAgeDays: Optional[int] = 90 + # Content depth: "metadata" | "snippet" | "full" + mailContentDepth: str = "full" + # Pass-through to IngestionJob.neutralize + neutralize: bool = False + + +@dataclass +class GmailBootstrapResult: + connectionId: str + indexed: int = 0 + skippedDuplicate: int = 0 + skippedPolicy: int = 0 + failed: int = 0 + attachmentsIndexed: int = 0 + errors: List[str] = field(default_factory=list) + + +def _syntheticMessageId(connectionId: str, messageId: str) -> str: + token = hashlib.sha256(f"{connectionId}:{messageId}".encode("utf-8")).hexdigest()[:16] + return f"gm:{connectionId[:8]}:{token}" + + +def _syntheticAttachmentId(connectionId: str, messageId: str, attachmentId: str) -> str: + token = hashlib.sha256( + f"{connectionId}:{messageId}:{attachmentId}".encode("utf-8") + ).hexdigest()[:16] + return f"ga:{connectionId[:8]}:{token}" + + +def _decodeBase64Url(data: str) -> bytes: + if not data: + return b"" + # Gmail uses URL-safe base64 without padding. + padding = 4 - (len(data) % 4) + if padding != 4: + data = data + ("=" * padding) + try: + return base64.urlsafe_b64decode(data) + except Exception: + return b"" + + +def _walkPayloadForBody(payload: Dict[str, Any]) -> Dict[str, str]: + """Return {"text": ..., "html": ...} by walking MIME parts. + + Gmail `payload` is a tree of parts. We prefer `text/plain` for the cleaned + body, but capture `text/html` as a fallback so `cleanEmailBody` can strip + markup if plain is missing. + """ + found: Dict[str, str] = {"text": "", "html": ""} + + def _walk(part: Dict[str, Any]) -> None: + mime = (part.get("mimeType") or "").lower() + body = part.get("body") or {} + raw = body.get("data") or "" + if raw and mime.startswith("text/"): + decoded = _decodeBase64Url(raw).decode("utf-8", errors="replace") + key = "text" if mime == "text/plain" else ("html" if mime == "text/html" else "") + if key and not found[key]: + found[key] = decoded + for sub in part.get("parts") or []: + _walk(sub) + + _walk(payload or {}) + return found + + +def _headerMap(payload: Dict[str, Any]) -> Dict[str, str]: + return { + (h.get("name") or "").lower(): (h.get("value") or "") + for h in (payload.get("headers") or []) + } + + +def _buildContentObjects( + message: Dict[str, Any], + maxBodyChars: int, + mailContentDepth: str = "full", +) -> List[Dict[str, Any]]: + """Build content objects for a Gmail message. 
+ + `mailContentDepth` controls how much is embedded: + - "metadata": header only (subject, from, to, date) + - "snippet": header + Gmail snippet (~155 chars, no full body) + - "full": header + snippet + cleaned full body (default) + """ + payload = message.get("payload") or {} + headers = _headerMap(payload) + subject = headers.get("subject") or "(no subject)" + fromAddr = headers.get("from") or "" + toAddr = headers.get("to") or "" + ccAddr = headers.get("cc") or "" + date = headers.get("date") or "" + snippet = message.get("snippet") or "" + + parts: List[Dict[str, Any]] = [] + header = ( + f"Subject: {subject}\n" + f"From: {fromAddr}\n" + f"To: {toAddr}\n" + + (f"Cc: {ccAddr}\n" if ccAddr else "") + + f"Date: {date}" + ) + parts.append({ + "contentObjectId": "header", + "contentType": "text", + "data": header, + "contextRef": {"part": "header"}, + }) + if mailContentDepth in ("snippet", "full") and snippet: + parts.append({ + "contentObjectId": "snippet", + "contentType": "text", + "data": snippet, + "contextRef": {"part": "snippet"}, + }) + if mailContentDepth == "full": + bodies = _walkPayloadForBody(payload) + rawBody = bodies["text"] or bodies["html"] + cleanedBody = cleanEmailBody(rawBody, maxChars=maxBodyChars) if rawBody else "" + if cleanedBody: + parts.append({ + "contentObjectId": "body", + "contentType": "text", + "data": cleanedBody, + "contextRef": {"part": "body"}, + }) + return parts + + +async def bootstrapGmail( + connectionId: str, + *, + progressCb: Optional[Callable[[int, Optional[str]], None]] = None, + adapter: Any = None, + connection: Any = None, + knowledgeService: Any = None, + limits: Optional[GmailBootstrapLimits] = None, + googleGetFn: Optional[Callable[..., Any]] = None, +) -> Dict[str, Any]: + """Enumerate Gmail labels (INBOX + SENT default) and ingest messages.""" + from modules.serviceCenter.services.serviceKnowledge.subConnectorPrefs import loadConnectionPrefs + prefs = loadConnectionPrefs(connectionId) + + if not limits: + limits = GmailBootstrapLimits( + includeAttachments=prefs.mailIndexAttachments, + maxAgeDays=prefs.maxAgeDays if prefs.maxAgeDays > 0 else None, + mailContentDepth=prefs.mailContentDepth, + neutralize=prefs.neutralizeBeforeEmbed, + ) + + startMs = time.time() + result = GmailBootstrapResult(connectionId=connectionId) + + logger.info( + "ingestion.connection.bootstrap.started part=gmail connectionId=%s", + connectionId, + extra={ + "event": "ingestion.connection.bootstrap.started", + "part": "gmail", + "connectionId": connectionId, + }, + ) + + if adapter is None or knowledgeService is None or connection is None: + adapter, connection, knowledgeService = await _resolveDependencies(connectionId) + + if googleGetFn is None: + from modules.connectors.providerGoogle.connectorGoogle import _googleGet as _defaultGet + + token = getattr(adapter, "_token", "") + + async def googleGetFn(url: str) -> Dict[str, Any]: # type: ignore[no-redef] + return await _defaultGet(token, url) + + mandateId = str(getattr(connection, "mandateId", "") or "") if connection is not None else "" + userId = str(getattr(connection, "userId", "") or "") if connection is not None else "" + + for labelId in limits.labels: + if result.indexed + result.skippedDuplicate >= limits.maxMessages: + break + try: + await _ingestLabel( + googleGetFn=googleGetFn, + knowledgeService=knowledgeService, + connectionId=connectionId, + mandateId=mandateId, + userId=userId, + labelId=labelId, + limits=limits, + result=result, + progressCb=progressCb, + ) + except Exception as exc: + 
logger.error("gmail ingestion label %s failed: %s", labelId, exc, exc_info=True) + result.errors.append(f"label({labelId}): {exc}") + + return _finalizeResult(connectionId, result, startMs) + + +async def _resolveDependencies(connectionId: str): + from modules.interfaces.interfaceDbApp import getRootInterface + from modules.auth import TokenManager + from modules.connectors.providerGoogle.connectorGoogle import GoogleConnector + from modules.serviceCenter import getService + from modules.serviceCenter.context import ServiceCenterContext + from modules.security.rootAccess import getRootUser + + rootInterface = getRootInterface() + connection = rootInterface.getUserConnectionById(connectionId) + if connection is None: + raise ValueError(f"UserConnection not found: {connectionId}") + + token = TokenManager().getFreshToken(connectionId) + if not token or not token.tokenAccess: + raise ValueError(f"No valid token for connection {connectionId}") + + provider = GoogleConnector(connection, token.tokenAccess) + adapter = provider.getServiceAdapter("gmail") + + rootUser = getRootUser() + ctx = ServiceCenterContext( + user=rootUser, + mandate_id=str(getattr(connection, "mandateId", "") or ""), + ) + knowledgeService = getService("knowledge", ctx) + return adapter, connection, knowledgeService + + +async def _ingestLabel( + *, + googleGetFn, + knowledgeService, + connectionId: str, + mandateId: str, + userId: str, + labelId: str, + limits: GmailBootstrapLimits, + result: GmailBootstrapResult, + progressCb: Optional[Callable[[int, Optional[str]], None]], +) -> None: + remaining = limits.maxMessages - (result.indexed + result.skippedDuplicate) + if remaining <= 0: + return + + pageSize = min(100, remaining) + query = "" + if limits.maxAgeDays: + cutoff = datetime.now(timezone.utc) - timedelta(days=limits.maxAgeDays) + # Gmail uses YYYY/MM/DD. 
+ query = f"after:{cutoff.strftime('%Y/%m/%d')}" + + baseUrl = ( + "https://gmail.googleapis.com/gmail/v1/users/me/messages" + f"?labelIds={labelId}&maxResults={pageSize}" + ) + if query: + baseUrl = f"{baseUrl}&q={query}" + + nextPageToken: Optional[str] = None + while (result.indexed + result.skippedDuplicate) < limits.maxMessages: + url = baseUrl if not nextPageToken else f"{baseUrl}&pageToken={nextPageToken}" + page = await googleGetFn(url) + if not isinstance(page, dict) or "error" in page: + err = (page or {}).get("error") if isinstance(page, dict) else "unknown" + logger.warning("gmail list page error for label %s: %s", labelId, err) + result.errors.append(f"list({labelId}): {err}") + return + + messageStubs = page.get("messages") or [] + for stub in messageStubs: + if result.indexed + result.skippedDuplicate >= limits.maxMessages: + break + msgId = stub.get("id") + if not msgId: + continue + detailUrl = ( + f"https://gmail.googleapis.com/gmail/v1/users/me/messages/{msgId}?format=full" + ) + detail = await googleGetFn(detailUrl) + if not isinstance(detail, dict) or "error" in detail: + result.failed += 1 + continue + await _ingestMessage( + googleGetFn=googleGetFn, + knowledgeService=knowledgeService, + connectionId=connectionId, + mandateId=mandateId, + userId=userId, + labelId=labelId, + message=detail, + limits=limits, + result=result, + progressCb=progressCb, + ) + + nextPageToken = page.get("nextPageToken") + if not nextPageToken: + break + + +async def _ingestMessage( + *, + googleGetFn, + knowledgeService, + connectionId: str, + mandateId: str, + userId: str, + labelId: str, + message: Dict[str, Any], + limits: GmailBootstrapLimits, + result: GmailBootstrapResult, + progressCb: Optional[Callable[[int, Optional[str]], None]], +) -> None: + from modules.serviceCenter.services.serviceKnowledge.mainServiceKnowledge import IngestionJob + + messageId = message.get("id") + if not messageId: + result.skippedPolicy += 1 + return + revision = message.get("historyId") or message.get("internalDate") + headers = _headerMap(message.get("payload") or {}) + subject = headers.get("subject") or "(no subject)" + syntheticId = _syntheticMessageId(connectionId, messageId) + fileName = f"{subject[:80].strip()}.eml" if subject else f"{messageId}.eml" + + contentObjects = _buildContentObjects( + message, limits.maxBodyChars, mailContentDepth=limits.mailContentDepth + ) + try: + handle = await knowledgeService.requestIngestion( + IngestionJob( + sourceKind="gmail_message", + sourceId=syntheticId, + fileName=fileName, + mimeType="message/rfc822", + userId=userId, + mandateId=mandateId, + contentObjects=contentObjects, + contentVersion=str(revision) if revision else None, + neutralize=limits.neutralize, + provenance={ + "connectionId": connectionId, + "authority": "google", + "service": "gmail", + "externalItemId": messageId, + "label": labelId, + "threadId": message.get("threadId"), + "tier": limits.mailContentDepth, + }, + ) + ) + except Exception as exc: + logger.error("gmail ingestion %s failed: %s", messageId, exc, exc_info=True) + result.failed += 1 + result.errors.append(f"ingest({messageId}): {exc}") + return + + if handle.status == "duplicate": + result.skippedDuplicate += 1 + elif handle.status == "indexed": + result.indexed += 1 + else: + result.failed += 1 + + if limits.includeAttachments: + try: + await _ingestAttachments( + googleGetFn=googleGetFn, + knowledgeService=knowledgeService, + connectionId=connectionId, + mandateId=mandateId, + userId=userId, + message=message, + 
parentSyntheticId=syntheticId, + limits=limits, + result=result, + ) + except Exception as exc: + logger.warning("gmail attachments %s failed: %s", messageId, exc) + result.errors.append(f"attachments({messageId}): {exc}") + + if progressCb is not None and (result.indexed + result.skippedDuplicate) % 50 == 0: + processed = result.indexed + result.skippedDuplicate + try: + progressCb( + min(90, 10 + int(80 * processed / max(1, limits.maxMessages))), + f"gmail processed={processed}", + ) + except Exception: + pass + logger.info( + "ingestion.connection.bootstrap.progress part=gmail processed=%d skippedDup=%d failed=%d", + processed, result.skippedDuplicate, result.failed, + extra={ + "event": "ingestion.connection.bootstrap.progress", + "part": "gmail", + "connectionId": connectionId, + "processed": processed, + "skippedDup": result.skippedDuplicate, + "failed": result.failed, + }, + ) + + await asyncio.sleep(0) + + +async def _ingestAttachments( + *, + googleGetFn, + knowledgeService, + connectionId: str, + mandateId: str, + userId: str, + message: Dict[str, Any], + parentSyntheticId: str, + limits: GmailBootstrapLimits, + result: GmailBootstrapResult, +) -> None: + """Child ingestion jobs for file attachments. Skips inline images (cid: refs).""" + from modules.serviceCenter.services.serviceKnowledge.mainServiceKnowledge import IngestionJob + from modules.datamodels.datamodelExtraction import ExtractionOptions + from modules.serviceCenter.services.serviceExtraction.subPipeline import runExtraction + from modules.serviceCenter.services.serviceExtraction.subRegistry import ( + ExtractorRegistry, ChunkerRegistry, + ) + + messageId = message.get("id") or "" + + def _collectAttachmentStubs(part: Dict[str, Any], acc: List[Dict[str, Any]]) -> None: + filename = part.get("filename") or "" + body = part.get("body") or {} + attId = body.get("attachmentId") + if filename and attId: + acc.append({ + "filename": filename, + "mimeType": part.get("mimeType") or "application/octet-stream", + "attachmentId": attId, + "size": int(body.get("size") or 0), + }) + for sub in part.get("parts") or []: + _collectAttachmentStubs(sub, acc) + + stubs: List[Dict[str, Any]] = [] + _collectAttachmentStubs(message.get("payload") or {}, stubs) + if not stubs: + return + + extractorRegistry = ExtractorRegistry() + chunkerRegistry = ChunkerRegistry() + + for stub in stubs: + if stub["size"] and stub["size"] > limits.maxAttachmentBytes: + result.skippedPolicy += 1 + continue + attUrl = ( + f"https://gmail.googleapis.com/gmail/v1/users/me/messages/{messageId}" + f"/attachments/{stub['attachmentId']}" + ) + detail = await googleGetFn(attUrl) + if not isinstance(detail, dict) or "error" in detail: + result.failed += 1 + continue + rawBytes = _decodeBase64Url(detail.get("data") or "") + if not rawBytes: + continue + fileName = stub["filename"] + mimeType = stub["mimeType"] + syntheticId = _syntheticAttachmentId(connectionId, messageId, stub["attachmentId"]) + + try: + extracted = runExtraction( + extractorRegistry, chunkerRegistry, + rawBytes, fileName, mimeType, + ExtractionOptions(mergeStrategy=None), + ) + except Exception as exc: + logger.warning("gmail attachment extract %s failed: %s", stub["attachmentId"], exc) + result.failed += 1 + continue + + contentObjects: List[Dict[str, Any]] = [] + for part in getattr(extracted, "parts", None) or []: + data = getattr(part, "data", None) or "" + if not data or not str(data).strip(): + continue + typeGroup = getattr(part, "typeGroup", "text") or "text" + contentType = "text" + if 
typeGroup == "image": + contentType = "image" + elif typeGroup in ("binary", "container"): + contentType = "other" + contentObjects.append({ + "contentObjectId": getattr(part, "id", ""), + "contentType": contentType, + "data": data, + "contextRef": { + "containerPath": fileName, + "location": getattr(part, "label", None) or "attachment", + **(getattr(part, "metadata", None) or {}), + }, + }) + if not contentObjects: + result.skippedPolicy += 1 + continue + + try: + await knowledgeService.requestIngestion( + IngestionJob( + sourceKind="gmail_attachment", + sourceId=syntheticId, + fileName=fileName, + mimeType=mimeType, + userId=userId, + mandateId=mandateId, + contentObjects=contentObjects, + provenance={ + "connectionId": connectionId, + "authority": "google", + "service": "gmail", + "parentId": parentSyntheticId, + "externalItemId": stub["attachmentId"], + "parentMessageId": messageId, + }, + ) + ) + result.attachmentsIndexed += 1 + except Exception as exc: + logger.warning("gmail attachment ingest %s failed: %s", stub["attachmentId"], exc) + result.failed += 1 + + +def _finalizeResult(connectionId: str, result: GmailBootstrapResult, startMs: float) -> Dict[str, Any]: + durationMs = int((time.time() - startMs) * 1000) + logger.info( + "ingestion.connection.bootstrap.done part=gmail connectionId=%s indexed=%d skippedDup=%d skippedPolicy=%d attachments=%d failed=%d durationMs=%d", + connectionId, + result.indexed, result.skippedDuplicate, result.skippedPolicy, + result.attachmentsIndexed, result.failed, durationMs, + extra={ + "event": "ingestion.connection.bootstrap.done", + "part": "gmail", + "connectionId": connectionId, + "indexed": result.indexed, + "skippedDup": result.skippedDuplicate, + "skippedPolicy": result.skippedPolicy, + "attachmentsIndexed": result.attachmentsIndexed, + "failed": result.failed, + "durationMs": durationMs, + }, + ) + return { + "connectionId": result.connectionId, + "indexed": result.indexed, + "skippedDuplicate": result.skippedDuplicate, + "skippedPolicy": result.skippedPolicy, + "attachmentsIndexed": result.attachmentsIndexed, + "failed": result.failed, + "durationMs": durationMs, + "errors": result.errors[:20], + } diff --git a/modules/serviceCenter/services/serviceKnowledge/subConnectorSyncOutlook.py b/modules/serviceCenter/services/serviceKnowledge/subConnectorSyncOutlook.py new file mode 100644 index 00000000..64a3545f --- /dev/null +++ b/modules/serviceCenter/services/serviceKnowledge/subConnectorSyncOutlook.py @@ -0,0 +1,576 @@ +# Copyright (c) 2025 Patrick Motsch +# All rights reserved. +"""Outlook bootstrap for the unified knowledge ingestion lane. + +Unlike SharePoint, Outlook messages are "virtual documents" — we never persist +file bytes in the store. Each message becomes a `sourceKind="outlook_message"` +IngestionJob whose `contentObjects` carry the header, snippet and cleaned body +so retrieval can show a compact answer without fetching Graph again. + +Attachments are optional (`includeAttachments` limit flag) and enqueued as +child jobs with `sourceKind="outlook_attachment"` + `provenance.parentId`. 
+""" + +from __future__ import annotations + +import asyncio +import hashlib +import logging +import time +from dataclasses import dataclass, field +from typing import Any, Callable, Dict, List, Optional + +from modules.serviceCenter.services.serviceKnowledge.subTextClean import cleanEmailBody + +logger = logging.getLogger(__name__) + +MAX_MESSAGES_DEFAULT = 500 +MAX_FOLDERS_DEFAULT = 5 +MAX_BODY_CHARS_DEFAULT = 8000 +MAX_ATTACHMENT_BYTES_DEFAULT = 10 * 1024 * 1024 +WELL_KNOWN_FOLDERS = ("inbox", "sentitems") + + +@dataclass +class OutlookBootstrapLimits: + maxMessages: int = MAX_MESSAGES_DEFAULT + maxFolders: int = MAX_FOLDERS_DEFAULT + maxBodyChars: int = MAX_BODY_CHARS_DEFAULT + includeAttachments: bool = False + maxAttachmentBytes: int = MAX_ATTACHMENT_BYTES_DEFAULT + # Only fetch messages newer than N days. None disables filter. + maxAgeDays: Optional[int] = 90 + # Content depth: "metadata" | "snippet" | "full" + mailContentDepth: str = "full" + # Pass-through to IngestionJob.neutralize + neutralize: bool = False + + +@dataclass +class OutlookBootstrapResult: + connectionId: str + indexed: int = 0 + skippedDuplicate: int = 0 + skippedPolicy: int = 0 + failed: int = 0 + attachmentsIndexed: int = 0 + errors: List[str] = field(default_factory=list) + + +def _syntheticMessageId(connectionId: str, messageId: str) -> str: + token = hashlib.sha256(f"{connectionId}:{messageId}".encode("utf-8")).hexdigest()[:16] + return f"om:{connectionId[:8]}:{token}" + + +def _syntheticAttachmentId(connectionId: str, messageId: str, attachmentId: str) -> str: + token = hashlib.sha256( + f"{connectionId}:{messageId}:{attachmentId}".encode("utf-8") + ).hexdigest()[:16] + return f"oa:{connectionId[:8]}:{token}" + + +def _extractRecipient(recipient: Dict[str, Any]) -> str: + email = (recipient or {}).get("emailAddress") or {} + name = email.get("name") or "" + addr = email.get("address") or "" + if name and addr: + return f"{name} <{addr}>" + return addr or name + + +def _joinRecipients(recipients: List[Dict[str, Any]]) -> str: + return ", ".join(filter(None, [_extractRecipient(r) for r in recipients or []])) + + +def _buildContentObjects( + message: Dict[str, Any], + maxBodyChars: int, + mailContentDepth: str = "full", +) -> List[Dict[str, Any]]: + """Build content objects for an Outlook message. 
+ + `mailContentDepth` mirrors the Gmail walker: + - "metadata": header only + - "snippet": header + bodyPreview (~255 chars) + - "full": header + snippet + cleaned body (default) + """ + subject = message.get("subject") or "(no subject)" + fromAddr = _extractRecipient(message.get("from") or {}) + toAddr = _joinRecipients(message.get("toRecipients") or []) + ccAddr = _joinRecipients(message.get("ccRecipients") or []) + received = message.get("receivedDateTime") or "" + snippet = message.get("bodyPreview") or "" + + parts: List[Dict[str, Any]] = [] + header = ( + f"Subject: {subject}\n" + f"From: {fromAddr}\n" + f"To: {toAddr}\n" + + (f"Cc: {ccAddr}\n" if ccAddr else "") + + f"Date: {received}" + ) + parts.append({ + "contentObjectId": "header", + "contentType": "text", + "data": header, + "contextRef": {"part": "header"}, + }) + if mailContentDepth in ("snippet", "full") and snippet: + parts.append({ + "contentObjectId": "snippet", + "contentType": "text", + "data": snippet, + "contextRef": {"part": "snippet"}, + }) + if mailContentDepth == "full": + body = message.get("body") or {} + bodyContent = body.get("content") or "" + cleanedBody = cleanEmailBody(bodyContent, maxChars=maxBodyChars) if bodyContent else "" + if cleanedBody: + parts.append({ + "contentObjectId": "body", + "contentType": "text", + "data": cleanedBody, + "contextRef": {"part": "body"}, + }) + return parts + + +async def bootstrapOutlook( + connectionId: str, + *, + progressCb: Optional[Callable[[int, Optional[str]], None]] = None, + adapter: Any = None, + connection: Any = None, + knowledgeService: Any = None, + limits: Optional[OutlookBootstrapLimits] = None, +) -> Dict[str, Any]: + """Enumerate Outlook folders (inbox + sent by default) and ingest messages.""" + from modules.serviceCenter.services.serviceKnowledge.subConnectorPrefs import loadConnectionPrefs + prefs = loadConnectionPrefs(connectionId) + + if not limits: + limits = OutlookBootstrapLimits( + includeAttachments=prefs.mailIndexAttachments, + maxAgeDays=prefs.maxAgeDays if prefs.maxAgeDays > 0 else None, + mailContentDepth=prefs.mailContentDepth, + neutralize=prefs.neutralizeBeforeEmbed, + ) + + startMs = time.time() + result = OutlookBootstrapResult(connectionId=connectionId) + + logger.info( + "ingestion.connection.bootstrap.started part=outlook connectionId=%s", + connectionId, + extra={ + "event": "ingestion.connection.bootstrap.started", + "part": "outlook", + "connectionId": connectionId, + }, + ) + + if adapter is None or knowledgeService is None or connection is None: + adapter, connection, knowledgeService = await _resolveDependencies(connectionId) + + mandateId = str(getattr(connection, "mandateId", "") or "") if connection is not None else "" + userId = str(getattr(connection, "userId", "") or "") if connection is not None else "" + + folderIds = await _selectFolderIds(adapter, limits) + for folderId in folderIds: + if result.indexed + result.skippedDuplicate >= limits.maxMessages: + break + try: + await _ingestFolder( + adapter=adapter, + knowledgeService=knowledgeService, + connectionId=connectionId, + mandateId=mandateId, + userId=userId, + folderId=folderId, + limits=limits, + result=result, + progressCb=progressCb, + ) + except Exception as exc: + logger.error("outlook ingestion folder %s failed: %s", folderId, exc, exc_info=True) + result.errors.append(f"folder({folderId}): {exc}") + + return _finalizeResult(connectionId, result, startMs) + + +async def _resolveDependencies(connectionId: str): + from modules.interfaces.interfaceDbApp import 
getRootInterface + from modules.auth import TokenManager + from modules.connectors.providerMsft.connectorMsft import MsftConnector + from modules.serviceCenter import getService + from modules.serviceCenter.context import ServiceCenterContext + from modules.security.rootAccess import getRootUser + + rootInterface = getRootInterface() + connection = rootInterface.getUserConnectionById(connectionId) + if connection is None: + raise ValueError(f"UserConnection not found: {connectionId}") + + token = TokenManager().getFreshToken(connectionId) + if not token or not token.tokenAccess: + raise ValueError(f"No valid token for connection {connectionId}") + + provider = MsftConnector(connection, token.tokenAccess) + adapter = provider.getServiceAdapter("outlook") + + rootUser = getRootUser() + ctx = ServiceCenterContext( + user=rootUser, + mandate_id=str(getattr(connection, "mandateId", "") or ""), + ) + knowledgeService = getService("knowledge", ctx) + return adapter, connection, knowledgeService + + +async def _selectFolderIds(adapter, limits: OutlookBootstrapLimits) -> List[str]: + """Prefer well-known folders (inbox, sentitems); fall back to browse().""" + folderIds: List[str] = [] + for wellKnown in WELL_KNOWN_FOLDERS: + if len(folderIds) >= limits.maxFolders: + break + try: + row = await adapter._graphGet(f"me/mailFolders/{wellKnown}") + except Exception: + row = None + if isinstance(row, dict) and "error" not in row and row.get("id"): + folderIds.append(row["id"]) + + if len(folderIds) < limits.maxFolders: + try: + entries = await adapter.browse("/") + except Exception: + entries = [] + for entry in entries: + metadata = getattr(entry, "metadata", {}) or {} + fid = metadata.get("id") + if fid and fid not in folderIds: + folderIds.append(fid) + if len(folderIds) >= limits.maxFolders: + break + return folderIds + + +async def _ingestFolder( + *, + adapter, + knowledgeService, + connectionId: str, + mandateId: str, + userId: str, + folderId: str, + limits: OutlookBootstrapLimits, + result: OutlookBootstrapResult, + progressCb: Optional[Callable[[int, Optional[str]], None]], +) -> None: + remaining = limits.maxMessages - (result.indexed + result.skippedDuplicate) + if remaining <= 0: + return + + pageSize = min(100, remaining) + select = ( + "id,subject,from,toRecipients,ccRecipients,receivedDateTime," + "bodyPreview,body,internetMessageId,hasAttachments,changeKey" + ) + endpoint: Optional[str] = ( + f"me/mailFolders/{folderId}/messages" + f"?$top={pageSize}&$orderby=receivedDateTime desc&$select={select}" + ) + + # Keep header-based age filter in Graph itself to avoid shipping ancient + # messages we'd discard client-side. 
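+    # Illustrative sketch (cutoff assumed): maxAgeDays=90 evaluated around
+    # 2025-03-01 appends
+    #   &$filter=receivedDateTime ge 2024-12-01T10:15:00Z
+    # so Graph drops anything older before the page is even returned.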
+ if limits.maxAgeDays: + from datetime import datetime, timezone, timedelta + + cutoff = datetime.now(timezone.utc) - timedelta(days=limits.maxAgeDays) + cutoffIso = cutoff.strftime("%Y-%m-%dT%H:%M:%SZ") + endpoint = f"{endpoint}&$filter=receivedDateTime ge {cutoffIso}" + + while endpoint and (result.indexed + result.skippedDuplicate) < limits.maxMessages: + try: + page = await adapter._graphGet(endpoint) + except Exception as exc: + logger.warning("outlook graph page failed for folder %s: %s", folderId, exc) + result.errors.append(f"graph({folderId}): {exc}") + return + if not isinstance(page, dict) or "error" in page: + err = (page or {}).get("error") if isinstance(page, dict) else "unknown" + logger.warning("outlook graph page error for folder %s: %s", folderId, err) + result.errors.append(f"graph({folderId}): {err}") + return + + for message in page.get("value", []) or []: + if result.indexed + result.skippedDuplicate >= limits.maxMessages: + break + await _ingestMessage( + adapter=adapter, + knowledgeService=knowledgeService, + connectionId=connectionId, + mandateId=mandateId, + userId=userId, + message=message, + limits=limits, + result=result, + progressCb=progressCb, + ) + + nextLink = page.get("@odata.nextLink") + if not nextLink: + break + # Strip Graph base so adapter._graphGet accepts the relative path. + from modules.connectors.providerMsft.connectorMsft import _stripGraphBase + + endpoint = _stripGraphBase(nextLink) + + +async def _ingestMessage( + *, + adapter, + knowledgeService, + connectionId: str, + mandateId: str, + userId: str, + message: Dict[str, Any], + limits: OutlookBootstrapLimits, + result: OutlookBootstrapResult, + progressCb: Optional[Callable[[int, Optional[str]], None]], +) -> None: + from modules.serviceCenter.services.serviceKnowledge.mainServiceKnowledge import IngestionJob + + messageId = message.get("id") + if not messageId: + result.skippedPolicy += 1 + return + revision = message.get("changeKey") or message.get("internetMessageId") + subject = message.get("subject") or "(no subject)" + syntheticId = _syntheticMessageId(connectionId, messageId) + fileName = f"{subject[:80].strip()}.eml" if subject else f"{messageId}.eml" + + contentObjects = _buildContentObjects( + message, limits.maxBodyChars, mailContentDepth=limits.mailContentDepth + ) + # Always at least the header is emitted, so `contentObjects` is non-empty. 
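+    # Shape sketch (illustrative data) of what _buildContentObjects produced:
+    #   [{"contentObjectId": "header", "contentType": "text",
+    #     "data": "Subject: ...\nFrom: ...", "contextRef": {"part": "header"}},
+    #    {"contentObjectId": "body",   "contentType": "text",
+    #     "data": "<cleaned body>",    "contextRef": {"part": "body"}}]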
+ try: + handle = await knowledgeService.requestIngestion( + IngestionJob( + sourceKind="outlook_message", + sourceId=syntheticId, + fileName=fileName, + mimeType="message/rfc822", + userId=userId, + mandateId=mandateId, + contentObjects=contentObjects, + contentVersion=revision, + neutralize=limits.neutralize, + provenance={ + "connectionId": connectionId, + "authority": "msft", + "service": "outlook", + "externalItemId": messageId, + "internetMessageId": message.get("internetMessageId"), + "tier": limits.mailContentDepth, + }, + ) + ) + except Exception as exc: + logger.error("outlook ingestion %s failed: %s", messageId, exc, exc_info=True) + result.failed += 1 + result.errors.append(f"ingest({messageId}): {exc}") + return + + if handle.status == "duplicate": + result.skippedDuplicate += 1 + elif handle.status == "indexed": + result.indexed += 1 + else: + result.failed += 1 + + if limits.includeAttachments and message.get("hasAttachments"): + try: + await _ingestAttachments( + adapter=adapter, + knowledgeService=knowledgeService, + connectionId=connectionId, + mandateId=mandateId, + userId=userId, + messageId=messageId, + parentSyntheticId=syntheticId, + limits=limits, + result=result, + ) + except Exception as exc: + logger.warning("outlook attachments %s failed: %s", messageId, exc) + result.errors.append(f"attachments({messageId}): {exc}") + + if progressCb is not None and (result.indexed + result.skippedDuplicate) % 50 == 0: + processed = result.indexed + result.skippedDuplicate + try: + progressCb( + min(90, 10 + int(80 * processed / max(1, limits.maxMessages))), + f"outlook processed={processed}", + ) + except Exception: + pass + logger.info( + "ingestion.connection.bootstrap.progress part=outlook processed=%d skippedDup=%d failed=%d", + processed, result.skippedDuplicate, result.failed, + extra={ + "event": "ingestion.connection.bootstrap.progress", + "part": "outlook", + "connectionId": connectionId, + "processed": processed, + "skippedDup": result.skippedDuplicate, + "failed": result.failed, + }, + ) + + await asyncio.sleep(0) + + +async def _ingestAttachments( + *, + adapter, + knowledgeService, + connectionId: str, + mandateId: str, + userId: str, + messageId: str, + parentSyntheticId: str, + limits: OutlookBootstrapLimits, + result: OutlookBootstrapResult, +) -> None: + """Child ingestion jobs for file attachments (skip inline & oversized).""" + from modules.serviceCenter.services.serviceKnowledge.mainServiceKnowledge import IngestionJob + from modules.datamodels.datamodelExtraction import ExtractionOptions + from modules.serviceCenter.services.serviceExtraction.subPipeline import runExtraction + from modules.serviceCenter.services.serviceExtraction.subRegistry import ( + ExtractorRegistry, ChunkerRegistry, + ) + import base64 + + page = await adapter._graphGet(f"me/messages/{messageId}/attachments") + if not isinstance(page, dict) or "error" in page: + return + + extractorRegistry = ExtractorRegistry() + chunkerRegistry = ChunkerRegistry() + + for attachment in page.get("value", []) or []: + if attachment.get("@odata.type") != "#microsoft.graph.fileAttachment": + continue + if attachment.get("isInline"): + continue + size = int(attachment.get("size") or 0) + if size and size > limits.maxAttachmentBytes: + result.skippedPolicy += 1 + continue + contentBytesB64 = attachment.get("contentBytes") + if not contentBytesB64: + continue + try: + rawBytes = base64.b64decode(contentBytesB64) + except Exception: + result.skippedPolicy += 1 + continue + fileName = attachment.get("name") 
or "attachment" + mimeType = attachment.get("contentType") or "application/octet-stream" + attachmentId = attachment.get("id") or fileName + syntheticId = _syntheticAttachmentId(connectionId, messageId, attachmentId) + + try: + extracted = runExtraction( + extractorRegistry, chunkerRegistry, + rawBytes, fileName, mimeType, + ExtractionOptions(mergeStrategy=None), + ) + except Exception as exc: + logger.warning("outlook attachment extract %s failed: %s", attachmentId, exc) + result.failed += 1 + continue + + contentObjects: List[Dict[str, Any]] = [] + for part in getattr(extracted, "parts", None) or []: + data = getattr(part, "data", None) or "" + if not data or not str(data).strip(): + continue + typeGroup = getattr(part, "typeGroup", "text") or "text" + contentType = "text" + if typeGroup == "image": + contentType = "image" + elif typeGroup in ("binary", "container"): + contentType = "other" + contentObjects.append({ + "contentObjectId": getattr(part, "id", ""), + "contentType": contentType, + "data": data, + "contextRef": { + "containerPath": fileName, + "location": getattr(part, "label", None) or "attachment", + **(getattr(part, "metadata", None) or {}), + }, + }) + if not contentObjects: + result.skippedPolicy += 1 + continue + + try: + await knowledgeService.requestIngestion( + IngestionJob( + sourceKind="outlook_attachment", + sourceId=syntheticId, + fileName=fileName, + mimeType=mimeType, + userId=userId, + mandateId=mandateId, + contentObjects=contentObjects, + neutralize=limits.neutralize, + provenance={ + "connectionId": connectionId, + "authority": "msft", + "service": "outlook", + "parentId": parentSyntheticId, + "externalItemId": attachmentId, + "parentMessageId": messageId, + }, + ) + ) + result.attachmentsIndexed += 1 + except Exception as exc: + logger.warning("outlook attachment ingest %s failed: %s", attachmentId, exc) + result.failed += 1 + + +def _finalizeResult(connectionId: str, result: OutlookBootstrapResult, startMs: float) -> Dict[str, Any]: + durationMs = int((time.time() - startMs) * 1000) + logger.info( + "ingestion.connection.bootstrap.done part=outlook connectionId=%s indexed=%d skippedDup=%d skippedPolicy=%d attachments=%d failed=%d durationMs=%d", + connectionId, + result.indexed, result.skippedDuplicate, result.skippedPolicy, + result.attachmentsIndexed, result.failed, durationMs, + extra={ + "event": "ingestion.connection.bootstrap.done", + "part": "outlook", + "connectionId": connectionId, + "indexed": result.indexed, + "skippedDup": result.skippedDuplicate, + "skippedPolicy": result.skippedPolicy, + "attachmentsIndexed": result.attachmentsIndexed, + "failed": result.failed, + "durationMs": durationMs, + }, + ) + return { + "connectionId": result.connectionId, + "indexed": result.indexed, + "skippedDuplicate": result.skippedDuplicate, + "skippedPolicy": result.skippedPolicy, + "attachmentsIndexed": result.attachmentsIndexed, + "failed": result.failed, + "durationMs": durationMs, + "errors": result.errors[:20], + } diff --git a/modules/serviceCenter/services/serviceKnowledge/subConnectorSyncSharepoint.py b/modules/serviceCenter/services/serviceKnowledge/subConnectorSyncSharepoint.py new file mode 100644 index 00000000..07fef7a8 --- /dev/null +++ b/modules/serviceCenter/services/serviceKnowledge/subConnectorSyncSharepoint.py @@ -0,0 +1,433 @@ +# Copyright (c) 2025 Patrick Motsch +# All rights reserved. +"""SharePoint bootstrap for the unified knowledge ingestion lane. 
+ +Walks the SharePoint drive(s) reachable via a UserConnection, downloads each +file-like item, runs the standard content extraction pipeline and hands the +result to `KnowledgeService.requestIngestion`. Idempotency is provided by the +ingestion façade itself; repeat bootstraps therefore produce +`ingestion.skipped.duplicate` for every unchanged item because we pass the +Graph `eTag` as `contentVersion`. +""" + +from __future__ import annotations + +import asyncio +import hashlib +import logging +import time +from dataclasses import dataclass, field +from typing import Any, Callable, Dict, List, Optional + +from modules.datamodels.datamodelExtraction import ExtractionOptions + +logger = logging.getLogger(__name__) + +MAX_ITEMS_DEFAULT = 500 +MAX_BYTES_DEFAULT = 200 * 1024 * 1024 +MAX_FILE_SIZE_DEFAULT = 25 * 1024 * 1024 +SKIP_MIME_PREFIXES_DEFAULT = ("video/", "audio/") +MAX_DEPTH_DEFAULT = 4 +MAX_SITES_DEFAULT = 3 + + +@dataclass +class SharepointBootstrapLimits: + maxItems: int = MAX_ITEMS_DEFAULT + maxBytes: int = MAX_BYTES_DEFAULT + maxFileSize: int = MAX_FILE_SIZE_DEFAULT + skipMimePrefixes: tuple = SKIP_MIME_PREFIXES_DEFAULT + maxDepth: int = MAX_DEPTH_DEFAULT + maxSites: int = MAX_SITES_DEFAULT + # Pass-through to IngestionJob.neutralize + neutralize: bool = False + + +@dataclass +class SharepointBootstrapResult: + connectionId: str + indexed: int = 0 + skippedDuplicate: int = 0 + skippedPolicy: int = 0 + failed: int = 0 + bytesProcessed: int = 0 + errors: List[str] = field(default_factory=list) + + +def _syntheticFileId(connectionId: str, externalItemId: str) -> str: + """Deterministic synthetic FileContentIndex id for a SharePoint item. + + Stable across bootstraps → idempotency works; independent of file name so + moves/renames don't duplicate chunks. + """ + token = hashlib.sha256(f"{connectionId}:{externalItemId}".encode("utf-8")).hexdigest()[:16] + return f"sp:{connectionId[:8]}:{token}" + + +def _toContentObjects(extracted, fileName: str) -> List[Dict[str, Any]]: + """Translate ExtractionResult → content objects accepted by requestIngestion.""" + parts = getattr(extracted, "parts", None) or [] + out: List[Dict[str, Any]] = [] + for part in parts: + data = getattr(part, "data", None) or "" + if not data or not str(data).strip(): + continue + typeGroup = getattr(part, "typeGroup", "text") or "text" + contentType = "text" + if typeGroup == "image": + contentType = "image" + elif typeGroup in ("binary", "container"): + contentType = "other" + out.append({ + "contentObjectId": getattr(part, "id", ""), + "contentType": contentType, + "data": data, + "contextRef": { + "containerPath": fileName, + "location": getattr(part, "label", None) or "file", + **(getattr(part, "metadata", None) or {}), + }, + }) + return out + + +async def bootstrapSharepoint( + connectionId: str, + *, + progressCb: Optional[Callable[[int, Optional[str]], None]] = None, + adapter: Any = None, + connection: Any = None, + knowledgeService: Any = None, + limits: Optional[SharepointBootstrapLimits] = None, + runExtractionFn: Optional[Callable[..., Any]] = None, +) -> Dict[str, Any]: + """Enumerate SharePoint drives and ingest every reachable file via the façade. + + Parameters allow injection for tests; production callers pass only + `connectionId` (and optionally a progressCb) and everything else is + resolved against the registered services. 
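+
+    Returns a summary dict (see `_finalizeResult`): indexed / skippedDuplicate /
+    skippedPolicy / failed / bytesProcessed / durationMs and up to 20 errors.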
+ """ + from modules.serviceCenter.services.serviceKnowledge.subConnectorPrefs import loadConnectionPrefs + prefs = loadConnectionPrefs(connectionId) + + if not limits: + limits = SharepointBootstrapLimits(neutralize=prefs.neutralizeBeforeEmbed) + + startMs = time.time() + result = SharepointBootstrapResult(connectionId=connectionId) + + logger.info( + "ingestion.connection.bootstrap.started part=sharepoint connectionId=%s", + connectionId, + extra={ + "event": "ingestion.connection.bootstrap.started", + "part": "sharepoint", + "connectionId": connectionId, + }, + ) + + if adapter is None or knowledgeService is None or connection is None: + adapter, connection, knowledgeService = await _resolveDependencies(connectionId) + if runExtractionFn is None: + from modules.serviceCenter.services.serviceExtraction.subPipeline import runExtraction + from modules.serviceCenter.services.serviceExtraction.subRegistry import ( + ExtractorRegistry, ChunkerRegistry, + ) + extractorRegistry = ExtractorRegistry() + chunkerRegistry = ChunkerRegistry() + + def runExtractionFn(bytesData, name, mime, options): # type: ignore[no-redef] + return runExtraction(extractorRegistry, chunkerRegistry, bytesData, name, mime, options) + + mandateId = str(getattr(connection, "mandateId", "") or "") if connection is not None else "" + userId = str(getattr(connection, "userId", "") or "") if connection is not None else "" + + try: + sites = await adapter.browse("/", limit=limits.maxSites) + except Exception as exc: + logger.error("sharepoint site discovery failed for %s: %s", connectionId, exc, exc_info=True) + result.errors.append(f"site_discovery: {exc}") + return _finalizeResult(connectionId, result, startMs) + + for site in sites[: limits.maxSites]: + if result.indexed + result.skippedDuplicate >= limits.maxItems: + break + sitePath = getattr(site, "path", "") or "" + try: + await _walkFolder( + adapter=adapter, + knowledgeService=knowledgeService, + runExtractionFn=runExtractionFn, + connectionId=connectionId, + mandateId=mandateId, + userId=userId, + folderPath=sitePath, + depth=0, + limits=limits, + result=result, + progressCb=progressCb, + ) + except Exception as exc: + logger.error("sharepoint walk failed for site %s: %s", sitePath, exc, exc_info=True) + result.errors.append(f"walk({sitePath}): {exc}") + + return _finalizeResult(connectionId, result, startMs) + + +async def _resolveDependencies(connectionId: str): + """Load connection, instantiate SharepointAdapter, and build a KnowledgeService. + + Runs with root privileges: bootstrap is a system operation triggered by an + authenticated user via callback; it must not be gated by a per-user + service-center context. 
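+
+    Returns the tuple `(adapter, connection, knowledgeService)` that
+    `bootstrapSharepoint` and its helpers consume.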
+ """ + from modules.interfaces.interfaceDbApp import getRootInterface + from modules.auth import TokenManager + from modules.connectors.providerMsft.connectorMsft import MsftConnector + from modules.serviceCenter import getService + from modules.serviceCenter.context import ServiceCenterContext + from modules.security.rootAccess import getRootUser + + rootInterface = getRootInterface() + connection = rootInterface.getUserConnectionById(connectionId) + if connection is None: + raise ValueError(f"UserConnection not found: {connectionId}") + + token = TokenManager().getFreshToken(connectionId) + if not token or not token.tokenAccess: + raise ValueError(f"No valid token for connection {connectionId}") + + provider = MsftConnector(connection, token.tokenAccess) + adapter = provider.getServiceAdapter("sharepoint") + + rootUser = getRootUser() + ctx = ServiceCenterContext( + user=rootUser, + mandate_id=str(getattr(connection, "mandateId", "") or ""), + ) + knowledgeService = getService("knowledge", ctx) + return adapter, connection, knowledgeService + + +async def _walkFolder( + *, + adapter, + knowledgeService, + runExtractionFn, + connectionId: str, + mandateId: str, + userId: str, + folderPath: str, + depth: int, + limits: SharepointBootstrapLimits, + result: SharepointBootstrapResult, + progressCb: Optional[Callable[[int, Optional[str]], None]], +) -> None: + if depth > limits.maxDepth: + return + try: + entries = await adapter.browse(folderPath) + except Exception as exc: + logger.warning("sharepoint browse %s failed: %s", folderPath, exc) + result.errors.append(f"browse({folderPath}): {exc}") + return + + for entry in entries: + if result.indexed + result.skippedDuplicate >= limits.maxItems: + return + if result.bytesProcessed >= limits.maxBytes: + return + + entryPath = getattr(entry, "path", "") or "" + if getattr(entry, "isFolder", False): + await _walkFolder( + adapter=adapter, + knowledgeService=knowledgeService, + runExtractionFn=runExtractionFn, + connectionId=connectionId, + mandateId=mandateId, + userId=userId, + folderPath=entryPath, + depth=depth + 1, + limits=limits, + result=result, + progressCb=progressCb, + ) + continue + + mimeType = getattr(entry, "mimeType", None) or "application/octet-stream" + if any(mimeType.startswith(prefix) for prefix in limits.skipMimePrefixes): + result.skippedPolicy += 1 + continue + size = int(getattr(entry, "size", 0) or 0) + if size and size > limits.maxFileSize: + result.skippedPolicy += 1 + continue + + metadata = getattr(entry, "metadata", {}) or {} + externalItemId = metadata.get("id") or entryPath + revision = metadata.get("revision") or metadata.get("lastModifiedDateTime") + + await _ingestOne( + adapter=adapter, + knowledgeService=knowledgeService, + runExtractionFn=runExtractionFn, + connectionId=connectionId, + mandateId=mandateId, + userId=userId, + entry=entry, + entryPath=entryPath, + mimeType=mimeType, + externalItemId=externalItemId, + revision=revision, + limits=limits, + result=result, + progressCb=progressCb, + ) + + +async def _ingestOne( + *, + adapter, + knowledgeService, + runExtractionFn, + connectionId: str, + mandateId: str, + userId: str, + entry, + entryPath: str, + mimeType: str, + externalItemId: str, + revision: Optional[str], + limits: SharepointBootstrapLimits, + result: SharepointBootstrapResult, + progressCb: Optional[Callable[[int, Optional[str]], None]], +) -> None: + from modules.serviceCenter.services.serviceKnowledge.mainServiceKnowledge import IngestionJob + + syntheticFileId = 
_syntheticFileId(connectionId, externalItemId) + fileName = getattr(entry, "name", "") or externalItemId + + try: + fileBytes = await adapter.download(entryPath) + except Exception as exc: + logger.warning("sharepoint download %s failed: %s", entryPath, exc) + result.failed += 1 + result.errors.append(f"download({entryPath}): {exc}") + return + if not fileBytes: + result.failed += 1 + return + + result.bytesProcessed += len(fileBytes) + + try: + extracted = runExtractionFn( + fileBytes, fileName, mimeType, + ExtractionOptions(mergeStrategy=None), + ) + except Exception as exc: + logger.warning("sharepoint extraction %s failed: %s", entryPath, exc) + result.failed += 1 + result.errors.append(f"extract({entryPath}): {exc}") + return + + contentObjects = _toContentObjects(extracted, fileName) + if not contentObjects: + result.skippedPolicy += 1 + return + + provenance: Dict[str, Any] = { + "connectionId": connectionId, + "authority": "msft", + "service": "sharepoint", + "externalItemId": externalItemId, + "externalPath": entryPath, + "revision": revision, + } + try: + handle = await knowledgeService.requestIngestion( + IngestionJob( + sourceKind="sharepoint_item", + sourceId=syntheticFileId, + fileName=fileName, + mimeType=mimeType, + userId=userId, + mandateId=mandateId, + contentObjects=contentObjects, + contentVersion=revision, + neutralize=limits.neutralize, + provenance=provenance, + ) + ) + except Exception as exc: + logger.error("sharepoint ingestion %s failed: %s", entryPath, exc, exc_info=True) + result.failed += 1 + result.errors.append(f"ingest({entryPath}): {exc}") + return + + if handle.status == "duplicate": + result.skippedDuplicate += 1 + elif handle.status == "indexed": + result.indexed += 1 + else: + result.failed += 1 + if handle.error: + result.errors.append(f"ingest({entryPath}): {handle.error}") + + if progressCb is not None and (result.indexed + result.skippedDuplicate) % 50 == 0: + processed = result.indexed + result.skippedDuplicate + try: + progressCb( + min(90, 10 + int(80 * processed / max(1, limits.maxItems))), + f"sharepoint processed={processed}", + ) + except Exception: + pass + logger.info( + "ingestion.connection.bootstrap.progress part=sharepoint processed=%d skippedDup=%d failed=%d", + processed, result.skippedDuplicate, result.failed, + extra={ + "event": "ingestion.connection.bootstrap.progress", + "part": "sharepoint", + "connectionId": connectionId, + "processed": processed, + "skippedDup": result.skippedDuplicate, + "failed": result.failed, + }, + ) + + # Yield so the event loop can interleave other tasks (download/extract are + # CPU-ish and extraction uses sync libs; cooperative scheduling prevents + # starving other workers). 
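+    # Possible refinement (sketch only, not implemented here): offload the
+    # synchronous extraction with `await asyncio.to_thread(runExtractionFn, ...)`
+    # so very large files cannot monopolise the loop between these yield points.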
+ await asyncio.sleep(0) + + +def _finalizeResult(connectionId: str, result: SharepointBootstrapResult, startMs: float) -> Dict[str, Any]: + durationMs = int((time.time() - startMs) * 1000) + logger.info( + "ingestion.connection.bootstrap.done part=sharepoint connectionId=%s indexed=%d skippedDup=%d skippedPolicy=%d failed=%d durationMs=%d", + connectionId, + result.indexed, result.skippedDuplicate, result.skippedPolicy, result.failed, + durationMs, + extra={ + "event": "ingestion.connection.bootstrap.done", + "part": "sharepoint", + "connectionId": connectionId, + "indexed": result.indexed, + "skippedDup": result.skippedDuplicate, + "skippedPolicy": result.skippedPolicy, + "failed": result.failed, + "durationMs": durationMs, + }, + ) + return { + "connectionId": result.connectionId, + "indexed": result.indexed, + "skippedDuplicate": result.skippedDuplicate, + "skippedPolicy": result.skippedPolicy, + "failed": result.failed, + "bytesProcessed": result.bytesProcessed, + "durationMs": durationMs, + "errors": result.errors[:20], + } diff --git a/modules/serviceCenter/services/serviceKnowledge/subTextClean.py b/modules/serviceCenter/services/serviceKnowledge/subTextClean.py new file mode 100644 index 00000000..2d352cfa --- /dev/null +++ b/modules/serviceCenter/services/serviceKnowledge/subTextClean.py @@ -0,0 +1,107 @@ +# Copyright (c) 2025 Patrick Motsch +# All rights reserved. +"""Text normalisation utilities used by knowledge ingestion. + +The email body cleaning logic is intentionally regex-based and works on plain +text after an HTML→text pass so we never store unsanitised HTML/JS in the +knowledge store and retrieval stays robust (no extraneous markup tokens +eating embedding budget). +""" + +from __future__ import annotations + +import re +from typing import Optional + +DEFAULT_MAX_CHARS = 8000 + + +_QUOTE_MARKER_PATTERNS = [ + re.compile(r"^\s*(?:On\s.+?\swrote:)\s*$", re.MULTILINE | re.IGNORECASE), + re.compile(r"^\s*(?:Am\s.+?\sschrieb.+?:)\s*$", re.MULTILINE | re.IGNORECASE), + re.compile(r"^\s*-{2,}\s*Original\s*Message\s*-{2,}\s*$", re.MULTILINE | re.IGNORECASE), + re.compile(r"^\s*-{2,}\s*Urspr.+Nachricht\s*-{2,}\s*$", re.MULTILINE | re.IGNORECASE), + re.compile(r"^\s*From:\s+.+$", re.MULTILINE | re.IGNORECASE), + re.compile(r"^\s*Von:\s+.+$", re.MULTILINE | re.IGNORECASE), + re.compile(r"^\s*Sent:\s+.+$", re.MULTILINE | re.IGNORECASE), + re.compile(r"^\s*Gesendet:\s+.+$", re.MULTILINE | re.IGNORECASE), +] + +_SIGNATURE_MARKERS = [ + re.compile(r"^\s*-{2,}\s*$", re.MULTILINE), + re.compile(r"^\s*—\s*$", re.MULTILINE), + re.compile(r"^\s*Best regards\b.*$", re.MULTILINE | re.IGNORECASE), + re.compile(r"^\s*Kind regards\b.*$", re.MULTILINE | re.IGNORECASE), + re.compile(r"^\s*Mit freundlichen Gr[üu]ßen\b.*$", re.MULTILINE | re.IGNORECASE), + re.compile(r"^\s*Viele Gr[üu]ße\b.*$", re.MULTILINE | re.IGNORECASE), + re.compile(r"^\s*Best,\s*$", re.MULTILINE | re.IGNORECASE), +] + + +def _htmlToText(html: str) -> str: + """Prefer BeautifulSoup when available, fall back to regex.""" + try: + from bs4 import BeautifulSoup # type: ignore + + soup = BeautifulSoup(html, "html.parser") + for tag in soup(["script", "style", "head"]): + tag.decompose() + for br in soup.find_all(["br"]): + br.replace_with("\n") + for p in soup.find_all(["p", "div", "li", "tr"]): + p.append("\n") + text = soup.get_text() + except Exception: + # Minimal fallback: strip tags crudely. 
+        # <br> and common block-closing tags become newlines; any remaining
+        # tag is dropped outright.
+        text = re.sub(r"<\s*br\s*/?\s*>", "\n", html, flags=re.IGNORECASE)
+        text = re.sub(r"<\s*/\s*(?:p|div|li|tr)\s*>", "\n", text, flags=re.IGNORECASE)
+        text = re.sub(r"<[^>]+>", "", text)
+    # Collapse non-breaking + zero-width whitespace.
+    text = text.replace("\u00a0", " ").replace("\u200b", "")
+    return text
+
+
+def _stripQuotedThread(text: str) -> str:
+    """Remove reply-chain content so only the author's own contribution remains."""
+    earliest = len(text)
+    for pattern in _QUOTE_MARKER_PATTERNS:
+        match = pattern.search(text)
+        if match and match.start() < earliest:
+            earliest = match.start()
+    # Drop any block starting with "> " quoted lines (often Gmail/Thunderbird).
+    quotedBlock = re.search(r"^(?:\s*>.*\n?)+", text, re.MULTILINE)
+    if quotedBlock and quotedBlock.start() < earliest:
+        earliest = quotedBlock.start()
+    return text[:earliest].rstrip()
+
+
+def _stripSignature(text: str) -> str:
+    earliest = len(text)
+    for pattern in _SIGNATURE_MARKERS:
+        match = pattern.search(text)
+        if match and match.start() < earliest:
+            earliest = match.start()
+    return text[:earliest].rstrip()
+
+
+def _collapseWhitespace(text: str) -> str:
+    text = re.sub(r"[ \t]+", " ", text)
+    text = re.sub(r"\n{3,}", "\n\n", text)
+    return text.strip()
+
+
+def cleanEmailBody(html: str, maxChars: Optional[int] = DEFAULT_MAX_CHARS) -> str:
+    """Return a compact plain-text view of an email body suitable for embedding.
+
+    Steps: HTML → text, remove quoted reply chain, remove signature, collapse
+    whitespace, truncate to maxChars. Always returns a string (possibly empty).
+    """
+    if not html:
+        return ""
+    text = _htmlToText(html) if "<" in html and ">" in html else html
+    text = _stripQuotedThread(text)
+    text = _stripSignature(text)
+    text = _collapseWhitespace(text)
+    if maxChars and len(text) > maxChars:
+        text = text[:maxChars].rstrip() + "…"
+    return text
diff --git a/tests/unit/services/test_bootstrap_clickup.py b/tests/unit/services/test_bootstrap_clickup.py
new file mode 100644
index 00000000..87c08c3d
--- /dev/null
+++ b/tests/unit/services/test_bootstrap_clickup.py
@@ -0,0 +1,203 @@
+#!/usr/bin/env python3
+# Copyright (c) 2025 Patrick Motsch
+# All rights reserved.
+"""Bootstrap ClickUp tests with a fake service + knowledge service.
+
+Verifies:
+- Teams → spaces → lists (folderless + folder-based) → tasks traversal.
+- Each task produces a `requestIngestion` call with `sourceKind="clickup_task"`
+  and header + description content-objects.
+- `date_updated` is forwarded as contentVersion → idempotency.
+- Recency filter drops tasks older than `maxAgeDays`.
+- maxWorkspaces / maxListsPerWorkspace / maxTasks caps are respected.
+""" + +import asyncio +import os +import sys +import time +from types import SimpleNamespace + +sys.path.insert(0, os.path.join(os.path.dirname(__file__), "../../..")) + +from modules.serviceCenter.services.serviceKnowledge.subConnectorSyncClickup import ( + bootstrapClickup, + ClickupBootstrapLimits, + _syntheticTaskId, +) + + +def _nowMs(offsetDays: int = 0) -> str: + return str(int((time.time() + offsetDays * 86400) * 1000)) + + +class _FakeClickupService: + """Records API calls; serves a canned 1-team / 1-space / 1-list / 2-task layout.""" + + def __init__(self, taskCount=2, oldTask=False): + self._taskCount = taskCount + self._oldTask = oldTask # when True, the second task is 400 days old + self.calls = [] + + async def getAuthorizedTeams(self): + self.calls.append(("getAuthorizedTeams",)) + return {"teams": [{"id": "team-1", "name": "Acme"}]} + + async def getSpaces(self, team_id: str): + self.calls.append(("getSpaces", team_id)) + return {"spaces": [{"id": "space-1", "name": "Engineering"}]} + + async def getFolderlessLists(self, space_id: str): + self.calls.append(("getFolderlessLists", space_id)) + return {"lists": [{"id": "list-1", "name": "Sprint 1"}]} + + async def getFolders(self, space_id: str): + self.calls.append(("getFolders", space_id)) + return {"folders": [{"id": "folder-1", "name": "Subproject"}]} + + async def getListsInFolder(self, folder_id: str): + self.calls.append(("getListsInFolder", folder_id)) + return {"lists": [{"id": "list-2", "name": "Sub-tasks"}]} + + async def getTasksInList(self, list_id: str, *, page=0, include_closed=False, subtasks=True): + self.calls.append(("getTasksInList", list_id, page, include_closed)) + if page > 0: + return {"tasks": []} + tasks = [] + for i in range(self._taskCount): + tid = f"{list_id}-task-{i}" + offsetDays = -400 if (self._oldTask and i == 1) else 0 + tasks.append({ + "id": tid, + "name": f"Task {i} of {list_id}", + "description": f"Plain description for task {i}", + "text_content": f"Rich content for task {i}", + "status": {"status": "open" if i == 0 else "closed"}, + "assignees": [{"username": "alice"}], + "tags": [{"name": "urgent"}], + "date_updated": _nowMs(offsetDays), + "date_created": _nowMs(-1), + "url": f"https://app.clickup.com/t/{tid}", + }) + return {"tasks": tasks} + + +class _FakeKnowledgeService: + def __init__(self, duplicateIds=None): + self.calls = [] + self._duplicates = duplicateIds or set() + + async def requestIngestion(self, job): + self.calls.append(job) + status = "duplicate" if job.sourceId in self._duplicates else "indexed" + return SimpleNamespace( + jobId=job.sourceId, status=status, contentHash="h", + fileId=job.sourceId, index=None, error=None, + ) + + +def _adapter(svc): + return SimpleNamespace(_svc=svc) + + +def test_bootstrap_walks_team_space_lists_and_tasks(): + svc = _FakeClickupService(taskCount=2) + knowledge = _FakeKnowledgeService() + connection = SimpleNamespace(mandateId="m1", userId="u1") + + async def _run(): + return await bootstrapClickup( + connectionId="c1", + adapter=_adapter(svc), + connection=connection, + knowledgeService=knowledge, + limits=ClickupBootstrapLimits(maxAgeDays=None), + ) + + result = asyncio.run(_run()) + # 2 lists (folderless list-1 + folder's list-2) × 2 tasks each = 4 tasks + assert result["indexed"] == 4 + assert result["workspaces"] == 1 + assert result["lists"] == 2 + sourceIds = {c.sourceId for c in knowledge.calls} + assert len(sourceIds) == 4 + for job in knowledge.calls: + assert job.sourceKind == "clickup_task" + assert job.mimeType == 
"application/vnd.clickup.task+json" + assert job.mandateId == "m1" + assert job.provenance["connectionId"] == "c1" + assert job.provenance["authority"] == "clickup" + assert job.provenance["teamId"] == "team-1" + assert job.contentVersion # numeric millisecond string + # At least the header content-object is present. + ids = [co["contentObjectId"] for co in job.contentObjects] + assert "header" in ids + + +def test_bootstrap_reports_duplicates_on_second_run(): + svc = _FakeClickupService(taskCount=1) + duplicates = { + _syntheticTaskId("c1", "list-1-task-0"), + _syntheticTaskId("c1", "list-2-task-0"), + } + knowledge = _FakeKnowledgeService(duplicateIds=duplicates) + connection = SimpleNamespace(mandateId="m1", userId="u1") + + async def _run(): + return await bootstrapClickup( + connectionId="c1", + adapter=_adapter(svc), + connection=connection, + knowledgeService=knowledge, + limits=ClickupBootstrapLimits(maxAgeDays=None), + ) + + result = asyncio.run(_run()) + assert result["indexed"] == 0 + assert result["skippedDuplicate"] == 2 + + +def test_bootstrap_skips_tasks_older_than_maxAgeDays(): + svc = _FakeClickupService(taskCount=2, oldTask=True) + knowledge = _FakeKnowledgeService() + connection = SimpleNamespace(mandateId="m1", userId="u1") + + async def _run(): + return await bootstrapClickup( + connectionId="c1", + adapter=_adapter(svc), + connection=connection, + knowledgeService=knowledge, + limits=ClickupBootstrapLimits(maxAgeDays=180), + ) + + result = asyncio.run(_run()) + # 2 lists × (1 recent + 1 skipped old) = 2 indexed + 2 skippedPolicy + assert result["indexed"] == 2 + assert result["skippedPolicy"] == 2 + + +def test_bootstrap_maxTasks_caps_ingestion(): + svc = _FakeClickupService(taskCount=2) + knowledge = _FakeKnowledgeService() + connection = SimpleNamespace(mandateId="m1", userId="u1") + + async def _run(): + return await bootstrapClickup( + connectionId="c1", + adapter=_adapter(svc), + connection=connection, + knowledgeService=knowledge, + limits=ClickupBootstrapLimits(maxAgeDays=None, maxTasks=3), + ) + + result = asyncio.run(_run()) + assert result["indexed"] == 3 + + +if __name__ == "__main__": + test_bootstrap_walks_team_space_lists_and_tasks() + test_bootstrap_reports_duplicates_on_second_run() + test_bootstrap_skips_tasks_older_than_maxAgeDays() + test_bootstrap_maxTasks_caps_ingestion() + print("OK — bootstrapClickup tests passed") diff --git a/tests/unit/services/test_bootstrap_gdrive.py b/tests/unit/services/test_bootstrap_gdrive.py new file mode 100644 index 00000000..1b88677e --- /dev/null +++ b/tests/unit/services/test_bootstrap_gdrive.py @@ -0,0 +1,225 @@ +#!/usr/bin/env python3 +# Copyright (c) 2025 Patrick Motsch +# All rights reserved. +"""Bootstrap Google Drive tests with a fake adapter + knowledge service. + +Verifies: +- Drive walk traverses root → subfolders, respecting `maxDepth`. +- Every file triggers `requestIngestion` with `sourceKind="gdrive_item"`. +- Duplicate runs (same modifiedTime revision) report `skippedDuplicate`. +- Provenance carries `authority="google"` and the Drive file id. +- Recency filter skips files older than `maxAgeDays`. 
+""" + +import asyncio +import os +import sys +from dataclasses import dataclass +from datetime import datetime, timedelta, timezone +from types import SimpleNamespace +from typing import Any, Dict, List, Optional + +sys.path.insert(0, os.path.join(os.path.dirname(__file__), "../../..")) + +from modules.serviceCenter.services.serviceKnowledge.subConnectorSyncGdrive import ( + bootstrapGdrive, + GdriveBootstrapLimits, + _syntheticFileId, +) + + +@dataclass +class _ExtEntry: + name: str + path: str + isFolder: bool = False + size: Optional[int] = None + mimeType: Optional[str] = None + metadata: Dict[str, Any] = None + + +def _today_iso(offsetDays: int = 0) -> str: + return (datetime.now(timezone.utc) + timedelta(days=offsetDays)).strftime("%Y-%m-%dT%H:%M:%SZ") + + +class _FakeDriveAdapter: + """Minimal DriveAdapter stand-in. + + Layout: + "/" (root) → 2 files + 1 folder (sub) + "/sub_id" → 1 file + """ + + def __init__(self, recent_only: bool = True): + self.downloaded: List[str] = [] + self._recent = _today_iso(0) + self._old = _today_iso(-400) + self._recent_only = recent_only + + async def browse(self, path: str, filter=None, limit=None): + if path in ("/", "", "root"): + return [ + _ExtEntry( + name="f1.txt", path="/f1", size=20, + mimeType="text/plain", + metadata={"id": "f1", "modifiedTime": self._recent}, + ), + _ExtEntry( + name="f2.txt", path="/f2", size=20, + mimeType="text/plain", + metadata={"id": "f2", "modifiedTime": self._recent if self._recent_only else self._old}, + ), + _ExtEntry( + name="Subfolder", path="/sub_id", isFolder=True, + mimeType="application/vnd.google-apps.folder", + metadata={"id": "sub_id", "modifiedTime": self._recent}, + ), + ] + if path == "/sub_id": + return [ + _ExtEntry( + name="f3.txt", path="/f3", size=20, + mimeType="text/plain", + metadata={"id": "f3", "modifiedTime": self._recent}, + ), + ] + return [] + + async def download(self, path: str) -> bytes: + self.downloaded.append(path) + return path.encode("utf-8") + + +class _FakeKnowledgeService: + def __init__(self, duplicateIds=None): + self.calls: List[SimpleNamespace] = [] + self._duplicateIds = duplicateIds or set() + + async def requestIngestion(self, job): + self.calls.append(job) + status = "duplicate" if job.sourceId in self._duplicateIds else "indexed" + return SimpleNamespace( + jobId=f"{job.sourceKind}:{job.sourceId}", + status=status, contentHash="h", + fileId=job.sourceId, index=None, error=None, + ) + + +def _fakeRunExtraction(data, name, mime, options): + return SimpleNamespace( + parts=[ + SimpleNamespace( + id="p1", + data=data.decode("utf-8") if isinstance(data, bytes) else str(data), + typeGroup="text", + label="page:1", + metadata={"pageIndex": 0}, + ) + ] + ) + + +def test_bootstrap_walks_drive_and_subfolders(): + adapter = _FakeDriveAdapter() + knowledge = _FakeKnowledgeService() + connection = SimpleNamespace(mandateId="m1", userId="u1") + + async def _run(): + return await bootstrapGdrive( + connectionId="c1", + adapter=adapter, + connection=connection, + knowledgeService=knowledge, + runExtractionFn=_fakeRunExtraction, + limits=GdriveBootstrapLimits(maxAgeDays=None), + ) + + result = asyncio.run(_run()) + assert len(knowledge.calls) == 3 + sourceIds = {c.sourceId for c in knowledge.calls} + assert sourceIds == { + _syntheticFileId("c1", "f1"), + _syntheticFileId("c1", "f2"), + _syntheticFileId("c1", "f3"), + } + assert result["indexed"] == 3 + assert result["skippedDuplicate"] == 0 + assert adapter.downloaded == ["/f1", "/f2", "/f3"] + + +def 
test_bootstrap_reports_duplicates_on_second_run(): + adapter = _FakeDriveAdapter() + duplicateIds = { + _syntheticFileId("c1", "f1"), + _syntheticFileId("c1", "f2"), + _syntheticFileId("c1", "f3"), + } + knowledge = _FakeKnowledgeService(duplicateIds=duplicateIds) + connection = SimpleNamespace(mandateId="m1", userId="u1") + + async def _run(): + return await bootstrapGdrive( + connectionId="c1", + adapter=adapter, + connection=connection, + knowledgeService=knowledge, + runExtractionFn=_fakeRunExtraction, + limits=GdriveBootstrapLimits(maxAgeDays=None), + ) + + result = asyncio.run(_run()) + assert result["indexed"] == 0 + assert result["skippedDuplicate"] == 3 + + +def test_bootstrap_skips_files_older_than_maxAgeDays(): + adapter = _FakeDriveAdapter(recent_only=False) # f2 is 400 days old + knowledge = _FakeKnowledgeService() + connection = SimpleNamespace(mandateId="m1", userId="u1") + + async def _run(): + return await bootstrapGdrive( + connectionId="c1", + adapter=adapter, + connection=connection, + knowledgeService=knowledge, + runExtractionFn=_fakeRunExtraction, + limits=GdriveBootstrapLimits(maxAgeDays=180), + ) + + result = asyncio.run(_run()) + assert result["indexed"] == 2 # f1, f3 + assert result["skippedPolicy"] == 1 # f2 filtered out + + +def test_bootstrap_passes_connection_provenance(): + adapter = _FakeDriveAdapter() + knowledge = _FakeKnowledgeService() + connection = SimpleNamespace(mandateId="m1", userId="u1") + + async def _run(): + return await bootstrapGdrive( + connectionId="c1", + adapter=adapter, + connection=connection, + knowledgeService=knowledge, + runExtractionFn=_fakeRunExtraction, + limits=GdriveBootstrapLimits(maxAgeDays=None), + ) + + asyncio.run(_run()) + for job in knowledge.calls: + assert job.sourceKind == "gdrive_item" + assert job.mandateId == "m1" + assert job.provenance["connectionId"] == "c1" + assert job.provenance["authority"] == "google" + assert job.provenance["service"] == "drive" + assert job.contentVersion # modifiedTime ISO string + + +if __name__ == "__main__": + test_bootstrap_walks_drive_and_subfolders() + test_bootstrap_reports_duplicates_on_second_run() + test_bootstrap_skips_files_older_than_maxAgeDays() + test_bootstrap_passes_connection_provenance() + print("OK — bootstrapGdrive tests passed") diff --git a/tests/unit/services/test_bootstrap_gmail.py b/tests/unit/services/test_bootstrap_gmail.py new file mode 100644 index 00000000..4f7cfe4d --- /dev/null +++ b/tests/unit/services/test_bootstrap_gmail.py @@ -0,0 +1,240 @@ +#!/usr/bin/env python3 +# Copyright (c) 2025 Patrick Motsch +# All rights reserved. +"""Bootstrap Gmail tests with a fake googleGet + knowledge service. + +Verifies: +- Default labels (INBOX + SENT) are traversed. +- Each message produces a requestIngestion call with sourceKind=gmail_message + and structured contentObjects (header / snippet / body). +- Pagination via `nextPageToken` is followed. +- historyId is forwarded as contentVersion → idempotency. +- MIME body extraction walks nested parts (multipart/alternative). 
+""" + +import asyncio +import base64 +import os +import sys +from types import SimpleNamespace + +sys.path.insert(0, os.path.join(os.path.dirname(__file__), "../../..")) + +from modules.serviceCenter.services.serviceKnowledge.subConnectorSyncGmail import ( + bootstrapGmail, + GmailBootstrapLimits, + _syntheticMessageId, + _buildContentObjects, + _walkPayloadForBody, +) + + +def _b64url(text: str) -> str: + return base64.urlsafe_b64encode(text.encode("utf-8")).decode("ascii").rstrip("=") + + +def _msg(mid: str, subject: str = "Hi", body: str = "Hello world", historyId: str = "h1"): + return { + "id": mid, + "threadId": f"thread-{mid}", + "historyId": historyId, + "internalDate": "1700000000000", + "snippet": body[:120], + "payload": { + "headers": [ + {"name": "Subject", "value": subject}, + {"name": "From", "value": "Alice "}, + {"name": "To", "value": "Bob "}, + {"name": "Date", "value": "Tue, 01 Jan 2025 10:00:00 +0000"}, + ], + "mimeType": "text/plain", + "body": {"data": _b64url(body), "size": len(body)}, + "parts": [], + }, + } + + +class _FakeGoogleGet: + """Records URLs + returns the wired-up page or message response.""" + + def __init__(self, messages_by_label, paginated_label=None, page2=None): + self._messages = messages_by_label + self._paginated = paginated_label + self._page2 = page2 or [] + self._served_first_page = set() + self.requested = [] + + async def __call__(self, url: str): + self.requested.append(url) + # List page: contains `/users/me/messages?labelIds=...` + if "/users/me/messages?" in url: + for label, msgs in self._messages.items(): + if f"labelIds={label}" in url: + if ( + label == self._paginated + and label not in self._served_first_page + ): + self._served_first_page.add(label) + return { + "messages": [{"id": m["id"]} for m in msgs], + "nextPageToken": "token-2", + } + if label == self._paginated and "pageToken=token-2" in url: + return { + "messages": [{"id": m["id"]} for m in self._page2], + } + return {"messages": [{"id": m["id"]} for m in msgs]} + return {"messages": []} + # Detail fetch: /users/me/messages/{id}?format=full + if "/users/me/messages/" in url and "format=full" in url: + msgId = url.split("/users/me/messages/")[-1].split("?")[0] + for msgs in self._messages.values(): + for m in msgs: + if m["id"] == msgId: + return m + for m in self._page2: + if m["id"] == msgId: + return m + return {"error": "not found"} + + +class _FakeKnowledgeService: + def __init__(self, duplicateIds=None): + self.calls = [] + self._duplicates = duplicateIds or set() + + async def requestIngestion(self, job): + self.calls.append(job) + status = "duplicate" if job.sourceId in self._duplicates else "indexed" + return SimpleNamespace( + jobId=job.sourceId, status=status, contentHash="h", + fileId=job.sourceId, index=None, error=None, + ) + + +def test_buildContentObjects_emits_header_snippet_body(): + parts = _buildContentObjects(_msg("m1", body="Hello\nWorld"), maxBodyChars=8000) + ids = [p["contentObjectId"] for p in parts] + assert ids == ["header", "snippet", "body"] + header = parts[0]["data"] + assert "Subject: Hi" in header + assert "From: Alice " in header + assert "To: Bob " in header + + +def test_walkPayloadForBody_prefers_plain_over_html(): + payload = { + "mimeType": "multipart/alternative", + "parts": [ + {"mimeType": "text/plain", "body": {"data": _b64url("plain body")}}, + {"mimeType": "text/html", "body": {"data": _b64url("

<p>html body</p>")}},
+        ],
+    }
+    bodies = _walkPayloadForBody(payload)
+    assert bodies["text"] == "plain body"
+    assert bodies["html"] == "<p>html body</p>"
+
+
+def test_walkPayloadForBody_falls_back_to_html():
+    payload = {
+        "mimeType": "multipart/alternative",
+        "parts": [
+            {"mimeType": "text/html", "body": {"data": _b64url("<p>only html</p>
")}}, + ], + } + bodies = _walkPayloadForBody(payload) + assert bodies["text"] == "" + assert "only html" in bodies["html"] + + +def test_bootstrap_gmail_indexes_messages_from_inbox_and_sent(): + fake_get = _FakeGoogleGet({ + "INBOX": [_msg("m1"), _msg("m2")], + "SENT": [_msg("m3")], + }) + knowledge = _FakeKnowledgeService() + connection = SimpleNamespace(mandateId="m1", userId="u1") + + async def _run(): + return await bootstrapGmail( + connectionId="c1", + adapter=SimpleNamespace(_token="t"), + connection=connection, + knowledgeService=knowledge, + limits=GmailBootstrapLimits(maxAgeDays=None), + googleGetFn=fake_get, + ) + + result = asyncio.run(_run()) + assert result["indexed"] == 3 + sourceIds = {c.sourceId for c in knowledge.calls} + assert sourceIds == { + _syntheticMessageId("c1", "m1"), + _syntheticMessageId("c1", "m2"), + _syntheticMessageId("c1", "m3"), + } + for job in knowledge.calls: + assert job.sourceKind == "gmail_message" + assert job.mimeType == "message/rfc822" + assert job.provenance["connectionId"] == "c1" + assert job.provenance["authority"] == "google" + assert job.provenance["service"] == "gmail" + assert job.contentVersion == "h1" + assert any(co["contentObjectId"] == "header" for co in job.contentObjects) + + +def test_bootstrap_gmail_follows_pagination(): + fake_get = _FakeGoogleGet( + messages_by_label={"INBOX": [_msg("m1")], "SENT": []}, + paginated_label="INBOX", + page2=[_msg("m2"), _msg("m3")], + ) + knowledge = _FakeKnowledgeService() + connection = SimpleNamespace(mandateId="m1", userId="u1") + + async def _run(): + return await bootstrapGmail( + connectionId="c1", + adapter=SimpleNamespace(_token="t"), + connection=connection, + knowledgeService=knowledge, + limits=GmailBootstrapLimits(maxAgeDays=None), + googleGetFn=fake_get, + ) + + result = asyncio.run(_run()) + assert result["indexed"] == 3 + + +def test_bootstrap_gmail_reports_duplicates(): + fake_get = _FakeGoogleGet({"INBOX": [_msg("m1"), _msg("m2")], "SENT": []}) + duplicates = { + _syntheticMessageId("c1", "m1"), + _syntheticMessageId("c1", "m2"), + } + knowledge = _FakeKnowledgeService(duplicateIds=duplicates) + connection = SimpleNamespace(mandateId="m1", userId="u1") + + async def _run(): + return await bootstrapGmail( + connectionId="c1", + adapter=SimpleNamespace(_token="t"), + connection=connection, + knowledgeService=knowledge, + limits=GmailBootstrapLimits(maxAgeDays=None), + googleGetFn=fake_get, + ) + + result = asyncio.run(_run()) + assert result["indexed"] == 0 + assert result["skippedDuplicate"] == 2 + + +if __name__ == "__main__": + test_buildContentObjects_emits_header_snippet_body() + test_walkPayloadForBody_prefers_plain_over_html() + test_walkPayloadForBody_falls_back_to_html() + test_bootstrap_gmail_indexes_messages_from_inbox_and_sent() + test_bootstrap_gmail_follows_pagination() + test_bootstrap_gmail_reports_duplicates() + print("OK — bootstrapGmail tests passed") diff --git a/tests/unit/services/test_bootstrap_outlook.py b/tests/unit/services/test_bootstrap_outlook.py new file mode 100644 index 00000000..26664eaa --- /dev/null +++ b/tests/unit/services/test_bootstrap_outlook.py @@ -0,0 +1,190 @@ +#!/usr/bin/env python3 +# Copyright (c) 2025 Patrick Motsch +# All rights reserved. +"""Bootstrap Outlook tests with a fake adapter + knowledge service. + +Verifies: +- Well-known folders (inbox, sentitems) are discovered via Graph. +- Each message produces a `requestIngestion` call with sourceKind=outlook_message + and structured contentObjects (header / snippet / body). 
+- Pagination via `@odata.nextLink` is followed. +- changeKey is forwarded as contentVersion → idempotency. +""" + +import asyncio +import os +import sys +from types import SimpleNamespace + +sys.path.insert(0, os.path.join(os.path.dirname(__file__), "../../..")) + +from modules.serviceCenter.services.serviceKnowledge.subConnectorSyncOutlook import ( + bootstrapOutlook, + OutlookBootstrapLimits, + _syntheticMessageId, + _buildContentObjects, +) + + +class _FakeOutlookAdapter: + def __init__(self, messages_by_folder, paginated_folder=None, page2=None): + self._folders = {"inbox": "INBOX-ID", "sentitems": "SENT-ID"} + self._messages = messages_by_folder + self._paginated_folder = paginated_folder + self._page2 = page2 or [] + self.requested_endpoints = [] + + async def _graphGet(self, endpoint: str): + self.requested_endpoints.append(endpoint) + if endpoint.startswith("me/mailFolders/") and "/messages" not in endpoint: + wellKnown = endpoint.split("/")[-1] + fid = self._folders.get(wellKnown) + if not fid: + return {"error": "not found"} + return {"id": fid, "displayName": wellKnown} + # message page request: e.g. me/mailFolders/INBOX-ID/messages?... + for fid, messages in self._messages.items(): + if f"me/mailFolders/{fid}/messages" in endpoint: + page = {"value": messages} + if fid == self._paginated_folder and "skiptoken" not in endpoint: + page["@odata.nextLink"] = ( + "https://graph.microsoft.com/v1.0/" + f"me/mailFolders/{fid}/messages?$skiptoken=abc" + ) + elif fid == self._paginated_folder and "skiptoken" in endpoint: + page = {"value": self._page2} + return page + return {"value": []} + + async def browse(self, path): + return [] + + +class _FakeKnowledgeService: + def __init__(self, duplicateIds=None): + self.calls = [] + self._duplicates = duplicateIds or set() + + async def requestIngestion(self, job): + self.calls.append(job) + status = "duplicate" if job.sourceId in self._duplicates else "indexed" + return SimpleNamespace( + jobId=job.sourceId, status=status, contentHash="h", + fileId=job.sourceId, index=None, error=None, + ) + + +def _msg(mid: str, subject: str = "Hi", change: str = "ck1"): + return { + "id": mid, + "subject": subject, + "from": {"emailAddress": {"name": "Alice", "address": "a@x.com"}}, + "toRecipients": [{"emailAddress": {"name": "Bob", "address": "b@x.com"}}], + "ccRecipients": [], + "receivedDateTime": "2025-01-01T10:00:00Z", + "bodyPreview": "Hello world", + "body": {"contentType": "text", "content": "Hello world\nThis is the body."}, + "internetMessageId": f"<{mid}@local>", + "hasAttachments": False, + "changeKey": change, + } + + +def test_buildContentObjects_emits_header_snippet_body(): + parts = _buildContentObjects(_msg("m1"), maxBodyChars=8000) + ids = [p["contentObjectId"] for p in parts] + assert ids == ["header", "snippet", "body"] + header = parts[0]["data"] + assert "Subject: Hi" in header + assert "From: Alice " in header + assert "To: Bob " in header + + +def test_bootstrap_outlook_indexes_messages_from_inbox_and_sent(): + adapter = _FakeOutlookAdapter({ + "INBOX-ID": [_msg("m1"), _msg("m2")], + "SENT-ID": [_msg("m3")], + }) + knowledge = _FakeKnowledgeService() + connection = SimpleNamespace(mandateId="m1", userId="u1") + + async def _run(): + return await bootstrapOutlook( + connectionId="c1", + adapter=adapter, + connection=connection, + knowledgeService=knowledge, + limits=OutlookBootstrapLimits(maxAgeDays=None), + ) + + result = asyncio.run(_run()) + assert result["indexed"] == 3 + sourceIds = {c.sourceId for c in knowledge.calls} + 
assert sourceIds == { + _syntheticMessageId("c1", "m1"), + _syntheticMessageId("c1", "m2"), + _syntheticMessageId("c1", "m3"), + } + for job in knowledge.calls: + assert job.sourceKind == "outlook_message" + assert job.mimeType == "message/rfc822" + assert job.provenance["connectionId"] == "c1" + assert job.provenance["service"] == "outlook" + assert job.contentVersion == "ck1" + assert any(co["contentObjectId"] == "header" for co in job.contentObjects) + + +def test_bootstrap_outlook_follows_pagination(): + adapter = _FakeOutlookAdapter( + messages_by_folder={"INBOX-ID": [_msg("m1")], "SENT-ID": []}, + paginated_folder="INBOX-ID", + page2=[_msg("m2"), _msg("m3")], + ) + knowledge = _FakeKnowledgeService() + connection = SimpleNamespace(mandateId="m1", userId="u1") + + async def _run(): + return await bootstrapOutlook( + connectionId="c1", + adapter=adapter, + connection=connection, + knowledgeService=knowledge, + limits=OutlookBootstrapLimits(maxAgeDays=None), + ) + + result = asyncio.run(_run()) + assert result["indexed"] == 3 + + +def test_bootstrap_outlook_reports_duplicates(): + adapter = _FakeOutlookAdapter({ + "INBOX-ID": [_msg("m1"), _msg("m2")], + "SENT-ID": [], + }) + duplicates = { + _syntheticMessageId("c1", "m1"), + _syntheticMessageId("c1", "m2"), + } + knowledge = _FakeKnowledgeService(duplicateIds=duplicates) + connection = SimpleNamespace(mandateId="m1", userId="u1") + + async def _run(): + return await bootstrapOutlook( + connectionId="c1", + adapter=adapter, + connection=connection, + knowledgeService=knowledge, + limits=OutlookBootstrapLimits(maxAgeDays=None), + ) + + result = asyncio.run(_run()) + assert result["indexed"] == 0 + assert result["skippedDuplicate"] == 2 + + +if __name__ == "__main__": + test_buildContentObjects_emits_header_snippet_body() + test_bootstrap_outlook_indexes_messages_from_inbox_and_sent() + test_bootstrap_outlook_follows_pagination() + test_bootstrap_outlook_reports_duplicates() + print("OK — bootstrapOutlook tests passed") diff --git a/tests/unit/services/test_bootstrap_sharepoint.py b/tests/unit/services/test_bootstrap_sharepoint.py new file mode 100644 index 00000000..8b011357 --- /dev/null +++ b/tests/unit/services/test_bootstrap_sharepoint.py @@ -0,0 +1,209 @@ +#!/usr/bin/env python3 +# Copyright (c) 2025 Patrick Motsch +# All rights reserved. +"""Bootstrap SharePoint tests with a fake adapter + knowledge service. + +Verifies: +- Every discovered file triggers `requestIngestion`. +- Duplicate runs (same eTag revisions) report `skippedDuplicate`. +- Synthetic fileIds are stable across runs so idempotency works end-to-end. +""" + +import asyncio +import os +import sys +from dataclasses import dataclass +from types import SimpleNamespace +from typing import Any, Dict, List, Optional + +sys.path.insert(0, os.path.join(os.path.dirname(__file__), "../../..")) + +from modules.serviceCenter.services.serviceKnowledge.subConnectorSyncSharepoint import ( + bootstrapSharepoint, + _syntheticFileId, +) + + +@dataclass +class _ExtEntry: + name: str + path: str + isFolder: bool = False + size: Optional[int] = None + mimeType: Optional[str] = None + metadata: Dict[str, Any] = None + + +class _FakeSpAdapter: + """Minimal SharepointAdapter stand-in. 
+ + Layout: + "/" → 1 site + "/sites/site-1" → 2 files (f1, f2) + 1 folder (sub) + "/sites/site-1/sub" → 1 file (f3) + """ + + def __init__(self): + self.downloaded: List[str] = [] + + async def browse(self, path: str, filter=None, limit=None): + if path == "/": + return [ + _ExtEntry( + name="Site 1", + path="/sites/site-1", + isFolder=True, + metadata={"id": "site-1"}, + ), + ] + if path == "/sites/site-1": + return [ + _ExtEntry( + name="f1.txt", path="/sites/site-1/f1.txt", + mimeType="text/plain", size=20, + metadata={"id": "f1", "revision": "etag-f1"}, + ), + _ExtEntry( + name="f2.txt", path="/sites/site-1/f2.txt", + mimeType="text/plain", size=20, + metadata={"id": "f2", "revision": "etag-f2"}, + ), + _ExtEntry( + name="sub", path="/sites/site-1/sub", + isFolder=True, metadata={"id": "sub"}, + ), + ] + if path == "/sites/site-1/sub": + return [ + _ExtEntry( + name="f3.txt", path="/sites/site-1/sub/f3.txt", + mimeType="text/plain", size=20, + metadata={"id": "f3", "revision": "etag-f3"}, + ), + ] + return [] + + async def download(self, path: str) -> bytes: + self.downloaded.append(path) + return path.encode("utf-8") + + +class _FakeKnowledgeService: + """Records requestIngestion calls and returns the scripted handles.""" + + def __init__(self, duplicateIds=None): + self.calls: List[SimpleNamespace] = [] + self._duplicateIds = duplicateIds or set() + + async def requestIngestion(self, job): + self.calls.append(job) + status = "duplicate" if job.sourceId in self._duplicateIds else "indexed" + return SimpleNamespace( + jobId=f"{job.sourceKind}:{job.sourceId}", + status=status, + contentHash="h", + fileId=job.sourceId, + index=None, + error=None, + ) + + +def _fakeRunExtraction(data, name, mime, options): + """Produce a single synthetic text part so `_toContentObjects` returns one.""" + return SimpleNamespace( + parts=[ + SimpleNamespace( + id="p1", + data=data.decode("utf-8") if isinstance(data, bytes) else str(data), + typeGroup="text", + label="page:1", + metadata={"pageIndex": 0}, + ) + ] + ) + + +def test_bootstrap_walks_sites_and_subfolders(): + adapter = _FakeSpAdapter() + knowledge = _FakeKnowledgeService() + connection = SimpleNamespace(mandateId="m1", userId="u1") + + async def _run(): + return await bootstrapSharepoint( + connectionId="c1", + adapter=adapter, + connection=connection, + knowledgeService=knowledge, + runExtractionFn=_fakeRunExtraction, + ) + + result = asyncio.run(_run()) + assert len(knowledge.calls) == 3 + sourceIds = {c.sourceId for c in knowledge.calls} + assert sourceIds == { + _syntheticFileId("c1", "f1"), + _syntheticFileId("c1", "f2"), + _syntheticFileId("c1", "f3"), + } + assert result["indexed"] == 3 + assert result["skippedDuplicate"] == 0 + assert adapter.downloaded == [ + "/sites/site-1/f1.txt", + "/sites/site-1/f2.txt", + "/sites/site-1/sub/f3.txt", + ] + + +def test_bootstrap_reports_duplicates_on_second_run(): + adapter = _FakeSpAdapter() + duplicateIds = { + _syntheticFileId("c1", "f1"), + _syntheticFileId("c1", "f2"), + _syntheticFileId("c1", "f3"), + } + knowledge = _FakeKnowledgeService(duplicateIds=duplicateIds) + connection = SimpleNamespace(mandateId="m1", userId="u1") + + async def _run(): + return await bootstrapSharepoint( + connectionId="c1", + adapter=adapter, + connection=connection, + knowledgeService=knowledge, + runExtractionFn=_fakeRunExtraction, + ) + + result = asyncio.run(_run()) + assert result["indexed"] == 0 + assert result["skippedDuplicate"] == 3 + + +def test_bootstrap_passes_connection_provenance(): + adapter = 
_FakeSpAdapter() + knowledge = _FakeKnowledgeService() + connection = SimpleNamespace(mandateId="m1", userId="u1") + + async def _run(): + return await bootstrapSharepoint( + connectionId="c1", + adapter=adapter, + connection=connection, + knowledgeService=knowledge, + runExtractionFn=_fakeRunExtraction, + ) + + asyncio.run(_run()) + for job in knowledge.calls: + assert job.sourceKind == "sharepoint_item" + assert job.mandateId == "m1" + assert job.provenance["connectionId"] == "c1" + assert job.provenance["authority"] == "msft" + assert job.provenance["service"] == "sharepoint" + assert job.contentVersion and job.contentVersion.startswith("etag-") + + +if __name__ == "__main__": + test_bootstrap_walks_sites_and_subfolders() + test_bootstrap_reports_duplicates_on_second_run() + test_bootstrap_passes_connection_provenance() + print("OK — bootstrapSharepoint tests passed") diff --git a/tests/unit/services/test_clean_email_body.py b/tests/unit/services/test_clean_email_body.py new file mode 100644 index 00000000..a3ee01df --- /dev/null +++ b/tests/unit/services/test_clean_email_body.py @@ -0,0 +1,110 @@ +#!/usr/bin/env python3 +# Copyright (c) 2025 Patrick Motsch +# All rights reserved. +"""Unit tests for cleanEmailBody. + +Covers: HTML→text normalisation, quoted-reply removal, signature removal, +whitespace collapse and truncation. The utility is used during Outlook +bootstrap; buggy cleaning would leak quoted threads / signatures into every +embedding. +""" + +import os +import sys + +sys.path.insert(0, os.path.join(os.path.dirname(__file__), "../../..")) + +from modules.serviceCenter.services.serviceKnowledge.subTextClean import ( + cleanEmailBody, +) + + +def test_strips_html_tags_and_scripts(): + html = ( + "" + "

<body><script>alert('x')</script><p>Hello <b>world</b></p></body>
" + "" + ) + cleaned = cleanEmailBody(html) + assert "Hello" in cleaned + assert "world" in cleaned + assert "<" not in cleaned + assert "alert" not in cleaned + + +def test_strips_quoted_reply_english(): + body = ( + "Actual answer from me.\n\n" + "On Mon, 1 Jan 2024 at 10:00, Someone wrote:\n" + "> Original question?\n" + "> Second line.\n" + ) + cleaned = cleanEmailBody(body) + assert "Actual answer" in cleaned + assert "Original question" not in cleaned + assert "wrote:" not in cleaned + + +def test_strips_quoted_reply_german(): + body = ( + "Meine Antwort.\n\n" + "Am 1. Januar 2024 um 10:00 schrieb Max Muster :\n" + "> Ursprüngliche Frage?\n" + ) + cleaned = cleanEmailBody(body) + assert "Meine Antwort" in cleaned + assert "Ursprüngliche Frage" not in cleaned + + +def test_strips_signature_after_dashes(): + body = ( + "Kurze Nachricht.\n" + "\n" + "--\n" + "Max Muster\n" + "Vorstand, Beispiel GmbH\n" + ) + cleaned = cleanEmailBody(body) + assert "Kurze Nachricht" in cleaned + assert "Beispiel GmbH" not in cleaned + + +def test_strips_signature_salutation_de(): + body = ( + "Die eigentliche Information steht hier.\n\n" + "Mit freundlichen Grüßen\n" + "Max Muster" + ) + cleaned = cleanEmailBody(body) + assert "eigentliche Information" in cleaned + assert "Max Muster" not in cleaned + + +def test_truncate_to_max_chars(): + body = "abc " * 5000 + cleaned = cleanEmailBody(body, maxChars=200) + assert len(cleaned) <= 201 # includes trailing ellipsis + + +def test_empty_input_returns_empty_string(): + assert cleanEmailBody("") == "" + assert cleanEmailBody(None) == "" # type: ignore[arg-type] + + +def test_collapses_whitespace(): + body = "A lot of spaces\n\n\n\nand blank lines" + cleaned = cleanEmailBody(body) + assert " " not in cleaned + assert "\n\n\n" not in cleaned + + +if __name__ == "__main__": + test_strips_html_tags_and_scripts() + test_strips_quoted_reply_english() + test_strips_quoted_reply_german() + test_strips_signature_after_dashes() + test_strips_signature_salutation_de() + test_truncate_to_max_chars() + test_empty_input_returns_empty_string() + test_collapses_whitespace() + print("OK — cleanEmailBody tests passed") diff --git a/tests/unit/services/test_connection_purge.py b/tests/unit/services/test_connection_purge.py new file mode 100644 index 00000000..c32cb5b3 --- /dev/null +++ b/tests/unit/services/test_connection_purge.py @@ -0,0 +1,119 @@ +#!/usr/bin/env python3 +# Copyright (c) 2025 Patrick Motsch +# All rights reserved. +"""Purge tests for KnowledgeObjects.deleteFileContentIndexByConnectionId. + +Ensures that a `connection.revoked` event wipes every FileContentIndex + chunk +linked to the given connectionId while leaving entries from other connections +(or upload-files with connectionId=None) intact. +""" + +import os +import sys + +sys.path.insert(0, os.path.join(os.path.dirname(__file__), "../../..")) + +from modules.datamodels.datamodelKnowledge import FileContentIndex, ContentChunk +from modules.interfaces.interfaceDbKnowledge import KnowledgeObjects + + +class _FakeDb: + """Minimal in-memory stand-in for ``KnowledgeObjects.db``. + + Supports just the subset of APIs that deleteFileContentIndexByConnectionId + touches: getRecordset(FileContentIndex|ContentChunk, ...) + recordDelete. 
+ """ + + def __init__(self): + self.indexRows: dict = {} + self.chunks: dict = {} + + def addIndex(self, row: dict) -> None: + self.indexRows[row["id"]] = row + + def addChunk(self, row: dict) -> None: + self.chunks[row["id"]] = row + + def getRecordset(self, modelClass, recordFilter=None, **_): + filter_ = recordFilter or {} + if modelClass is FileContentIndex: + rows = list(self.indexRows.values()) + elif modelClass is ContentChunk: + rows = list(self.chunks.values()) + else: + return [] + + def match(row): + for k, v in filter_.items(): + if row.get(k) != v: + return False + return True + + return [r for r in rows if match(r)] + + def recordDelete(self, modelClass, recordId): + if modelClass is FileContentIndex: + return self.indexRows.pop(recordId, None) is not None + if modelClass is ContentChunk: + return self.chunks.pop(recordId, None) is not None + return False + + +def _buildKnowledge(): + """Instantiate KnowledgeObjects without triggering the real DB bootstrap.""" + ko = KnowledgeObjects.__new__(KnowledgeObjects) + ko.currentUser = None + ko.userId = None + ko._scopeCache = {} + ko.db = _FakeDb() + return ko + + +def test_purge_by_connection_removes_only_matching_rows(): + ko = _buildKnowledge() + ko.db.addIndex({"id": "sp1", "connectionId": "cx", "mandateId": "m1", "sourceKind": "sharepoint_item"}) + ko.db.addIndex({"id": "sp2", "connectionId": "cx", "mandateId": "m1", "sourceKind": "sharepoint_item"}) + ko.db.addIndex({"id": "upload", "connectionId": None, "mandateId": "m1", "sourceKind": "file"}) + ko.db.addIndex({"id": "other", "connectionId": "cy", "mandateId": "m1", "sourceKind": "outlook_message"}) + ko.db.addChunk({"id": "c1", "fileId": "sp1"}) + ko.db.addChunk({"id": "c2", "fileId": "sp1"}) + ko.db.addChunk({"id": "c3", "fileId": "sp2"}) + ko.db.addChunk({"id": "c4", "fileId": "upload"}) + ko.db.addChunk({"id": "c5", "fileId": "other"}) + + result = ko.deleteFileContentIndexByConnectionId("cx") + + assert result == {"indexRows": 2, "chunks": 3} + assert "sp1" not in ko.db.indexRows + assert "sp2" not in ko.db.indexRows + assert "upload" in ko.db.indexRows + assert "other" in ko.db.indexRows + assert set(ko.db.chunks.keys()) == {"c4", "c5"} + + +def test_purge_with_empty_connection_id_is_a_noop(): + ko = _buildKnowledge() + ko.db.addIndex({"id": "sp1", "connectionId": "cx"}) + ko.db.addChunk({"id": "c1", "fileId": "sp1"}) + + result = ko.deleteFileContentIndexByConnectionId("") + + assert result == {"indexRows": 0, "chunks": 0} + assert "sp1" in ko.db.indexRows + + +def test_purge_unknown_connection_returns_zero(): + ko = _buildKnowledge() + ko.db.addIndex({"id": "sp1", "connectionId": "cx"}) + + result = ko.deleteFileContentIndexByConnectionId("nope") + + assert result == {"indexRows": 0, "chunks": 0} + assert "sp1" in ko.db.indexRows + + +if __name__ == "__main__": + test_purge_by_connection_removes_only_matching_rows() + test_purge_with_empty_connection_id_is_a_noop() + test_purge_unknown_connection_returns_zero() + print("OK — connection-purge tests passed") diff --git a/tests/unit/services/test_extraction_merge_strategy.py b/tests/unit/services/test_extraction_merge_strategy.py new file mode 100644 index 00000000..784bb783 --- /dev/null +++ b/tests/unit/services/test_extraction_merge_strategy.py @@ -0,0 +1,124 @@ +#!/usr/bin/env python3 +# Copyright (c) 2025 Patrick Motsch +# All rights reserved. +"""Test that runExtraction preserves per-part granularity when mergeStrategy=None. 
+ +The default MergeStrategy concatenates all text parts into a single ContentPart, which +collapses multi-page documents into one blob. This destroys RAG retrieval because every +document ends up as a single ContentChunk with a "blurred average" embedding. + +Ingestion pipelines (requestIngestion callers) MUST pass mergeStrategy=None to preserve +per-page / per-section chunks. +""" + +import os +import sys + +sys.path.insert(0, os.path.join(os.path.dirname(__file__), "../../..")) + +from modules.datamodels.datamodelExtraction import ( + ContentPart, + ExtractionOptions, + MergeStrategy, +) +from modules.serviceCenter.services.serviceExtraction.subPipeline import runExtraction +from modules.serviceCenter.services.serviceExtraction.subRegistry import ( + ChunkerRegistry, + Extractor, + ExtractorRegistry, +) + + +class _FakeMultiPagePdfExtractor(Extractor): + """Emits one text ContentPart per simulated page.""" + + def __init__(self, pageCount: int = 10): + self.pageCount = pageCount + + def detect(self, fileName: str, mimeType: str, headBytes: bytes) -> bool: + return mimeType == "application/pdf" + + def getSupportedExtensions(self): + return [".pdf"] + + def getSupportedMimeTypes(self): + return ["application/pdf"] + + def extract(self, fileBytes: bytes, context): + return [ + ContentPart( + id=f"page-{i}", + parentId=None, + label=f"page_{i + 1}", + typeGroup="text", + mimeType="text/plain", + data=f"Page {i + 1} content — distinct semantic anchor #{i}", + metadata={"pageIndex": i, "size": 64}, + ) + for i in range(self.pageCount) + ] + + +def _buildRegistry(pageCount: int) -> ExtractorRegistry: + registry = ExtractorRegistry() + fake = _FakeMultiPagePdfExtractor(pageCount) + registry.register("application/pdf", fake) + registry.register("pdf", fake) + return registry + + +def test_default_options_merge_all_text_parts_into_one(): + """Regression safeguard: default ExtractionOptions still merges (legacy behaviour). + + Non-ingestion callers (AI processing, summarization) rely on this default. 
+ """ + registry = _buildRegistry(pageCount=5) + extracted = runExtraction( + registry, ChunkerRegistry(), b"", "sample.pdf", "application/pdf", + ExtractionOptions(), + ) + textParts = [p for p in extracted.parts if p.typeGroup == "text"] + assert len(textParts) == 1, ( + f"Default options should merge all text parts into one, got {len(textParts)}" + ) + assert "Page 1" in textParts[0].data and "Page 5" in textParts[0].data, ( + "Merged text should contain content from all pages" + ) + print("test_default_options_merge_all_text_parts_into_one [PASS]") + + +def test_merge_none_preserves_all_text_parts(): + """Core fix: mergeStrategy=None preserves per-page granularity for RAG ingestion.""" + registry = _buildRegistry(pageCount=500) + extracted = runExtraction( + registry, ChunkerRegistry(), b"", "sample.pdf", "application/pdf", + ExtractionOptions(mergeStrategy=None), + ) + textParts = [p for p in extracted.parts if p.typeGroup == "text"] + assert len(textParts) == 500, ( + f"mergeStrategy=None should preserve all 500 text parts, got {len(textParts)}" + ) + assert textParts[0].label == "page_1" + assert textParts[-1].label == "page_500" + print("test_merge_none_preserves_all_text_parts [PASS]") + + +def test_explicit_merge_strategy_still_merges(): + """Callers can still opt in to merging by passing an explicit MergeStrategy.""" + registry = _buildRegistry(pageCount=3) + extracted = runExtraction( + registry, ChunkerRegistry(), b"", "sample.pdf", "application/pdf", + ExtractionOptions(mergeStrategy=MergeStrategy()), + ) + textParts = [p for p in extracted.parts if p.typeGroup == "text"] + assert len(textParts) == 1, ( + f"Explicit MergeStrategy should merge, got {len(textParts)} parts" + ) + print("test_explicit_merge_strategy_still_merges [PASS]") + + +if __name__ == "__main__": + test_default_options_merge_all_text_parts_into_one() + test_merge_none_preserves_all_text_parts() + test_explicit_merge_strategy_still_merges() + print("\nAll merge-strategy tests passed.") diff --git a/tests/unit/services/test_ingestion_hash_stability.py b/tests/unit/services/test_ingestion_hash_stability.py new file mode 100644 index 00000000..df25a4f0 --- /dev/null +++ b/tests/unit/services/test_ingestion_hash_stability.py @@ -0,0 +1,81 @@ +#!/usr/bin/env python3 +# Copyright (c) 2025 Patrick Motsch +# All rights reserved. +"""Test that _computeIngestionHash is stable across re-extractions of the same source. + +Extractors generate fresh contentObjectIds (uuid.uuid4()) per run. The ingestion +hash MUST therefore be derived from content (contentType + data + order) only — +otherwise idempotency (AC4) silently fails: every re-extraction looks "new" and +triggers full re-embedding. 
+""" + +import os +import sys +import uuid + +sys.path.insert(0, os.path.join(os.path.dirname(__file__), "../../..")) + +from modules.serviceCenter.services.serviceKnowledge.mainServiceKnowledge import ( + _computeIngestionHash, +) + + +def _makeObjects(seed: str = "alpha"): + """Build a synthetic contentObjects list as routeDataFiles._autoIndexFile would.""" + return [ + { + "contentObjectId": str(uuid.uuid4()), + "contentType": "text", + "data": f"Page 1 of {seed}", + }, + { + "contentObjectId": str(uuid.uuid4()), + "contentType": "text", + "data": f"Page 2 of {seed}", + }, + { + "contentObjectId": str(uuid.uuid4()), + "contentType": "binary", + "data": "", + }, + ] + + +def test_hash_stable_across_uuid_regeneration(): + """Same content + different contentObjectIds → same hash.""" + a = _makeObjects("alpha") + b = _makeObjects("alpha") # identical data, fresh UUIDs + assert [o["contentObjectId"] for o in a] != [o["contentObjectId"] for o in b] + assert _computeIngestionHash(a) == _computeIngestionHash(b) + + +def test_hash_changes_when_data_changes(): + a = _makeObjects("alpha") + b = _makeObjects("beta") + assert _computeIngestionHash(a) != _computeIngestionHash(b) + + +def test_hash_is_order_sensitive(): + """Reordered pages produce a different hash (different document).""" + a = _makeObjects("alpha") + b = list(reversed(a)) + assert _computeIngestionHash(a) != _computeIngestionHash(b) + + +def test_hash_distinguishes_text_vs_binary_with_same_payload(): + a = [{"contentObjectId": "x", "contentType": "text", "data": "hello"}] + b = [{"contentObjectId": "x", "contentType": "binary", "data": "hello"}] + assert _computeIngestionHash(a) != _computeIngestionHash(b) + + +def test_hash_handles_empty_input(): + assert _computeIngestionHash([]) == _computeIngestionHash([]) + + +if __name__ == "__main__": + test_hash_stable_across_uuid_regeneration() + test_hash_changes_when_data_changes() + test_hash_is_order_sensitive() + test_hash_distinguishes_text_vs_binary_with_same_payload() + test_hash_handles_empty_input() + print("OK — all 5 ingestion-hash stability tests passed") diff --git a/tests/unit/services/test_knowledge_ingest_consumer.py b/tests/unit/services/test_knowledge_ingest_consumer.py new file mode 100644 index 00000000..6b27a6e8 --- /dev/null +++ b/tests/unit/services/test_knowledge_ingest_consumer.py @@ -0,0 +1,235 @@ +#!/usr/bin/env python3 +# Copyright (c) 2025 Patrick Motsch +# All rights reserved. +"""Unit tests for KnowledgeIngestionConsumer event dispatch. + +- `connection.established` → enqueue a `connection.bootstrap` job. +- `connection.revoked` → synchronous purge via KnowledgeObjects. +""" + +import asyncio +import os +import sys +import types + +sys.path.insert(0, os.path.join(os.path.dirname(__file__), "../../..")) + +from modules.serviceCenter.services.serviceKnowledge import subConnectorIngestConsumer as consumer + + +def _resetRegistration(monkeypatch): + """Force the module-level guard to register fresh in each test.""" + monkeypatch.setattr(consumer, "_registered", False) + + +def test_onConnectionEstablished_enqueues_bootstrap(monkeypatch): + startedJobs = [] + + async def _fakeStartJob(jobType, payload, **kwargs): + startedJobs.append({"jobType": jobType, "payload": payload, "kwargs": kwargs}) + return "job-1" + + monkeypatch.setattr(consumer, "startJob", _fakeStartJob) + consumer._onConnectionEstablished( + connectionId="c1", authority="msft", userId="u1" + ) + # Drain pending tasks created by the consumer. 
+ loop = asyncio.new_event_loop() + try: + asyncio.set_event_loop(loop) + # If the consumer created a Task on a closed loop the fake startJob + # was still called synchronously via asyncio.run — in either case we + # check the recorded call. + finally: + loop.close() + + assert len(startedJobs) == 1 + assert startedJobs[0]["jobType"] == consumer.BOOTSTRAP_JOB_TYPE + assert startedJobs[0]["payload"]["connectionId"] == "c1" + assert startedJobs[0]["payload"]["authority"] == "msft" + assert startedJobs[0]["kwargs"]["triggeredBy"] == "u1" + + +def test_onConnectionEstablished_ignores_missing_id(monkeypatch): + called = [] + + async def _fakeStartJob(*a, **kw): + called.append(1) + return "x" + + monkeypatch.setattr(consumer, "startJob", _fakeStartJob) + consumer._onConnectionEstablished(connectionId="", authority="msft") + assert called == [] + + +def test_onConnectionRevoked_runs_sync_purge(monkeypatch): + class _FakeKnowledge: + def __init__(self): + self.calls = [] + + def deleteFileContentIndexByConnectionId(self, cid): + self.calls.append(cid) + return {"indexRows": 2, "chunks": 5} + + fakeKnow = _FakeKnowledge() + + def _fakeGetInterface(_user=None): + return fakeKnow + + monkeypatch.setattr(consumer, "getKnowledgeInterface", _fakeGetInterface) + consumer._onConnectionRevoked( + connectionId="c1", authority="msft", userId="u1", reason="disconnected" + ) + assert fakeKnow.calls == ["c1"] + + +def test_onConnectionRevoked_ignores_missing_id(monkeypatch): + seen = [] + + def _fakeGetInterface(_user=None): + class _K: + def deleteFileContentIndexByConnectionId(self, cid): + seen.append(cid) + return {"indexRows": 0, "chunks": 0} + + return _K() + + monkeypatch.setattr(consumer, "getKnowledgeInterface", _fakeGetInterface) + consumer._onConnectionRevoked(connectionId="") + assert seen == [] + + +def test_bootstrap_job_skips_unsupported_authority(monkeypatch): + async def _run(): + result = await consumer._bootstrapJobHandler( + {"payload": {"connectionId": "c1", "authority": "slack"}}, + lambda *_: None, + ) + return result + + result = asyncio.run(_run()) + assert result["skipped"] is True + assert result["authority"] == "slack" + assert result["reason"] == "unsupported_authority" + + +def test_bootstrap_job_dispatches_msft_parts(monkeypatch): + calls = {"sp": 0, "ol": 0} + + async def _fakeSp(connectionId, progressCb=None): + calls["sp"] += 1 + return {"indexed": 1} + + async def _fakeOl(connectionId, progressCb=None): + calls["ol"] += 1 + return {"indexed": 2} + + fakeSharepoint = types.ModuleType("subConnectorSyncSharepoint") + fakeSharepoint.bootstrapSharepoint = _fakeSp + fakeOutlook = types.ModuleType("subConnectorSyncOutlook") + fakeOutlook.bootstrapOutlook = _fakeOl + monkeypatch.setitem( + sys.modules, + "modules.serviceCenter.services.serviceKnowledge.subConnectorSyncSharepoint", + fakeSharepoint, + ) + monkeypatch.setitem( + sys.modules, + "modules.serviceCenter.services.serviceKnowledge.subConnectorSyncOutlook", + fakeOutlook, + ) + + async def _run(): + return await consumer._bootstrapJobHandler( + {"payload": {"connectionId": "c1", "authority": "msft"}}, + lambda *_: None, + ) + + result = asyncio.run(_run()) + assert calls == {"sp": 1, "ol": 1} + assert result["sharepoint"] == {"indexed": 1} + assert result["outlook"] == {"indexed": 2} + + +def test_bootstrap_job_dispatches_google_parts(monkeypatch): + calls = {"gd": 0, "gm": 0} + + async def _fakeGd(connectionId, progressCb=None): + calls["gd"] += 1 + return {"indexed": 7} + + async def _fakeGm(connectionId, progressCb=None): 
+ calls["gm"] += 1 + return {"indexed": 11} + + fakeGdrive = types.ModuleType("subConnectorSyncGdrive") + fakeGdrive.bootstrapGdrive = _fakeGd + fakeGmail = types.ModuleType("subConnectorSyncGmail") + fakeGmail.bootstrapGmail = _fakeGm + monkeypatch.setitem( + sys.modules, + "modules.serviceCenter.services.serviceKnowledge.subConnectorSyncGdrive", + fakeGdrive, + ) + monkeypatch.setitem( + sys.modules, + "modules.serviceCenter.services.serviceKnowledge.subConnectorSyncGmail", + fakeGmail, + ) + + async def _run(): + return await consumer._bootstrapJobHandler( + {"payload": {"connectionId": "c1", "authority": "google"}}, + lambda *_: None, + ) + + result = asyncio.run(_run()) + assert calls == {"gd": 1, "gm": 1} + assert result["drive"] == {"indexed": 7} + assert result["gmail"] == {"indexed": 11} + + +def test_bootstrap_job_dispatches_clickup_part(monkeypatch): + calls = {"cu": 0} + + async def _fakeCu(connectionId, progressCb=None): + calls["cu"] += 1 + return {"indexed": 4} + + fakeClickup = types.ModuleType("subConnectorSyncClickup") + fakeClickup.bootstrapClickup = _fakeCu + monkeypatch.setitem( + sys.modules, + "modules.serviceCenter.services.serviceKnowledge.subConnectorSyncClickup", + fakeClickup, + ) + + async def _run(): + return await consumer._bootstrapJobHandler( + {"payload": {"connectionId": "c1", "authority": "clickup"}}, + lambda *_: None, + ) + + result = asyncio.run(_run()) + assert calls == {"cu": 1} + assert result["clickup"] == {"indexed": 4} + + +if __name__ == "__main__": + # Usable without pytest fixtures for a quick smoke run. + class _MP: + def __init__(self): + self.undos = [] + + def setattr(self, target, name_or_value, value=None): + if value is None: + # target is an object, name_or_value is value → no, original signature + raise SystemExit("use pytest monkeypatch in CLI") + self.undos.append((target, name_or_value, getattr(target, name_or_value))) + setattr(target, name_or_value, value) + + def setitem(self, mapping, key, value): + self.undos.append((mapping, key, mapping.get(key))) + mapping[key] = value + + print("Run via pytest: pytest tests/unit/services/test_knowledge_ingest_consumer.py") diff --git a/tests/unit/services/test_p1d_consent_prefs.py b/tests/unit/services/test_p1d_consent_prefs.py new file mode 100644 index 00000000..e00b0dfc --- /dev/null +++ b/tests/unit/services/test_p1d_consent_prefs.py @@ -0,0 +1,298 @@ +#!/usr/bin/env python3 +"""Unit tests for P1d: consent gating, preference parsing, and walker behaviour. + +Tests +----- +1. Bootstrap runner skips when ``knowledgeIngestionEnabled=False``. +2. ``loadConnectionPrefs`` returns safe defaults when preferences are absent. +3. ``loadConnectionPrefs`` maps all §2.6 keys correctly from a full prefs dict. +4. Gmail walker passes ``neutralize=True`` and ``mailContentDepth`` to IngestionJob. +5. Gmail walker produces only a header content-object when depth="metadata". +6. ClickUp walker skips description when scope="titles". +""" + +from __future__ import annotations + +import asyncio +import os +import sys +import types +import unittest +from typing import Any, Dict, Optional +from unittest.mock import AsyncMock, MagicMock, patch + +sys.path.insert(0, os.path.join(os.path.dirname(__file__), "../../..")) + + +# --------------------------------------------------------------------------- +# 1. 
Bootstrap runner consent gate +# --------------------------------------------------------------------------- + +class TestBootstrapConsentGate(unittest.TestCase): + """_bootstrapJobHandler must no-op when knowledgeIngestionEnabled is False.""" + + def _makeJob(self, connectionId="c-test", authority="google"): + return {"payload": {"connectionId": connectionId, "authority": authority}} + + def _makeConn(self, enabled: bool): + conn = MagicMock() + conn.knowledgeIngestionEnabled = enabled + return conn + + def test_skips_when_consent_disabled(self): + from modules.serviceCenter.services.serviceKnowledge import subConnectorIngestConsumer as sut + + fake_root = MagicMock() + fake_root.getUserConnectionById.return_value = self._makeConn(False) + + with patch("modules.interfaces.interfaceDbApp.getRootInterface", return_value=fake_root): + result = asyncio.get_event_loop().run_until_complete( + sut._bootstrapJobHandler(self._makeJob(), lambda *a: None) + ) + + assert result.get("skipped") is True + assert result.get("reason") == "consent_disabled" + fake_root.getUserConnectionById.assert_called_once_with("c-test") + + def test_proceeds_when_consent_enabled(self): + """When consent is enabled, the handler should call at least one walker.""" + from modules.serviceCenter.services.serviceKnowledge import subConnectorIngestConsumer as sut + + fake_root = MagicMock() + fake_root.getUserConnectionById.return_value = self._makeConn(True) + + # Patch the inner walker so it doesn't do real I/O. + async def _fakeBootstrap(**kwargs): + return {"indexed": 0} + + with ( + patch("modules.interfaces.interfaceDbApp.getRootInterface", return_value=fake_root), + patch( + "modules.serviceCenter.services.serviceKnowledge.subConnectorSyncGdrive.bootstrapGdrive", + new=AsyncMock(return_value={"indexed": 0}), + ), + patch( + "modules.serviceCenter.services.serviceKnowledge.subConnectorSyncGmail.bootstrapGmail", + new=AsyncMock(return_value={"indexed": 0}), + ), + ): + result = asyncio.get_event_loop().run_until_complete( + sut._bootstrapJobHandler(self._makeJob(authority="google"), lambda *a: None) + ) + + # Should not have 'skipped' at the top level. + assert result.get("skipped") is not True + assert result.get("authority") == "google" + + +# --------------------------------------------------------------------------- +# 2 + 3. 
loadConnectionPrefs +# --------------------------------------------------------------------------- + +class TestLoadConnectionPrefs(unittest.TestCase): + def _makeConn(self, prefs: Optional[Dict[str, Any]]): + conn = MagicMock() + conn.knowledgePreferences = prefs + return conn + + def _mockRoot(self, prefs): + root = MagicMock() + root.getUserConnectionById.return_value = self._makeConn(prefs) + return root + + def test_returns_safe_defaults_when_prefs_none(self): + from modules.serviceCenter.services.serviceKnowledge.subConnectorPrefs import ( + ConnectionIngestionPrefs, + loadConnectionPrefs, + ) + + with patch("modules.interfaces.interfaceDbApp.getRootInterface", return_value=self._mockRoot(None)): + prefs = loadConnectionPrefs("x") + + assert prefs.neutralizeBeforeEmbed is False + assert prefs.mailContentDepth == "full" + assert prefs.mailIndexAttachments is False + assert prefs.maxAgeDays == 90 + assert prefs.clickupScope == "title_description" + assert prefs.gmailEnabled is True + assert prefs.driveEnabled is True + + def test_maps_all_keys(self): + from modules.serviceCenter.services.serviceKnowledge.subConnectorPrefs import loadConnectionPrefs + + raw = { + "neutralizeBeforeEmbed": True, + "mailContentDepth": "metadata", + "mailIndexAttachments": True, + "filesIndexBinaries": False, + "clickupScope": "with_comments", + "maxAgeDays": 30, + "surfaceToggles": { + "google": {"gmail": False, "drive": True}, + "msft": {"sharepoint": False, "outlook": True}, + }, + } + + with patch("modules.interfaces.interfaceDbApp.getRootInterface", return_value=self._mockRoot(raw)): + prefs = loadConnectionPrefs("x") + + assert prefs.neutralizeBeforeEmbed is True + assert prefs.mailContentDepth == "metadata" + assert prefs.mailIndexAttachments is True + assert prefs.filesIndexBinaries is False + assert prefs.clickupScope == "with_comments" + assert prefs.maxAgeDays == 30 + assert prefs.gmailEnabled is False + assert prefs.driveEnabled is True + assert prefs.sharepointEnabled is False + assert prefs.outlookEnabled is True + + def test_invalid_depth_falls_back_to_default(self): + from modules.serviceCenter.services.serviceKnowledge.subConnectorPrefs import loadConnectionPrefs + + raw = {"mailContentDepth": "everything_please"} + + with patch("modules.interfaces.interfaceDbApp.getRootInterface", return_value=self._mockRoot(raw)): + prefs = loadConnectionPrefs("x") + + assert prefs.mailContentDepth == "full" + + +# --------------------------------------------------------------------------- +# 4. 
Gmail walker passes neutralize + mailContentDepth to IngestionJob +# --------------------------------------------------------------------------- + +class TestGmailWalkerPrefs(unittest.TestCase): + def _make_message(self, *, subject="Test", snippet="hello", body_text="full body"): + import base64 + encoded = base64.urlsafe_b64encode(body_text.encode()).decode() + return { + "id": "msg-1", + "historyId": "h-42", + "threadId": "t-1", + "snippet": snippet, + "payload": { + "mimeType": "multipart/alternative", + "headers": [ + {"name": "Subject", "value": subject}, + {"name": "From", "value": "alice@example.com"}, + {"name": "To", "value": "bob@example.com"}, + {"name": "Date", "value": "Mon, 20 Apr 2026 10:00:00 +0000"}, + ], + "parts": [ + { + "mimeType": "text/plain", + "body": {"data": encoded}, + } + ], + }, + } + + def test_neutralize_flag_forwarded(self): + from modules.serviceCenter.services.serviceKnowledge.subConnectorSyncGmail import ( + GmailBootstrapLimits, + _ingestMessage, + GmailBootstrapResult, + ) + from modules.serviceCenter.services.serviceKnowledge.mainServiceKnowledge import IngestionJob + + captured_jobs = [] + + async def fake_requestIngestion(job: IngestionJob): + captured_jobs.append(job) + return MagicMock(status="indexed", error=None) + + ks = MagicMock() + ks.requestIngestion = fake_requestIngestion + + limits = GmailBootstrapLimits(neutralize=True, mailContentDepth="full") + result = GmailBootstrapResult(connectionId="c-1") + + asyncio.get_event_loop().run_until_complete( + _ingestMessage( + googleGetFn=AsyncMock(return_value={}), + knowledgeService=ks, + connectionId="c-1", + mandateId="", + userId="u-1", + labelId="INBOX", + message=self._make_message(), + limits=limits, + result=result, + progressCb=None, + ) + ) + + assert len(captured_jobs) == 1 + assert captured_jobs[0].neutralize is True + + def test_metadata_depth_yields_only_header(self): + from modules.serviceCenter.services.serviceKnowledge.subConnectorSyncGmail import ( + _buildContentObjects, + ) + + message = self._make_message(snippet="hi", body_text="should be excluded") + parts = _buildContentObjects(message, maxBodyChars=4000, mailContentDepth="metadata") + ids = [p["contentObjectId"] for p in parts] + assert ids == ["header"] + + def test_snippet_depth_yields_header_and_snippet(self): + from modules.serviceCenter.services.serviceKnowledge.subConnectorSyncGmail import ( + _buildContentObjects, + ) + + message = self._make_message(snippet="hi", body_text="should be excluded") + parts = _buildContentObjects(message, maxBodyChars=4000, mailContentDepth="snippet") + ids = [p["contentObjectId"] for p in parts] + assert "header" in ids + assert "snippet" in ids + assert "body" not in ids + + +# --------------------------------------------------------------------------- +# 5. 
ClickUp walker respects clickupScope="titles" +# --------------------------------------------------------------------------- + +class TestClickupWalkerScope(unittest.TestCase): + def _make_task(self): + return { + "id": "task-1", + "name": "Ship feature X", + "date_updated": "1713888000000", + "description": "This should be omitted", + "text_content": "Also omitted", + "status": {"status": "open"}, + "assignees": [], + "tags": [], + "list": {"name": "Backlog"}, + "folder": {}, + "space": {"name": "Engineering"}, + } + + def test_titles_scope_omits_description(self): + from modules.serviceCenter.services.serviceKnowledge.subConnectorSyncClickup import ( + ClickupBootstrapLimits, + _buildContentObjects, + ) + + limits = ClickupBootstrapLimits(clickupScope="titles") + parts = _buildContentObjects(self._make_task(), limits) + ids = [p["contentObjectId"] for p in parts] + assert ids == ["header"] + assert "description" not in ids + + def test_with_description_scope_includes_description(self): + from modules.serviceCenter.services.serviceKnowledge.subConnectorSyncClickup import ( + ClickupBootstrapLimits, + _buildContentObjects, + ) + + limits = ClickupBootstrapLimits(clickupScope="title_description") + parts = _buildContentObjects(self._make_task(), limits) + ids = [p["contentObjectId"] for p in parts] + assert "header" in ids + assert "description" in ids + + +if __name__ == "__main__": + unittest.main()
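For reference, the idempotency property that test_ingestion_hash_stability exercises can be sketched as a content-derived digest. The snippet below is only an illustrative stand-in, not the actual _computeIngestionHash implementation: the helper name computeContentDigest and its field handling are assumptions based solely on what the tests assert (order-sensitive, contentType-aware, and independent of the contentObjectId UUIDs that extractors regenerate on every run).

import hashlib
from typing import Any, Dict, List


def computeContentDigest(contentObjects: List[Dict[str, Any]]) -> str:
    """Illustrative sketch: the digest depends on part order, contentType and
    data, but never on the per-run contentObjectId values."""
    digest = hashlib.sha256()
    for position, obj in enumerate(contentObjects):
        digest.update(str(position).encode("utf-8"))                     # order-sensitive
        digest.update(b"\x1f")
        digest.update(str(obj.get("contentType", "")).encode("utf-8"))   # text vs binary differ
        digest.update(b"\x1f")
        digest.update(str(obj.get("data", "")).encode("utf-8"))          # actual payload
        digest.update(b"\x1e")                                           # record separator
    return digest.hexdigest()

Under this sketch, two extractions of the same source produce identical digests even though uuid.uuid4() yields fresh contentObjectIds each run, which is the behaviour the duplicate-detection paths in the bootstrap tests (skippedDuplicate) rely on.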