Merge pull request #150 from valueonag/int

Int
This commit is contained in:
Patrick Motsch 2026-05-01 00:01:39 +02:00 committed by GitHub
commit d3d682fe4d
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
52 changed files with 7128 additions and 1613 deletions

10
app.py
View file

@ -405,6 +405,16 @@ async def lifespan(app: FastAPI):
except Exception as e:
logger.warning(f"BackgroundJob recovery failed (non-critical): {e}")
# Subscribe knowledge ingestion to connection lifecycle events so OAuth
# connect/disconnect reliably trigger bootstrap/purge.
try:
from modules.serviceCenter.services.serviceKnowledge.subConnectorIngestConsumer import (
registerKnowledgeIngestionConsumer,
)
registerKnowledgeIngestionConsumer()
except Exception as e:
logger.warning(f"KnowledgeIngestionConsumer registration failed (non-critical): {e}")
yield
# --- Stop Managers ---
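For orientation, a minimal sketch of what the registered consumer could look like. This is an illustration only: subConnectorIngestConsumer itself is not part of this diff, and callbackRegistry.subscribe() is an assumed counterpart to the callbackRegistry.trigger("connection.revoked", ...) calls added in routeConnections further below.

# Hypothetical sketch of the consumer wiring (not the actual module contents).
from modules.shared.callbackRegistry import callbackRegistry  # trigger() side shown in this PR; subscribe() is assumed


def registerKnowledgeIngestionConsumer() -> None:
    """Subscribe knowledge bootstrap/purge to connection lifecycle events."""

    def _onConnectionRevoked(connectionId: str, authority: str, userId: str, reason: str, **_ignored) -> None:
        # Purge every FileContentIndex row (and its ContentChunks) that came from this connection.
        from modules.interfaces.interfaceDbKnowledge import getInterface as getKnowledgeInterface
        counts = getKnowledgeInterface().deleteFileContentIndexByConnectionId(connectionId)
        # counts is {"indexRows": n, "chunks": m}, matching the purge helper added in this PR.

    callbackRegistry.subscribe("connection.revoked", _onConnectionRevoked)  # assumed API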

View file

@ -126,6 +126,11 @@ def _stripGraphBase(url: str) -> str:
def _graphItemToExternalEntry(item: Dict[str, Any], basePath: str = "") -> ExternalEntry:
isFolder = "folder" in item
# Graph exposes the driveItem content hash as ``eTag`` (quoted) or
# ``cTag``; we normalise to a "revision" string so callers can use it as a
# stable ``contentVersion`` for idempotent ingestion without re-downloading
# file bytes.
revision = item.get("eTag") or item.get("cTag")
return ExternalEntry(
name=item.get("name", ""),
path=f"{basePath}/{item.get('name', '')}" if basePath else item.get("name", ""),
@ -137,6 +142,9 @@ def _graphItemToExternalEntry(item: Dict[str, Any], basePath: str = "") -> Exter
"id": item.get("id"),
"webUrl": item.get("webUrl"),
"childCount": item.get("folder", {}).get("childCount") if isFolder else None,
"revision": revision,
"lastModifiedDateTime": item.get("lastModifiedDateTime"),
"parentReference": item.get("parentReference", {}),
},
)
@ -167,21 +175,36 @@ class SharepointAdapter(_GraphApiMixin, ServiceAdapter):
return await self._discoverSites()
if not folderPath or folderPath == "/":
endpoint = f"sites/{siteId}/drive/root/children"
endpoint: Optional[str] = f"sites/{siteId}/drive/root/children?$top=200"
else:
cleanPath = folderPath.lstrip("/")
endpoint = f"sites/{siteId}/drive/root:/{cleanPath}:/children"
endpoint = f"sites/{siteId}/drive/root:/{cleanPath}:/children?$top=200"
result = await self._graphGet(endpoint)
if "error" in result:
logger.warning(f"SharePoint browse failed: {result['error']}")
return []
# Follow @odata.nextLink until a hard cap is reached so large libraries
# are fully enumerated (required for bootstrap). Per-page size uses
# Graph's max supported value to minimise round-trips.
effectiveLimit = int(limit) if limit is not None else None
items: List[Dict[str, Any]] = []
hardCap = 5000
while endpoint and len(items) < hardCap:
result = await self._graphGet(endpoint)
if "error" in result:
logger.warning(f"SharePoint browse failed: {result['error']}")
break
for raw in result.get("value", []) or []:
items.append(raw)
if effectiveLimit is not None and len(items) >= effectiveLimit:
break
if effectiveLimit is not None and len(items) >= effectiveLimit:
break
nextLink = result.get("@odata.nextLink")
endpoint = _stripGraphBase(nextLink) if nextLink else None
entries = [_graphItemToExternalEntry(item, path) for item in result.get("value", [])]
entries = [_graphItemToExternalEntry(item, path) for item in items]
if filter:
entries = [e for e in entries if _matchFilter(e, filter)]
if limit is not None:
entries = entries[: max(1, int(limit))]
if effectiveLimit is not None:
entries = entries[: max(1, effectiveLimit)]
return entries
async def _discoverSites(self) -> List[ExternalEntry]:
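As a usage note, the normalised revision can serve as a change marker during ingestion. The helper below is purely illustrative (pick_changed_items and known_versions are not part of the codebase); it only shows how a caller might compare the new metadata["revision"] against a stored contentVersion to skip unchanged items.

from typing import Any, Dict, List

def pick_changed_items(entries: List[Any], known_versions: Dict[str, str]) -> List[Any]:
    """Keep only entries whose Graph revision differs from the stored contentVersion."""
    changed = []
    for entry in entries:
        meta = entry.metadata or {}
        item_id, revision = meta.get("id"), meta.get("revision")
        if item_id and revision and known_versions.get(item_id) == revision:
            continue  # unchanged: no need to re-download bytes or re-embed
        changed.append(entry)
    return changed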

View file

@ -95,7 +95,14 @@ class ExtractionOptions(BaseModel):
imageQuality: int = Field(default=85, ge=1, le=100, description="Image quality (1-100)")
# Merging strategy
mergeStrategy: MergeStrategy = Field(default_factory=MergeStrategy, description="Strategy for merging extraction results")
mergeStrategy: Optional[MergeStrategy] = Field(
default_factory=MergeStrategy,
description=(
"Strategy for merging extraction results. Pass None to skip merging entirely "
"(required for per-chunk ingestion pipelines like RAG, where per-page/per-section "
"granularity must be preserved for embedding)."
),
)
# Optional chunking parameters (for backward compatibility)
chunkAllowed: Optional[bool] = Field(default=None, description="Whether chunking is allowed")
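A short usage sketch of the now-optional merge strategy, assuming ExtractionOptions is imported from its extraction-models module (import path not shown in this hunk):

# Default: extraction results are merged using the MergeStrategy defaults.
mergedOptions = ExtractionOptions()

# Per-chunk ingestion (e.g. RAG): skip merging so per-page/per-section parts
# stay separate and can be embedded individually.
ragOptions = ExtractionOptions(mergeStrategy=None)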

View file

@ -1,82 +0,0 @@
# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
"""FileFolder: hierarchical folder structure for file organization."""
from typing import Optional
from pydantic import BaseModel, Field
from modules.datamodels.datamodelBase import PowerOnModel
from modules.shared.i18nRegistry import i18nModel
import uuid
@i18nModel("Dateiordner")
class FileFolder(PowerOnModel):
"""Hierarchischer Ordner fuer die Dateiverwaltung."""
id: str = Field(
default_factory=lambda: str(uuid.uuid4()),
description="Primary key",
json_schema_extra={"label": "ID", "frontend_type": "text", "frontend_readonly": True, "frontend_required": False},
)
name: str = Field(
description="Folder name",
json_schema_extra={"label": "Name", "frontend_type": "text", "frontend_readonly": False, "frontend_required": True},
)
parentId: Optional[str] = Field(
default=None,
description="Parent folder ID (null = root)",
json_schema_extra={
"label": "Uebergeordneter Ordner",
"frontend_type": "text",
"frontend_readonly": False,
"frontend_required": False,
"fk_target": {"db": "poweron_management", "table": "FileFolder", "labelField": "name"},
},
)
mandateId: Optional[str] = Field(
default=None,
description="Mandate context",
json_schema_extra={
"label": "Mandanten-ID",
"frontend_type": "text",
"frontend_readonly": True,
"frontend_required": False,
"fk_target": {"db": "poweron_app", "table": "Mandate", "labelField": "label"},
},
)
featureInstanceId: Optional[str] = Field(
default=None,
description="Feature instance context",
json_schema_extra={
"label": "Feature-Instanz-ID",
"frontend_type": "text",
"frontend_readonly": True,
"frontend_required": False,
"fk_target": {"db": "poweron_app", "table": "FeatureInstance", "labelField": "label"},
},
)
scope: str = Field(
default="personal",
description="Data visibility scope: personal, featureInstance, mandate, global. Inherited by files in this folder.",
json_schema_extra={
"label": "Sichtbarkeit",
"frontend_type": "select",
"frontend_readonly": False,
"frontend_required": False,
"frontend_options": [
{"value": "personal", "label": "Persönlich"},
{"value": "featureInstance", "label": "Feature-Instanz"},
{"value": "mandate", "label": "Mandant"},
{"value": "global", "label": "Global"},
],
},
)
neutralize: bool = Field(
default=False,
description="Whether files in this folder should be neutralized before AI processing. Inherited by new/moved files.",
json_schema_extra={
"label": "Neutralisieren",
"frontend_type": "checkbox",
"frontend_readonly": False,
"frontend_required": False,
},
)

View file

@ -68,17 +68,6 @@ class FileItem(PowerOnModel):
description="Tags for categorization and search",
json_schema_extra={"label": "Tags", "frontend_type": "tags", "frontend_readonly": False, "frontend_required": False},
)
folderId: Optional[str] = Field(
default=None,
description="ID of the parent folder",
json_schema_extra={
"label": "Ordner-ID",
"frontend_type": "text",
"frontend_readonly": False,
"frontend_required": False,
"fk_target": {"db": "poweron_management", "table": "FileFolder", "labelField": "name"},
},
)
description: Optional[str] = Field(
default=None,
description="User-provided description of the file",

View file

@ -90,6 +90,16 @@ class FileContentIndex(PowerOnModel):
description="Data visibility scope: personal, featureInstance, mandate, global",
json_schema_extra={"label": "Sichtbarkeit"},
)
sourceKind: str = Field(
default="file",
description="Origin of the indexed content: file, sharepoint_item, outlook_message, outlook_attachment, ...",
json_schema_extra={"label": "Quellenart"},
)
connectionId: Optional[str] = Field(
default=None,
description="UserConnection ID if this index entry originates from an external connector",
json_schema_extra={"label": "Connection-ID"},
)
neutralizationStatus: Optional[str] = Field(
default=None,
description="Neutralization status: completed, failed, skipped, None = not required",

View file

@ -13,6 +13,42 @@ import math
T = TypeVar('T')
# ---------------------------------------------------------------------------
# Table Grouping models
# ---------------------------------------------------------------------------
class TableGroupNode(BaseModel):
"""
A single node in a user-defined group tree for a FormGeneratorTable.
Items belong to exactly one group (no multi-membership).
Groups can be nested to arbitrary depth via subGroups.
"""
id: str
name: str
itemIds: List[str] = Field(default_factory=list)
subGroups: List['TableGroupNode'] = Field(default_factory=list)
order: int = 0
isExpanded: bool = True
TableGroupNode.model_rebuild()
class TableGrouping(BaseModel):
"""
Persisted grouping configuration for one (user, contextKey) pair.
Stored in table_groupings in poweron_app (auto-created).
contextKey convention: API path without /api/ prefix and without trailing slash.
Examples: "connections", "prompts", "admin/users", "trustee/{instanceId}/documents"
"""
id: str
userId: str
contextKey: str
rootGroups: List[TableGroupNode] = Field(default_factory=list)
updatedAt: Optional[float] = None
class SortField(BaseModel):
"""
Single sort field configuration.
@ -24,6 +60,17 @@ class SortField(BaseModel):
class PaginationParams(BaseModel):
"""
Complete pagination state including page, sorting, and filters.
Grouping extensions (both optional; omit when not using grouping):
groupId Scope the request to items belonging to this group.
The backend resolves it to an itemIds IN-filter before
applying normal pagination/search/filter logic.
Also applied for mode=ids and mode=filterValues so that
bulk-select and filter-dropdowns respect the group scope.
saveGroupTree If present, the backend persists this tree for the current
(user, contextKey) pair *before* fetching, then returns
the confirmed tree in the response groupTree field.
Omit on every request that does not change the group tree.
"""
page: int = Field(ge=1, description="Current page number (1-based)")
pageSize: int = Field(ge=1, le=1000, description="Number of items per page")
@ -38,6 +85,14 @@ class PaginationParams(BaseModel):
- Supported operators: equals/eq, contains, startsWith, endsWith, gt, gte, lt, lte, in, notIn
- Multiple filters are combined with AND logic"""
)
groupId: Optional[str] = Field(
default=None,
description="Scope request to items of this group (resolved server-side to itemIds IN-filter)",
)
saveGroupTree: Optional[List[Dict[str, Any]]] = Field(
default=None,
description="If set, persist this group tree before fetching (optimistic save)",
)
class PaginationRequest(BaseModel):
@ -74,9 +129,18 @@ class PaginationMetadata(BaseModel):
class PaginatedResponse(BaseModel, Generic[T]):
"""
Response containing paginated data and metadata.
groupTree is included when the endpoint supports table grouping and the
current user has a saved group tree for the requested contextKey.
It is None when grouping is not configured for the endpoint or the user
has not created any groups yet. Frontend must treat None as an empty tree.
"""
items: List[T] = Field(..., description="Array of items for current page")
pagination: Optional[PaginationMetadata] = Field(..., description="Pagination metadata (None if pagination not applied)")
groupTree: Optional[List[TableGroupNode]] = Field(
default=None,
description="Current group tree for this (user, contextKey) pair — None if no grouping configured",
)
model_config = ConfigDict(arbitrary_types_allowed=True)
@ -85,6 +149,7 @@ def normalize_pagination_dict(pagination_dict: Dict[str, Any]) -> Dict[str, Any]
"""
Normalize pagination dictionary to handle frontend variations.
Moves top-level "search" field into filters if present.
Grouping fields (groupId, saveGroupTree) are passed through as-is.
Args:
pagination_dict: Raw pagination dictionary from frontend
@ -110,4 +175,7 @@ def normalize_pagination_dict(pagination_dict: Dict[str, Any]) -> Dict[str, Any]
normalized["filters"] = {}
normalized["filters"]["search"] = normalized.pop("search")
# groupId / saveGroupTree are valid PaginationParams fields — pass through unchanged.
# No transformation needed; Pydantic will validate them.
return normalized
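To make the grouping fields concrete, here is an illustrative payload as a frontend might send it; the IDs and names are invented, the field semantics follow the models above.

rawPagination = {
    "page": 1,
    "pageSize": 25,
    "search": "sharepoint",            # moved into filters by normalize_pagination_dict
    "groupId": "grp-connectors",       # scope the result set to this group's itemIds
    "saveGroupTree": [                 # optimistic save of the full tree before fetching
        {"id": "grp-connectors", "name": "Connectors", "itemIds": ["conn-1"],
         "subGroups": [], "order": 0, "isExpanded": True},
    ],
}
params = PaginationParams(**normalize_pagination_dict(rawPagination))
# params.filters == {"search": "sharepoint"}; groupId and saveGroupTree pass through untouched.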

View file

@ -475,6 +475,22 @@ class UserConnection(PowerOnModel):
description="OAuth scopes granted for this connection",
json_schema_extra={"frontend_type": "list", "frontend_readonly": True, "frontend_required": False, "label": "Gewährte Berechtigungen"},
)
knowledgeIngestionEnabled: bool = Field(
default=False,
description="Whether the user has consented to knowledge ingestion for this connection",
json_schema_extra={"frontend_type": "boolean", "frontend_readonly": False, "frontend_required": False, "label": "Wissensdatenbank aktiv"},
)
knowledgePreferences: Optional[Dict[str, Any]] = Field(
default=None,
description=(
"Per-connection knowledge ingestion preferences. schemaVersion=1 keys: "
"neutralizeBeforeEmbed (bool), mailContentDepth (metadata|snippet|full), "
"mailIndexAttachments (bool), filesIndexBinaries (bool), mimeAllowlist (list[str]), "
"clickupScope (titles|title_description|with_comments), "
"surfaceToggles (dict per authority), maxAgeDays (int)."
),
json_schema_extra={"frontend_type": "json", "frontend_readonly": False, "frontend_required": False, "label": "Wissenspräferenzen"},
)
@computed_field
@property
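For reference, a plausible schemaVersion=1 knowledgePreferences payload built from the keys listed in the description above; the concrete values and the surfaceToggles shape (keyed by authority name) are illustrative assumptions.

knowledgePreferences = {
    "schemaVersion": 1,
    "neutralizeBeforeEmbed": True,
    "mailContentDepth": "snippet",            # metadata | snippet | full
    "mailIndexAttachments": False,
    "filesIndexBinaries": True,
    "mimeAllowlist": ["application/pdf", "text/plain"],
    "clickupScope": "title_description",      # titles | title_description | with_comments
    "surfaceToggles": {"sharepoint": True, "outlook": False},   # per-authority toggles (assumed shape)
    "maxAgeDays": 365,
}
# connection.knowledgeIngestionEnabled = True
# connection.knowledgePreferences = knowledgePreferences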

View file

@ -174,14 +174,26 @@ async def indexSessionData(
for c in chunks
]
await knowledgeService.indexFile(
fileId=syntheticFileId,
fileName=f"coaching-session-{sessionId[:8]}",
mimeType="application/x-coaching-session",
userId=userId,
featureInstanceId=featureInstanceId,
mandateId=mandateId,
contentObjects=contentObjects,
from modules.serviceCenter.services.serviceKnowledge.mainServiceKnowledge import IngestionJob
await knowledgeService.requestIngestion(
IngestionJob(
sourceKind="coaching_session",
sourceId=syntheticFileId,
fileName=f"coaching-session-{sessionId[:8]}",
mimeType="application/x-coaching-session",
userId=userId,
featureInstanceId=featureInstanceId,
mandateId=mandateId,
contentObjects=contentObjects,
provenance={
"lane": "feature",
"feature": "commcoach",
"sessionId": sessionId,
"contextId": contextId,
"messageCount": len(messages or []),
},
)
)
logger.info(f"Successfully indexed coaching session {sessionId} ({len(chunks)} chunks)")
except Exception as e:

View file

@ -83,7 +83,7 @@ PORT_TYPE_CATALOG: Dict[str, PortSchema] = {
PortField(name="listId", type="str", description="ClickUp-Listen-ID"),
PortField(name="name", type="str", required=False, description="Listenname"),
PortField(name="spaceId", type="str", required=False, description="Space-ID"),
PortField(name="folderId", type="str", required=False, description="Ordner-ID"),
PortField(name="groupId", type="str", required=False, description="Gruppen-ID für die Gruppierungszuordnung"),
PortField(name="connection", type="ConnectionRef", required=False,
description="ClickUp-Verbindung"),
]),

View file

@ -1208,7 +1208,7 @@ async def patchWorkspaceWorkflowAttachments(
# ---------------------------------------------------------------------------
# File and folder list endpoints
# File endpoints
# ---------------------------------------------------------------------------
@router.get("/{instanceId}/files")
@ -1216,7 +1216,6 @@ async def patchWorkspaceWorkflowAttachments(
async def listWorkspaceFiles(
request: Request,
instanceId: str = Path(...),
folderId: Optional[str] = Query(None),
tags: Optional[str] = Query(None),
search: Optional[str] = Query(None),
context: RequestContext = Depends(getRequestContext),
@ -1271,30 +1270,6 @@ async def getFileContent(
return Response(content=content, media_type=mimeType)
@router.get("/{instanceId}/folders")
@limiter.limit("300/minute")
async def listWorkspaceFolders(
request: Request,
instanceId: str = Path(...),
parentId: Optional[str] = Query(None),
context: RequestContext = Depends(getRequestContext),
):
_mandateId, _ = _validateInstanceAccess(instanceId, context)
try:
from modules.serviceCenter import getService
from modules.serviceCenter.context import ServiceCenterContext
ctx = ServiceCenterContext(
user=context.user,
mandate_id=_mandateId or "",
feature_instance_id=instanceId,
)
chatService = getService("chat", ctx)
folders = chatService.listFolders(parentId=parentId)
return JSONResponse({"folders": folders or []})
except Exception:
return JSONResponse({"folders": []})
@router.get("/{instanceId}/datasources")
@limiter.limit("300/minute")
async def listWorkspaceDataSources(

View file

@ -1268,19 +1268,7 @@ class AppObjects:
result = []
for conn_dict in connections:
try:
# Create UserConnection object
connection = UserConnection(
id=conn_dict["id"],
userId=conn_dict["userId"],
authority=conn_dict.get("authority"),
externalId=conn_dict.get("externalId", ""),
externalUsername=conn_dict.get("externalUsername", ""),
externalEmail=conn_dict.get("externalEmail"),
status=conn_dict.get("status", "pending"),
connectedAt=conn_dict.get("connectedAt"),
lastChecked=conn_dict.get("lastChecked"),
expiresAt=conn_dict.get("expiresAt"),
)
connection = UserConnection.model_validate(conn_dict)
result.append(connection)
except Exception as e:
logger.error(
@ -1293,6 +1281,28 @@ class AppObjects:
logger.error(f"Error getting user connections: {str(e)}")
return []
def getActiveKnowledgeConnections(self) -> List[UserConnection]:
"""Return all UserConnections with knowledgeIngestionEnabled=True and status=active.
Used by the daily re-sync scheduler to determine which connections to re-index.
"""
try:
rows = self.db.getRecordset(
UserConnection,
recordFilter={"knowledgeIngestionEnabled": True, "status": ConnectionStatus.ACTIVE.value},
)
result = []
for row in rows or []:
try:
conn = UserConnection.model_validate(row) if isinstance(row, dict) else row
result.append(conn)
except Exception as _e:
logger.warning(f"getActiveKnowledgeConnections: could not parse row: {_e}")
return result
except Exception as e:
logger.error(f"getActiveKnowledgeConnections failed: {e}")
return []
def getUserConnectionById(self, connectionId: str) -> Optional[UserConnection]:
"""Get a single UserConnection by ID or by reference string (connection:authority:username)."""
try:
@ -1317,18 +1327,21 @@ class AppObjects:
if connections:
conn_dict = connections[0]
return UserConnection(
id=conn_dict["id"],
userId=conn_dict["userId"],
authority=conn_dict.get("authority"),
externalId=conn_dict.get("externalId", ""),
externalUsername=conn_dict.get("externalUsername", ""),
externalEmail=conn_dict.get("externalEmail"),
status=conn_dict.get("status", "pending"),
connectedAt=conn_dict.get("connectedAt"),
lastChecked=conn_dict.get("lastChecked"),
expiresAt=conn_dict.get("expiresAt"),
)
try:
return UserConnection.model_validate(conn_dict)
except Exception:
return UserConnection(
id=conn_dict["id"],
userId=conn_dict["userId"],
authority=conn_dict.get("authority"),
externalId=conn_dict.get("externalId", ""),
externalUsername=conn_dict.get("externalUsername", ""),
externalEmail=conn_dict.get("externalEmail"),
status=conn_dict.get("status", "pending"),
connectedAt=conn_dict.get("connectedAt"),
lastChecked=conn_dict.get("lastChecked"),
expiresAt=conn_dict.get("expiresAt"),
)
return None
except Exception as e:
logger.error(f"Error getting user connection by ID: {str(e)}")
@ -4014,6 +4027,59 @@ class AppObjects:
logger.error(f"Error deleting role {roleId}: {str(e)}")
raise
# -------------------------------------------------------------------------
# Table Grouping (user-defined groups for FormGeneratorTable instances)
# -------------------------------------------------------------------------
def getTableGrouping(self, contextKey: str):
"""
Load the group tree for the current user and the given contextKey.
Returns a TableGrouping instance or None if no grouping has been saved yet.
contextKey identifies the table instance, e.g. "connections", "prompts",
"admin/users", "trustee/{instanceId}/documents".
"""
from modules.datamodels.datamodelPagination import TableGrouping
try:
records = self.db.getRecordset(
TableGrouping,
recordFilter={"userId": str(self.userId), "contextKey": contextKey},
)
if not records:
return None
row = records[0]
return TableGrouping.model_validate(row) if isinstance(row, dict) else row
except Exception as e:
logger.error(f"getTableGrouping failed for user={self.userId} key={contextKey}: {e}")
return None
def upsertTableGrouping(self, contextKey: str, rootGroups: list):
"""
Create or replace the group tree for the current user and contextKey.
rootGroups is a list of TableGroupNode-compatible dicts (the full tree).
Returns the saved TableGrouping instance.
"""
from modules.datamodels.datamodelPagination import TableGrouping
from modules.shared.timeUtils import getUtcTimestamp
try:
existing = self.getTableGrouping(contextKey)
data = {
"id": existing.id if existing else str(uuid.uuid4()),
"userId": str(self.userId),
"contextKey": contextKey,
"rootGroups": rootGroups,
"updatedAt": getUtcTimestamp(),
}
if existing:
self.db.recordModify(TableGrouping, existing.id, data)
else:
self.db.recordCreate(TableGrouping, data)
return TableGrouping.model_validate(data)
except Exception as e:
logger.error(f"upsertTableGrouping failed for user={self.userId} key={contextKey}: {e}")
raise
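A brief usage sketch of the two grouping helpers, assuming an AppObjects interface obtained via interfaceDbApp.getInterface() as done elsewhere in this PR; the contextKey and group names are illustrative.

import uuid
import modules.interfaces.interfaceDbApp as interfaceDbApp

appInterface = interfaceDbApp.getInterface(currentUser)   # currentUser assumed in scope
existing = appInterface.getTableGrouping("connections")
rootGroups = [node.model_dump() for node in existing.rootGroups] if existing else []
rootGroups.append({"id": str(uuid.uuid4()), "name": "Archived", "itemIds": [], "subGroups": []})
saved = appInterface.upsertTableGrouping("connections", rootGroups)
# saved.rootGroups now includes the new group; updatedAt carries the write timestamp.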
# Public Methods

View file

@ -93,6 +93,46 @@ class KnowledgeObjects:
self.db.recordModify(FileContentIndex, fileId, {"status": status})
return True
def deleteFileContentIndexByConnectionId(self, connectionId: str) -> Dict[str, int]:
"""Delete all FileContentIndex rows (and their ContentChunks) for a connection.
Used when a UserConnection is revoked / disconnected so the knowledge corpus
no longer references data the user no longer grants access to. Returns a dict
with counts to support observability logs.
"""
if not connectionId:
return {"indexRows": 0, "chunks": 0}
rows = self.db.getRecordset(
FileContentIndex, recordFilter={"connectionId": connectionId}
)
mandateIds: set = set()
chunkCount = 0
indexCount = 0
for row in rows:
fid = row.get("id") if isinstance(row, dict) else getattr(row, "id", None)
mid = row.get("mandateId") if isinstance(row, dict) else getattr(row, "mandateId", "")
if not fid:
continue
chunks = self.db.getRecordset(ContentChunk, recordFilter={"fileId": fid})
for chunk in chunks:
if self.db.recordDelete(ContentChunk, chunk["id"]):
chunkCount += 1
if self.db.recordDelete(FileContentIndex, fid):
indexCount += 1
if mid:
mandateIds.add(str(mid))
for mid in mandateIds:
try:
from modules.interfaces.interfaceDbBilling import _getRootInterface
_getRootInterface().reconcileMandateStorageBilling(mid)
except Exception as ex:
logger.warning("reconcileMandateStorageBilling after connection purge failed: %s", ex)
return {"indexRows": indexCount, "chunks": chunkCount}
def deleteFileContentIndex(self, fileId: str) -> bool:
"""Delete a FileContentIndex and all associated ContentChunks."""
existing = self.getFileContentIndex(fileId)

View file

@ -20,7 +20,6 @@ from modules.security.rbac import RbacClass
from modules.datamodels.datamodelRbac import AccessRuleContext
from modules.datamodels.datamodelUam import AccessLevel
from modules.datamodels.datamodelFiles import FilePreview, FileItem, FileData
from modules.datamodels.datamodelFileFolder import FileFolder
from modules.datamodels.datamodelUtils import Prompt
from modules.datamodels.datamodelMessaging import (
MessagingSubscription,
@ -1103,15 +1102,12 @@ class ComponentObjects:
return newfileName
counter += 1
def createFile(self, name: str, mimeType: str, content: bytes, folderId: Optional[str] = None) -> FileItem:
def createFile(self, name: str, mimeType: str, content: bytes) -> FileItem:
"""Creates a new file entry if user has permission. Computes fileHash and fileSize from content.
Duplicate check: if a file with the same user + fileHash + fileName already exists,
the existing file is returned instead of creating a new one.
Same hash with different name is allowed (intentional copy by user).
Args:
folderId: Optional parent folder ID. None/empty means the root folder.
"""
if not self.checkRbacPermission(FileItem, "create"):
raise PermissionError("No permission to create files")
@ -1139,11 +1135,6 @@ class ComponentObjects:
else:
scope = "personal"
# Normalize folderId: treat empty string as "no folder" (= root) -> NULL in DB
normalizedFolderId: Optional[str] = folderId
if isinstance(normalizedFolderId, str) and not normalizedFolderId.strip():
normalizedFolderId = None
fileItem = FileItem(
mandateId=mandateId,
featureInstanceId=featureInstanceId,
@ -1152,7 +1143,6 @@ class ComponentObjects:
mimeType=mimeType,
fileSize=fileSize,
fileHash=fileHash,
folderId=normalizedFolderId,
)
# Store in database
@ -1277,382 +1267,47 @@ class ComponentObjects:
self.db.connection.rollback()
raise FileDeletionError(f"Error deleting files in batch: {str(e)}")
# ---- Folder methods ----
_RESERVED_FOLDER_NAMES = {"(Global)"}
def _validateFolderName(self, name: str, parentId: Optional[str], excludeFolderId: Optional[str] = None):
"""Ensures folder name is not reserved and is unique within parent."""
if name in self._RESERVED_FOLDER_NAMES:
raise ValueError(f"Folder name '{name}' is reserved")
if not name or not name.strip():
raise ValueError("Folder name cannot be empty")
existingFolders = self.db.getRecordset(FileFolder, recordFilter={"parentId": parentId or ""})
for f in existingFolders:
if f.get("name") == name and f.get("id") != excludeFolderId:
raise ValueError(f"Folder '{name}' already exists in this directory")
def _isDescendantOf(self, folderId: str, ancestorId: str) -> bool:
"""Checks if folderId is a descendant of ancestorId (circular reference check)."""
visited = set()
currentId = folderId
while currentId:
if currentId == ancestorId:
return True
if currentId in visited:
break
visited.add(currentId)
folders = self.db.getRecordset(FileFolder, recordFilter={"id": currentId})
if not folders:
break
currentId = folders[0].get("parentId")
return False
def _ensureFeatureInstanceFolder(self, featureInstanceId: str, mandateId: str = "") -> Optional[str]:
"""Return the folder ID for a feature instance, creating it on first use.
The folder is named after the feature instance label."""
existing = self.db.getRecordset(
FileFolder,
recordFilter={
"featureInstanceId": featureInstanceId,
"sysCreatedBy": self.userId or "",
},
)
if existing:
return existing[0].get("id")
# Resolve the instance label for the folder name
folderName = featureInstanceId[:8]
def _ensureFeatureInstanceGroup(self, featureInstanceId: str, contextKey: str = "files/list") -> Optional[str]:
"""Return the groupId of the default group for a feature instance.
Creates the group if it doesn't exist yet."""
try:
from modules.datamodels.datamodelFeatures import FeatureInstance
from modules.security.rootAccess import getRootDbAppConnector
dbApp = getRootDbAppConnector()
instances = dbApp.getRecordset(FeatureInstance, recordFilter={"id": featureInstanceId})
if instances:
folderName = instances[0].get("label") or folderName
import modules.interfaces.interfaceDbApp as _appIface
appInterface = _appIface.getInterface(self._currentUser)
existing = appInterface.getTableGrouping(contextKey)
nodes = [n.model_dump() if hasattr(n, 'model_dump') else (n if isinstance(n, dict) else vars(n)) for n in (existing.rootGroups if existing else [])]
# Look for an existing group whose meta.featureInstanceId matches this feature instance
def _find(nds):
for nd in nds:
nid = nd.get("id") if isinstance(nd, dict) else getattr(nd, "id", None)
nmeta = nd.get("meta", {}) if isinstance(nd, dict) else getattr(nd, "meta", {})
if (nmeta or {}).get("featureInstanceId") == featureInstanceId:
return nid
subs = nd.get("subGroups", []) if isinstance(nd, dict) else getattr(nd, "subGroups", [])
result = _find(subs)
if result:
return result
return None
found = _find(nodes)
if found:
return found
# Create new group
import uuid
newId = str(uuid.uuid4())
newGroup = {
"id": newId,
"name": featureInstanceId,
"itemIds": [],
"subGroups": [],
"meta": {"featureInstanceId": featureInstanceId},
}
nodes.append(newGroup)
appInterface.upsertTableGrouping(contextKey, nodes)
return newId
except Exception as e:
logger.warning(f"Could not resolve feature instance label: {e}")
logger.error(f"_ensureFeatureInstanceGroup failed: {e}")
return None
folder = FileFolder(
name=folderName,
parentId=None,
mandateId=mandateId,
featureInstanceId=featureInstanceId,
)
created = self.db.recordCreate(FileFolder, folder)
return created.get("id") if isinstance(created, dict) else getattr(created, "id", None)
def getFolder(self, folderId: str) -> Optional[Dict[str, Any]]:
"""Returns a folder by ID if it belongs to the current user."""
folders = self.db.getRecordset(FileFolder, recordFilter={"id": folderId, "sysCreatedBy": self.userId or ""})
return folders[0] if folders else None
def listFolders(self, parentId: Optional[str] = None) -> List[Dict[str, Any]]:
"""List folders visible to the current user.
Own folders are always returned. Other users' folders are only
returned when they contain files visible to the current user.
Each folder is enriched with ``fileCount``."""
recordFilter = {}
if parentId is not None:
recordFilter["parentId"] = parentId
folders = self.db.getRecordset(FileFolder, recordFilter=recordFilter if recordFilter else None)
if not folders:
return folders
folderIds = [f["id"] for f in folders if f.get("id")]
fileCounts: Dict[str, int] = {}
try:
from modules.interfaces.interfaceRbac import buildFilesScopeWhereClause
scopeClause = buildFilesScopeWhereClause(
self.currentUser, "FileItem", self.db,
self.mandateId, self.featureInstanceId,
[], [],
)
self.db._ensure_connection()
with self.db.connection.cursor() as cursor:
baseQuery = (
'SELECT "folderId", COUNT(*) AS cnt '
'FROM "FileItem" '
'WHERE "folderId" = ANY(%s)'
)
queryValues: list = [folderIds]
if scopeClause:
baseQuery += ' AND (' + scopeClause["condition"] + ')'
queryValues.extend(scopeClause["values"])
baseQuery += ' GROUP BY "folderId"'
cursor.execute(baseQuery, queryValues)
for row in cursor.fetchall():
fileCounts[row["folderId"]] = row["cnt"]
except Exception as e:
logger.warning(f"Could not count files per folder: {e}")
userId = self.userId or ""
result = []
for folder in folders:
fc = fileCounts.get(folder.get("id", ""), 0)
folder["fileCount"] = fc
isOwn = folder.get("sysCreatedBy") == userId
if isOwn or fc > 0:
result.append(folder)
return result
def createFolder(self, name: str, parentId: Optional[str] = None) -> Dict[str, Any]:
"""Create a new folder with unique name validation."""
self._validateFolderName(name, parentId)
folder = FileFolder(
name=name,
parentId=parentId,
mandateId=self.mandateId or "",
featureInstanceId=self.featureInstanceId or "",
)
return self.db.recordCreate(FileFolder, folder)
def renameFolder(self, folderId: str, newName: str) -> bool:
"""Rename a folder with unique name validation."""
folder = self.getFolder(folderId)
if not folder:
raise FileNotFoundError(f"Folder {folderId} not found")
self._validateFolderName(newName, folder.get("parentId"), excludeFolderId=folderId)
return self.db.recordModify(FileFolder, folderId, {"name": newName})
def updateFolder(self, folderId: str, updateData: Dict[str, Any]) -> bool:
"""
Update folder metadata (e.g. ``scope``, ``neutralize``). Owner-only,
same access model as renameFolder/moveFolder. Use ``renameFolder`` for
``name`` changes (uniqueness validation) and ``moveFolder`` for
``parentId`` changes (cycle/uniqueness validation).
"""
if not updateData:
return True
folder = self.getFolder(folderId)
if not folder:
raise FileNotFoundError(f"Folder {folderId} not found")
forbiddenKeys = {"id", "sysCreatedBy", "sysCreatedAt", "sysUpdatedAt"}
cleaned: Dict[str, Any] = {k: v for k, v in updateData.items() if k not in forbiddenKeys}
if "name" in cleaned:
self._validateFolderName(cleaned["name"], folder.get("parentId"), excludeFolderId=folderId)
return self.db.recordModify(FileFolder, folderId, cleaned)
def moveFolder(self, folderId: str, targetParentId: Optional[str] = None) -> bool:
"""Move a folder to a new parent, with circular reference and unique name checks."""
folder = self.getFolder(folderId)
if not folder:
raise FileNotFoundError(f"Folder {folderId} not found")
if targetParentId and self._isDescendantOf(targetParentId, folderId):
raise ValueError("Cannot move folder into its own subtree")
self._validateFolderName(folder.get("name", ""), targetParentId, excludeFolderId=folderId)
return self.db.recordModify(FileFolder, folderId, {"parentId": targetParentId})
def moveFilesBatch(self, fileIds: List[str], targetFolderId: Optional[str] = None) -> Dict[str, Any]:
"""Move multiple files with one SQL update.
Owner can always move; non-owners need RBAC ALL level."""
uniqueIds = [str(fid) for fid in dict.fromkeys(fileIds or []) if fid]
if not uniqueIds:
return {"movedFiles": 0}
if targetFolderId:
targetFolder = self.getFolder(targetFolderId)
if not targetFolder:
raise FileNotFoundError(f"Target folder {targetFolderId} not found")
try:
self.db._ensure_connection()
with self.db.connection.cursor() as cursor:
cursor.execute(
'SELECT "id", "sysCreatedBy" FROM "FileItem" WHERE "id" = ANY(%s)',
(uniqueIds,),
)
rows = cursor.fetchall()
foundIds = {row["id"] for row in rows}
missing = sorted(set(uniqueIds) - foundIds)
if missing:
raise FileNotFoundError(f"Files not found: {missing}")
for row in rows:
self._requireFileWriteAccess(row, row["id"], "update")
accessibleIds = [row["id"] for row in rows]
cursor.execute(
'UPDATE "FileItem" SET "folderId" = %s, "sysModifiedAt" = %s, "sysModifiedBy" = %s '
'WHERE "id" = ANY(%s)',
(targetFolderId, getUtcTimestamp(), self.userId or "", accessibleIds),
)
movedFiles = cursor.rowcount
self.db.connection.commit()
return {"movedFiles": movedFiles}
except Exception as e:
logger.error(f"Error moving files in batch: {e}")
self.db.connection.rollback()
raise FileError(f"Error moving files in batch: {str(e)}")
def moveFoldersBatch(self, folderIds: List[str], targetParentId: Optional[str] = None) -> Dict[str, Any]:
"""Move multiple folders with one SQL update after validation."""
uniqueIds = [str(fid) for fid in dict.fromkeys(folderIds or []) if fid]
if not uniqueIds:
return {"movedFolders": 0}
foldersToMove: List[Dict[str, Any]] = []
for folderId in uniqueIds:
folder = self.getFolder(folderId)
if not folder:
raise FileNotFoundError(f"Folder {folderId} not found")
if targetParentId and self._isDescendantOf(targetParentId, folderId):
raise ValueError("Cannot move folder into its own subtree")
foldersToMove.append(folder)
existingInTarget = self.db.getRecordset(
FileFolder,
recordFilter={"parentId": targetParentId or "", "sysCreatedBy": self.userId or ""},
)
existingNames = {f.get("name"): f.get("id") for f in existingInTarget}
movingNames: Dict[str, str] = {}
movingIds = set(uniqueIds)
for folder in foldersToMove:
name = folder.get("name", "")
folderId = folder.get("id")
if name in movingNames and movingNames[name] != folderId:
raise ValueError(f"Folder '{name}' already exists in this move batch")
movingNames[name] = folderId
existingId = existingNames.get(name)
if existingId and existingId not in movingIds:
raise ValueError(f"Folder '{name}' already exists in target directory")
try:
self.db._ensure_connection()
with self.db.connection.cursor() as cursor:
cursor.execute(
'UPDATE "FileFolder" SET "parentId" = %s, "sysModifiedAt" = %s, "sysModifiedBy" = %s '
'WHERE "id" = ANY(%s) AND "sysCreatedBy" = %s',
(targetParentId, getUtcTimestamp(), self.userId or "", uniqueIds, self.userId or ""),
)
movedFolders = cursor.rowcount
self.db.connection.commit()
return {"movedFolders": movedFolders}
except Exception as e:
logger.error(f"Error moving folders in batch: {e}")
self.db.connection.rollback()
raise FileError(f"Error moving folders in batch: {str(e)}")
def deleteFolder(self, folderId: str, recursive: bool = False) -> Dict[str, Any]:
"""Delete a folder. If recursive, deletes all contents. Returns summary of deletions."""
folder = self.getFolder(folderId)
if not folder:
raise FileNotFoundError(f"Folder {folderId} not found")
childFolders = self.db.getRecordset(FileFolder, recordFilter={"parentId": folderId, "sysCreatedBy": self.userId or ""})
childFiles = self._getFilesByCurrentUser(recordFilter={"folderId": folderId})
if not recursive and (childFolders or childFiles):
raise ValueError(
f"Folder '{folder.get('name')}' is not empty "
f"({len(childFiles)} files, {len(childFolders)} subfolders). "
f"Use recursive=true to delete contents."
)
deletedFiles = 0
deletedFolders = 0
if recursive:
for subFolder in childFolders:
subResult = self.deleteFolder(subFolder["id"], recursive=True)
deletedFiles += subResult.get("deletedFiles", 0)
deletedFolders += subResult.get("deletedFolders", 0)
for childFile in childFiles:
try:
self.deleteFile(childFile["id"])
deletedFiles += 1
except Exception as e:
logger.warning(f"Failed to delete file {childFile['id']} during folder deletion: {e}")
self.db.recordDelete(FileFolder, folderId)
deletedFolders += 1
return {"deletedFiles": deletedFiles, "deletedFolders": deletedFolders}
def deleteFoldersBatch(self, folderIds: List[str], recursive: bool = True) -> Dict[str, Any]:
"""Delete multiple folders and their content in batched SQL calls."""
uniqueIds = [str(fid) for fid in dict.fromkeys(folderIds or []) if fid]
if not uniqueIds:
return {"deletedFiles": 0, "deletedFolders": 0}
if not recursive:
deletedFiles = 0
deletedFolders = 0
for folderId in uniqueIds:
result = self.deleteFolder(folderId, recursive=False)
deletedFiles += result.get("deletedFiles", 0)
deletedFolders += result.get("deletedFolders", 0)
return {"deletedFiles": deletedFiles, "deletedFolders": deletedFolders}
try:
self.db._ensure_connection()
with self.db.connection.cursor() as cursor:
cursor.execute(
'SELECT "id" FROM "FileFolder" WHERE "id" = ANY(%s) AND "sysCreatedBy" = %s',
(uniqueIds, self.userId or ""),
)
rootAccessibleIds = [row["id"] for row in cursor.fetchall()]
if len(rootAccessibleIds) != len(uniqueIds):
missingIds = sorted(set(uniqueIds) - set(rootAccessibleIds))
raise FileNotFoundError(f"Folders not found or not accessible: {missingIds}")
cursor.execute(
"""
WITH RECURSIVE folder_tree AS (
SELECT "id"
FROM "FileFolder"
WHERE "id" = ANY(%s) AND "sysCreatedBy" = %s
UNION ALL
SELECT child."id"
FROM "FileFolder" child
INNER JOIN folder_tree ft ON child."parentId" = ft."id"
WHERE child."sysCreatedBy" = %s
)
SELECT DISTINCT "id" FROM folder_tree
""",
(rootAccessibleIds, self.userId or "", self.userId or ""),
)
allFolderIds = [row["id"] for row in cursor.fetchall()]
cursor.execute(
'SELECT "id" FROM "FileItem" WHERE "folderId" = ANY(%s) AND "sysCreatedBy" = %s',
(allFolderIds, self.userId or ""),
)
allFileIds = [row["id"] for row in cursor.fetchall()]
if allFileIds:
cursor.execute('DELETE FROM "FileData" WHERE "id" = ANY(%s)', (allFileIds,))
cursor.execute(
'DELETE FROM "FileItem" WHERE "id" = ANY(%s) AND "sysCreatedBy" = %s',
(allFileIds, self.userId or ""),
)
deletedFiles = cursor.rowcount
else:
deletedFiles = 0
cursor.execute(
'DELETE FROM "FileFolder" WHERE "id" = ANY(%s) AND "sysCreatedBy" = %s',
(allFolderIds, self.userId or ""),
)
deletedFolders = cursor.rowcount
self.db.connection.commit()
return {"deletedFiles": deletedFiles, "deletedFolders": deletedFolders}
except Exception as e:
logger.error(f"Error deleting folders in batch: {e}")
self.db.connection.rollback()
raise FileDeletionError(f"Error deleting folders in batch: {str(e)}")
def copyFile(self, sourceFileId: str, targetFolderId: Optional[str] = None, newFileName: Optional[str] = None) -> FileItem:
def copyFile(self, sourceFileId: str, newFileName: Optional[str] = None) -> FileItem:
"""Create a full duplicate of a file (FileItem + FileData)."""
sourceFile = self.getFile(sourceFileId)
if not sourceFile:
@ -1665,11 +1320,6 @@ class ComponentObjects:
fileName = newFileName or sourceFile.fileName
copiedFile = self.createFile(fileName, sourceFile.mimeType, sourceData)
if targetFolderId:
self.updateFile(copiedFile.id, {"folderId": targetFolderId})
elif sourceFile.folderId:
self.updateFile(copiedFile.id, {"folderId": sourceFile.folderId})
self.createFileData(copiedFile.id, sourceData)
return copiedFile
@ -1884,18 +1534,14 @@ class ComponentObjects:
logger.error(f"Error getting file content: {str(e)}")
return None
def saveUploadedFile(self, fileContent: bytes, fileName: str, folderId: Optional[str] = None) -> tuple[FileItem, str]:
"""Saves an uploaded file if user has permission.
Args:
folderId: Optional parent folder ID. None means root folder.
"""
def saveUploadedFile(self, fileContent: bytes, fileName: str) -> tuple[FileItem, str]:
"""Saves an uploaded file if user has permission."""
try:
# Check file creation permission
if not self.checkRbacPermission(FileItem, "create"):
raise PermissionError("No permission to upload files")
logger.debug(f"Starting upload process for file: {fileName} (folderId={folderId!r})")
logger.debug(f"Starting upload process for file: {fileName}")
if not isinstance(fileContent, bytes):
logger.error(f"Invalid fileContent type: {type(fileContent)}")
@ -1921,7 +1567,6 @@ class ComponentObjects:
name=fileName,
mimeType=mimeType,
content=fileContent,
folderId=folderId,
)
# Save binary data

View file

@ -204,7 +204,6 @@ TABLE_NAMESPACE = {
# Files - benutzer-eigen
"FileItem": "files",
"FileData": "files",
"FileFolder": "files",
# Automation - benutzer-eigen
"AutomationDefinition": "automation",
"AutomationTemplate": "automation",
@ -529,8 +528,7 @@ def getRecordsetPaginatedWithRBAC(
if val is None:
# val=None in pagination.filters means "match empty/null"
# (same convention as connectorDbPostgre._buildPaginationClauses).
# Covers both historical empty-string values and true NULLs
# e.g. root-folder files where folderId may be "" or NULL.
# Covers both historical empty-string values and true NULLs.
whereConditions.append(f'("{key}" IS NULL OR "{key}"::TEXT = \'\')')
continue
if isinstance(val, dict):
@ -689,8 +687,7 @@ def getDistinctColumnValuesWithRBAC(
if val is None:
# val=None in pagination.filters means "match empty/null"
# (same convention as connectorDbPostgre._buildPaginationClauses).
# Covers both historical empty-string values and true NULLs
# e.g. root-folder files where folderId may be "" or NULL.
# Covers both historical empty-string values and true NULLs.
whereConditions.append(f'("{key}" IS NULL OR "{key}"::TEXT = \'\')')
continue
if isinstance(val, dict):

View file

@ -0,0 +1,240 @@
"""
One-time migration: Convert FileFolder tree + FileItem.folderId into table_groupings.
Run this BEFORE dropping the physical FileFolder table and FileItem.folderId column
from the database (those are separate Alembic/SQL steps).
Usage:
python -m modules.migrations.migrate_folders_to_groups [--dry-run] [--verbose]
Steps:
1. For each distinct (userId, mandateId) combination that has FileFolder records:
a. Build the full folder tree (recursive)
b. Write it as a TableGroupNode tree into table_groupings (contextKey='files/list');
this merges with any existing groups rather than overwriting
c. For each FileItem with a folderId that maps into this tree,
add its id to the matching group's itemIds
2. Print a summary (rows migrated, groups created, files assigned)
3. If not --dry-run: commits the inserts/updates
NOTE: Schema changes (ALTER TABLE DROP COLUMN, DROP TABLE) are intentionally
NOT performed by this script. Run the corresponding Alembic migration
(migrations/versions/xxxx_drop_folder_columns.py) afterwards.
"""
import argparse
import json
import logging
import uuid
from typing import Optional
logger = logging.getLogger(__name__)
# ── Helpers ──────────────────────────────────────────────────────────────────
def _build_tree(folders: list, parent_id: Optional[str]) -> list:
"""Recursively build TableGroupNode-compatible dicts from a flat folder list."""
children = [f for f in folders if f.get("parentId") == parent_id]
result = []
for folder in children:
node = {
"id": str(uuid.uuid4()),
"name": folder["name"],
"itemIds": [],
"subGroups": _build_tree(folders, folder["id"]),
"meta": {"migratedFromFolderId": folder["id"]},
}
result.append(node)
return result
def _assign_files_to_nodes(nodes: list, files_by_folder: dict) -> list:
"""Recursively assign file IDs to group nodes based on folder mapping."""
for node in nodes:
folder_id = (node.get("meta") or {}).get("migratedFromFolderId")
if folder_id and folder_id in files_by_folder:
node["itemIds"] = list(files_by_folder[folder_id])
node["subGroups"] = _assign_files_to_nodes(node.get("subGroups", []), files_by_folder)
return nodes
def _count_items(nodes: list) -> int:
total = 0
for node in nodes:
total += len(node.get("itemIds", []))
total += _count_items(node.get("subGroups", []))
return total
def _now_ts() -> str:
from modules.shared.timeUtils import getUtcTimestamp
return getUtcTimestamp()
# ── Main migration ────────────────────────────────────────────────────────────
def run_migration(dry_run: bool = True, verbose: bool = False):
"""Main migration entry point."""
logging.basicConfig(level=logging.DEBUG if verbose else logging.INFO)
logger.info(f"Starting folder→group migration (dry_run={dry_run})")
from modules.connectors.connectorDbPostgre import getCachedConnector
connector = getCachedConnector()
if not connector or not connector.connection:
logger.error("Could not obtain a DB connection. Aborting.")
return
conn = connector.connection
cur = conn.cursor()
# ── 1. Check that the source tables still exist ───────────────────────────
cur.execute("""
SELECT EXISTS (
SELECT 1 FROM information_schema.tables
WHERE table_name = 'FileFolder'
)
""")
folder_table_exists = cur.fetchone()[0]
cur.execute("""
SELECT EXISTS (
SELECT 1 FROM information_schema.columns
WHERE table_name = 'FileItem' AND column_name = 'folderId'
)
""")
folder_column_exists = cur.fetchone()[0]
if not folder_table_exists and not folder_column_exists:
logger.info("FileFolder table and FileItem.folderId column not found — migration already applied or not needed.")
return
if not folder_table_exists:
logger.warning("FileFolder table missing but FileItem.folderId column still present. Only file assignments will be migrated.")
if not folder_column_exists:
logger.warning("FileItem.folderId column missing but FileFolder table still present. Only group tree structure will be migrated.")
# ── 2. Load all folders ───────────────────────────────────────────────────
folders_by_user: dict = {}
if folder_table_exists:
cur.execute('SELECT "id", "name", "parentId", "sysCreatedBy", "mandateId" FROM "FileFolder"')
for row in cur.fetchall():
fid, fname, parent_id, user_id, mandate_id = row
key = (str(user_id), str(mandate_id) if mandate_id else "")
folders_by_user.setdefault(key, []).append({
"id": fid, "name": fname, "parentId": parent_id,
})
logger.info(f"Loaded folders for {len(folders_by_user)} (user, mandate) combinations")
# ── 3. Load file→folder assignments ──────────────────────────────────────
files_by_key: dict = {}
if folder_column_exists:
cur.execute(
'SELECT "id", "folderId", "sysCreatedBy", "mandateId" FROM "FileItem" WHERE "folderId" IS NOT NULL AND "folderId" != \'\''
)
for row in cur.fetchall():
file_id, folder_id, user_id, mandate_id = row
key = (str(user_id), str(mandate_id) if mandate_id else "")
files_by_key.setdefault(key, {}).setdefault(folder_id, []).append(file_id)
total_files = sum(
sum(len(v) for v in d.values()) for d in files_by_key.values()
)
logger.info(f"Found {total_files} file→folder assignments across {len(files_by_key)} (user, mandate) combos")
# ── 4. Combine and upsert groupings ──────────────────────────────────────
all_keys = set(folders_by_user.keys()) | set(files_by_key.keys())
stats = {"groups_created": 0, "groupings_upserted": 0, "files_assigned": 0}
for key in all_keys:
user_id, mandate_id = key
folders = folders_by_user.get(key, [])
files_by_folder = files_by_key.get(key, {})
# Build tree
roots = _build_tree(folders, None)
roots = _assign_files_to_nodes(roots, files_by_folder)
# Handle files in unknown folders (folder no longer in tree)
known_folder_ids = {f["id"] for f in folders}
for folder_id, file_ids in files_by_folder.items():
if folder_id not in known_folder_ids:
# Orphaned files: put them in an "Orphaned" group
roots.append({
"id": str(uuid.uuid4()),
"name": f"Orphaned (folder {folder_id[:8]}…)",
"itemIds": file_ids,
"subGroups": [],
"meta": {"migratedFromFolderId": folder_id, "orphaned": True},
})
if not roots:
continue
n_items = _count_items(roots)
stats["groups_created"] += len(roots)
stats["files_assigned"] += n_items
context_key = "files/list"
if verbose:
logger.debug(f" user={user_id} mandate={mandate_id}: {len(roots)} root groups, {n_items} files")
if not dry_run:
# Check for existing grouping
cur.execute(
'SELECT "id", "rootGroups" FROM "TableGrouping" WHERE "userId" = %s AND "contextKey" = %s',
(user_id, context_key),
)
existing_row = cur.fetchone()
if existing_row:
existing_id, existing_raw = existing_row
existing_roots = json.loads(existing_raw) if isinstance(existing_raw, str) else (existing_raw or [])
# Merge: append migrated groups (avoid duplicates by migratedFromFolderId)
existing_meta_ids = {
(n.get("meta") or {}).get("migratedFromFolderId")
for n in existing_roots
if (n.get("meta") or {}).get("migratedFromFolderId")
}
new_roots = existing_roots + [
r for r in roots
if (r.get("meta") or {}).get("migratedFromFolderId") not in existing_meta_ids
]
cur.execute(
'UPDATE "TableGrouping" SET "rootGroups" = %s, "updatedAt" = %s WHERE "id" = %s',
(json.dumps(new_roots), _now_ts(), existing_id),
)
else:
new_id = str(uuid.uuid4())
cur.execute(
'INSERT INTO "TableGrouping" ("id", "userId", "contextKey", "rootGroups", "updatedAt") VALUES (%s, %s, %s, %s, %s)',
(new_id, user_id, context_key, json.dumps(roots), _now_ts()),
)
stats["groupings_upserted"] += 1
# ── 5. Summary ────────────────────────────────────────────────────────────
if not dry_run:
conn.commit()
logger.info("Migration committed.")
else:
logger.info("DRY RUN — no changes written.")
logger.info(
f"Summary: groupings_upserted={stats['groupings_upserted']}, "
f"groups_created={stats['groups_created']}, "
f"files_assigned={stats['files_assigned']}"
)
logger.info(
"Next steps (run after verifying data):\n"
" 1. Run Alembic migration to DROP COLUMN FileItem.folderId\n"
" 2. Run Alembic migration to DROP TABLE FileFolder"
)
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Migrate FileFolder tree to table_groupings")
parser.add_argument("--dry-run", action="store_true", default=True, help="Preview only, no DB writes (default)")
parser.add_argument("--execute", action="store_true", help="Actually write to DB (disables dry-run)")
parser.add_argument("--verbose", action="store_true", help="Show per-user details")
args = parser.parse_args()
dry_run = not args.execute
run_migration(dry_run=dry_run, verbose=args.verbose)

View file

@ -152,10 +152,28 @@ async def get_connections(
- GET /api/connections/?mode=filterValues&column=status
- GET /api/connections/?mode=ids
"""
from modules.routes.routeHelpers import handleFilterValuesInMemory, handleIdsInMemory, enrichRowsWithFkLabels
from modules.routes.routeHelpers import (
handleFilterValuesInMemory, handleIdsInMemory, enrichRowsWithFkLabels,
handleGroupingInRequest, applyGroupScopeFilter,
)
CONTEXT_KEY = "connections"
# Parse pagination params early — needed for grouping in all modes
paginationParams = None
if pagination:
try:
paginationDict = json.loads(pagination)
if paginationDict:
paginationDict = normalize_pagination_dict(paginationDict)
paginationParams = PaginationParams(**paginationDict)
except (json.JSONDecodeError, ValueError) as e:
raise HTTPException(status_code=400, detail=f"Invalid pagination parameter: {str(e)}")
interface = getInterface(currentUser)
groupCtx = handleGroupingInRequest(paginationParams, interface, CONTEXT_KEY)
def _buildEnhancedItems():
interface = getInterface(currentUser)
connections = interface.getUserConnections(currentUser.id)
items = []
for connection in connections:
@ -182,6 +200,7 @@ async def get_connections(
try:
items = _buildEnhancedItems()
enrichRowsWithFkLabels(items, UserConnection)
items = applyGroupScopeFilter(items, groupCtx.itemIds)
return handleFilterValuesInMemory(items, column, pagination)
except Exception as e:
logger.error(f"Error getting filter values for connections: {str(e)}")
@ -189,36 +208,19 @@ async def get_connections(
if mode == "ids":
try:
return handleIdsInMemory(_buildEnhancedItems(), pagination)
items = applyGroupScopeFilter(_buildEnhancedItems(), groupCtx.itemIds)
return handleIdsInMemory(items, pagination)
except Exception as e:
logger.error(f"Error getting IDs for connections: {str(e)}")
raise HTTPException(status_code=500, detail=str(e))
try:
interface = getInterface(currentUser)
# NOTE: Cannot use db.getRecordsetPaginated() here because each connection
# is enriched with computed tokenStatus/tokenExpiresAt (requires per-row DB lookup).
# Token refresh also may trigger re-fetch. Connections per user are typically < 10,
# so in-memory pagination is acceptable.
# Parse pagination parameter
paginationParams = None
if pagination:
try:
paginationDict = json.loads(pagination)
if paginationDict:
# Normalize pagination dict (handles top-level "search" field)
paginationDict = normalize_pagination_dict(paginationDict)
paginationParams = PaginationParams(**paginationDict)
except (json.JSONDecodeError, ValueError) as e:
raise HTTPException(
status_code=400,
detail=f"Invalid pagination parameter: {str(e)}"
)
# SECURITY FIX: All users (including admins) can only see their own connections
# This prevents admin from seeing other users' connections and causing confusion
connections = interface.getUserConnections(currentUser.id)
# Perform silent token refresh for expired OAuth connections
@ -226,26 +228,20 @@ async def get_connections(
refresh_result = await token_refresh_service.refresh_expired_tokens(currentUser.id)
if refresh_result.get("refreshed", 0) > 0:
logger.info(f"Silently refreshed {refresh_result['refreshed']} tokens for user {currentUser.id}")
# Re-fetch connections to get updated token status
connections = interface.getUserConnections(currentUser.id)
except Exception as e:
logger.warning(f"Silent token refresh failed for user {currentUser.id}: {str(e)}")
# Continue with original connections even if refresh fails
# Enhance each connection with token status information and convert to dict
enhanced_connections_dict = []
for connection in connections:
# Get token status for this connection
tokenStatus, tokenExpiresAt = getTokenStatusForConnection(interface, connection.id)
# Convert to dict for filtering/sorting
connection_dict = {
"id": connection.id,
"userId": connection.userId,
"authority": connection.authority.value if hasattr(connection.authority, 'value') else str(connection.authority),
"externalId": connection.externalId,
"externalUsername": connection.externalUsername or "",
"externalEmail": connection.externalEmail, # Keep None instead of converting to empty string
"externalEmail": connection.externalEmail,
"status": connection.status.value if hasattr(connection.status, 'value') else str(connection.status),
"connectedAt": connection.connectedAt,
"lastChecked": connection.lastChecked,
@ -256,11 +252,13 @@ async def get_connections(
enhanced_connections_dict.append(connection_dict)
enrichRowsWithFkLabels(enhanced_connections_dict, UserConnection)
enhanced_connections_dict = applyGroupScopeFilter(enhanced_connections_dict, groupCtx.itemIds)
if paginationParams is None:
return {
"items": enhanced_connections_dict,
"pagination": None,
"groupTree": groupCtx.groupTree,
}
# Apply filtering if provided
@ -298,6 +296,7 @@ async def get_connections(
sort=paginationParams.sort,
filters=paginationParams.filters
).model_dump(),
"groupTree": groupCtx.groupTree,
}
except HTTPException:
@ -352,10 +351,17 @@ def create_connection(
status=ConnectionStatus.PENDING # Start with PENDING status
)
# Apply knowledge consent + preferences from request body before persisting
knowledge_enabled = connection_data.get("knowledgeIngestionEnabled")
if isinstance(knowledge_enabled, bool):
connection.knowledgeIngestionEnabled = knowledge_enabled
knowledge_prefs = connection_data.get("knowledgePreferences")
if isinstance(knowledge_prefs, dict):
connection.knowledgePreferences = knowledge_prefs
# Save connection record - models now handle timestamp serialization automatically
interface.db.recordModify(UserConnection, connection.id, connection.model_dump())
return connection
except HTTPException:
@ -586,8 +592,25 @@ def disconnect_service(
detail=routeApiMsg("Connection not found")
)
# Update connection status
connection.status = ConnectionStatus.INACTIVE
# Fire revoked event BEFORE DB status change so knowledge purge and
# status mutation form one logical step; subscribers see the
# connection as it was. INACTIVE does not exist on the enum — REVOKED
# is the correct terminal-but-retained state (deleted rows are
# handled in DELETE /{id}).
try:
from modules.shared.callbackRegistry import callbackRegistry
callbackRegistry.trigger(
"connection.revoked",
connectionId=connectionId,
authority=str(getattr(connection.authority, "value", connection.authority) or ""),
userId=str(currentUser.id),
reason="disconnected",
)
except Exception as _cbErr:
logger.warning("connection.revoked callback failed for %s: %s", connectionId, _cbErr)
connection.status = ConnectionStatus.REVOKED
connection.lastChecked = getUtcTimestamp()
# Update connection record - models now handle timestamp serialization automatically
@ -636,6 +659,23 @@ def delete_connection(
detail=routeApiMsg("Connection not found")
)
# Fire revoked event BEFORE the row disappears so consumers still
# have authority/connection context for observability; purge itself
# targets FileContentIndex rows by connectionId which are unaffected
# by the UserConnection delete.
try:
from modules.shared.callbackRegistry import callbackRegistry
callbackRegistry.trigger(
"connection.revoked",
connectionId=connectionId,
authority=str(getattr(connection.authority, "value", connection.authority) or ""),
userId=str(currentUser.id),
reason="deleted",
)
except Exception as _cbErr:
logger.warning("connection.revoked callback failed for %s: %s", connectionId, _cbErr)
# Remove the connection - only need connectionId since permissions are verified
interface.removeUserConnection(connectionId)

View file

@ -12,7 +12,6 @@ from modules.auth import limiter, getCurrentUser, getRequestContext, RequestCont
# Import interfaces
import modules.interfaces.interfaceDbManagement as interfaceDbManagement
from modules.datamodels.datamodelFiles import FileItem, FilePreview
from modules.datamodels.datamodelFileFolder import FileFolder
from modules.shared.attributeUtils import getModelAttributeDefinitions
from modules.datamodels.datamodelUam import User
from modules.datamodels.datamodelPagination import PaginationParams, PaginatedResponse, PaginationMetadata, normalize_pagination_dict
@ -77,7 +76,7 @@ async def _autoIndexFile(fileId: str, fileName: str, mimeType: str, user):
"""Background task: pre-scan + extraction + knowledge indexing.
Step 1: Structure Pre-Scan (AI-free) -> FileContentIndex (persisted)
Step 2: Content extraction via runExtraction -> ContentParts
Step 3: KnowledgeService.indexFile -> chunking + embedding -> Knowledge Store"""
Step 3: KnowledgeService.requestIngestion -> idempotent chunking + embedding -> Knowledge Store"""
userId = user.id if hasattr(user, "id") else str(user)
try:
mgmtInterface = interfaceDbManagement.getInterface(user)
@ -122,9 +121,30 @@ async def _autoIndexFile(fileId: str, fileName: str, mimeType: str, user):
f"{contentIndex.totalObjects} objects"
)
# Persist FileContentIndex immediately
# Persist FileContentIndex immediately.
# IMPORTANT: preserve `_ingestion` metadata and `status="indexed"` from any
# prior successful run — otherwise this upsert wipes the idempotency cache
# and requestIngestion cannot detect duplicates (AC4 breaks).
from modules.interfaces.interfaceDbKnowledge import getInterface as getKnowledgeInterface
knowledgeDb = getKnowledgeInterface()
try:
_existing = knowledgeDb.getFileContentIndex(fileId)
except Exception:
_existing = None
if _existing:
_existingStruct = (
_existing.get("structure") if isinstance(_existing, dict)
else getattr(_existing, "structure", {})
) or {}
_existingStatus = (
_existing.get("status") if isinstance(_existing, dict)
else getattr(_existing, "status", "")
) or ""
if "_ingestion" in _existingStruct:
contentIndex.structure = dict(contentIndex.structure or {})
contentIndex.structure["_ingestion"] = _existingStruct["_ingestion"]
if _existingStatus == "indexed":
contentIndex.status = "indexed"
knowledgeDb.upsertFileContentIndex(contentIndex)
# Step 2: Content extraction (AI-free, produces ContentParts)
@ -134,7 +154,10 @@ async def _autoIndexFile(fileId: str, fileName: str, mimeType: str, user):
extractorRegistry = ExtractorRegistry()
chunkerRegistry = ChunkerRegistry()
options = ExtractionOptions()
# mergeStrategy=None: keep per-page / per-section granularity for RAG ingestion.
# The default MergeStrategy concatenates all text parts into a single blob, which
# collapses a 500-page PDF into one ContentChunk and destroys semantic retrieval.
options = ExtractionOptions(mergeStrategy=None)
extracted = runExtraction(
extractorRegistry, chunkerRegistry,
@ -181,15 +204,21 @@ async def _autoIndexFile(fileId: str, fileName: str, mimeType: str, user):
)
knowledgeService = getService("knowledge", ctx)
await knowledgeService.indexFile(
fileId=fileId,
fileName=fileName,
mimeType=mimeType,
userId=userId,
featureInstanceId=str(feature_instance_id) if feature_instance_id else "",
mandateId=str(mandate_id) if mandate_id else "",
contentObjects=contentObjects,
structure=contentIndex.structure,
from modules.serviceCenter.services.serviceKnowledge.mainServiceKnowledge import IngestionJob
await knowledgeService.requestIngestion(
IngestionJob(
sourceKind="file",
sourceId=fileId,
fileName=fileName,
mimeType=mimeType,
userId=userId,
featureInstanceId=str(feature_instance_id) if feature_instance_id else "",
mandateId=str(mandate_id) if mandate_id else "",
contentObjects=contentObjects,
structure=contentIndex.structure,
provenance={"lane": "upload", "route": "routeDataFiles._autoIndexFile"},
)
)
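# requestIngestion checks the `_ingestion` metadata preserved in Step 1, so a
# repeat run for an unchanged file can be recognised as a duplicate rather than
# being chunked and embedded again (see the FileContentIndex handling above).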
# Re-acquire interface after await to avoid stale user context from the singleton
@ -249,7 +278,6 @@ def get_files(
try:
paginationDict = json.loads(pagination)
if paginationDict:
# Normalize pagination dict (handles top-level "search" field)
paginationDict = normalize_pagination_dict(paginationDict)
paginationParams = PaginationParams(**paginationDict)
except (json.JSONDecodeError, ValueError) as e:
@ -261,47 +289,39 @@ def get_files(
from modules.routes.routeHelpers import (
handleIdsMode,
handleFilterValuesInMemory,
handleGroupingInRequest, applyGroupScopeFilter,
)
import modules.interfaces.interfaceDbApp as _appIface
managementInterface = interfaceDbManagement.getInterface(
currentUser,
mandateId=str(context.mandateId) if context.mandateId else None,
featureInstanceId=str(context.featureInstanceId) if context.featureInstanceId else None
)
appInterface = _appIface.getInterface(currentUser)
groupCtx = handleGroupingInRequest(paginationParams, appInterface, "files/list")
def _filesToDicts(fileItems):
return [f.model_dump() if hasattr(f, "model_dump") else (dict(f) if not isinstance(f, dict) else f) for f in fileItems]
if mode == "filterValues":
if not column:
raise HTTPException(status_code=400, detail="column parameter required for mode=filterValues")
allFiles = managementInterface.getAllFiles()
items = allFiles if isinstance(allFiles, list) else (allFiles.items if hasattr(allFiles, "items") else [])
itemDicts = [f.model_dump() if hasattr(f, "model_dump") else (dict(f) if not isinstance(f, dict) else f) for f in items]
itemDicts = _filesToDicts(items)
enrichRowsWithFkLabels(itemDicts, FileItem)
itemDicts = applyGroupScopeFilter(itemDicts, groupCtx.itemIds)
return handleFilterValuesInMemory(itemDicts, column, pagination)
if mode == "ids":
recordFilter = {"sysCreatedBy": managementInterface.userId}
return handleIdsMode(managementInterface.db, FileItem, pagination, recordFilter)
recordFilter = None
if paginationParams and paginationParams.filters and "folderId" in paginationParams.filters:
fVal = paginationParams.filters.get("folderId")
# For a concrete folderId we use recordFilter (exact equality).
# For null / empty (= "root") we keep it in pagination.filters so the
# connector applies `IS NULL OR = ''`, because files predating the folderId
# fix were stored with an empty string instead of NULL.
if fVal is None or (isinstance(fVal, str) and fVal.strip() == ""):
paginationParams.filters["folderId"] = None
else:
paginationParams.filters.pop("folderId")
recordFilter = {"folderId": fVal}
result = managementInterface.getAllFiles(pagination=paginationParams, recordFilter=recordFilter)
def _filesToDicts(items):
return [f.model_dump() if hasattr(f, "model_dump") else (dict(f) if not isinstance(f, dict) else f) for f in items]
result = managementInterface.getAllFiles(pagination=paginationParams)
if paginationParams:
enriched = enrichRowsWithFkLabels(_filesToDicts(result.items), FileItem)
enriched = applyGroupScopeFilter(enrichRowsWithFkLabels(_filesToDicts(result.items), FileItem), groupCtx.itemIds)
return {
"items": enriched,
"pagination": PaginationMetadata(
@ -312,11 +332,12 @@ def get_files(
sort=paginationParams.sort,
filters=paginationParams.filters
).model_dump(),
"groupTree": groupCtx.groupTree,
}
else:
items = result if isinstance(result, list) else (result.items if hasattr(result, "items") else [result])
enriched = enrichRowsWithFkLabels(_filesToDicts(items), FileItem)
return {"items": enriched, "pagination": None}
enriched = applyGroupScopeFilter(enrichRowsWithFkLabels(_filesToDicts(items), FileItem), groupCtx.itemIds)
return {"items": enriched, "pagination": None, "groupTree": groupCtx.groupTree}
except HTTPException:
raise
except Exception as e:
@ -327,6 +348,36 @@ def get_files(
)
def _addFileToGroup(appInterface, fileId: str, groupId: str, contextKey: str = "files/list"):
"""Add a file to a group in the persisted groupTree (upsert)."""
from modules.routes.routeHelpers import _collectItemIds
try:
existing = appInterface.getTableGrouping(contextKey)
if not existing:
return
nodes = [n.model_dump() if hasattr(n, 'model_dump') else n for n in existing.rootGroups]
def _add(nds):
for nd in nds:
nid = nd.get("id") if isinstance(nd, dict) else getattr(nd, "id", None)
if nid == groupId:
itemIds = list(nd.get("itemIds", []) if isinstance(nd, dict) else getattr(nd, "itemIds", []))
if fileId not in itemIds:
itemIds.append(fileId)
if isinstance(nd, dict):
nd["itemIds"] = itemIds
else:
nd.itemIds = itemIds
return True
subs = nd.get("subGroups", []) if isinstance(nd, dict) else getattr(nd, "subGroups", [])
if _add(subs):
return True
return False
_add(nodes)
appInterface.upsertTableGrouping(contextKey, nodes)
except Exception as e:
logger.warning(f"_addFileToGroup failed: {e}")
@router.post("/upload", status_code=status.HTTP_201_CREATED)
@limiter.limit("10/minute")
async def upload_file(
@ -334,7 +385,7 @@ async def upload_file(
file: UploadFile = File(...),
workflowId: Optional[str] = Form(None),
featureInstanceId: Optional[str] = Form(None),
folderId: Optional[str] = Form(None),
groupId: Optional[str] = Form(None),
currentUser: User = Depends(getCurrentUser),
context: RequestContext = Depends(getRequestContext),
) -> JSONResponse:
@ -359,29 +410,20 @@ async def upload_file(
detail=f"File too large. Maximum size: {interfaceDbManagement.APP_CONFIG.get('File_Management_MAX_UPLOAD_SIZE_MB')}MB"
)
# Normalize folderId: empty string / "null" / "root" → None (root folder)
normalizedFolderId: Optional[str] = folderId
if isinstance(normalizedFolderId, str):
trimmed = normalizedFolderId.strip()
if not trimmed or trimmed.lower() in {"null", "none", "root"}:
normalizedFolderId = None
else:
normalizedFolderId = trimmed
# Save file via LucyDOM interface in the database
fileItem, duplicateType = managementInterface.saveUploadedFile(
fileContent, file.filename, folderId=normalizedFolderId
fileContent, file.filename
)
if featureInstanceId and not fileItem.featureInstanceId:
managementInterface.updateFile(fileItem.id, {"featureInstanceId": featureInstanceId})
fileItem.featureInstanceId = featureInstanceId
# For exact duplicates we keep the existing record, but move it into the
# target folder so the user actually sees their upload land where they expect.
if duplicateType == "exact_duplicate" and normalizedFolderId != getattr(fileItem, "folderId", None):
managementInterface.updateFile(fileItem.id, {"folderId": normalizedFolderId})
fileItem.folderId = normalizedFolderId
# Add to group if groupId was provided
if groupId:
import modules.interfaces.interfaceDbApp as _appIface
appInterface = _appIface.getInterface(currentUser)
_addFileToGroup(appInterface, fileItem.id, groupId)
# Determine response message based on duplicate type
if duplicateType == "exact_duplicate":
@ -447,347 +489,6 @@ async def upload_file(
detail=f"Error during file upload: {str(e)}"
)
# ── Folder endpoints (MUST be before /{fileId} catch-all) ─────────────────────
@router.get("/folders", response_model=List[Dict[str, Any]])
@limiter.limit("30/minute")
def list_folders(
request: Request,
parentId: Optional[str] = Query(None, description="Parent folder ID (omit for all folders)"),
currentUser: User = Depends(getCurrentUser),
context: RequestContext = Depends(getRequestContext)
) -> List[Dict[str, Any]]:
"""List folders for the current user."""
try:
mgmt = interfaceDbManagement.getInterface(
currentUser,
mandateId=str(context.mandateId) if context.mandateId else None,
featureInstanceId=str(context.featureInstanceId) if context.featureInstanceId else None,
)
if parentId is not None:
return mgmt.listFolders(parentId=parentId)
return mgmt.listFolders()
except Exception as e:
logger.error(f"Error listing folders: {e}")
raise HTTPException(status_code=500, detail=str(e))
@router.post("/folders", status_code=status.HTTP_201_CREATED)
@limiter.limit("10/minute")
def create_folder(
request: Request,
body: Dict[str, Any] = Body(...),
currentUser: User = Depends(getCurrentUser),
context: RequestContext = Depends(getRequestContext)
) -> Dict[str, Any]:
"""Create a new folder."""
name = body.get("name", "")
parentId = body.get("parentId")
if not name:
raise HTTPException(status_code=400, detail=routeApiMsg("name is required"))
try:
mgmt = interfaceDbManagement.getInterface(
currentUser,
mandateId=str(context.mandateId) if context.mandateId else None,
featureInstanceId=str(context.featureInstanceId) if context.featureInstanceId else None,
)
return mgmt.createFolder(name=name, parentId=parentId)
except ValueError as e:
raise HTTPException(status_code=400, detail=str(e))
except Exception as e:
logger.error(f"Error creating folder: {e}")
raise HTTPException(status_code=500, detail=str(e))
@router.put("/folders/{folderId}")
@limiter.limit("10/minute")
def rename_folder(
request: Request,
folderId: str = Path(...),
body: Dict[str, Any] = Body(...),
currentUser: User = Depends(getCurrentUser),
context: RequestContext = Depends(getRequestContext)
) -> Dict[str, Any]:
"""Rename a folder."""
newName = body.get("name", "")
if not newName:
raise HTTPException(status_code=400, detail=routeApiMsg("name is required"))
try:
mgmt = interfaceDbManagement.getInterface(
currentUser,
mandateId=str(context.mandateId) if context.mandateId else None,
featureInstanceId=str(context.featureInstanceId) if context.featureInstanceId else None,
)
mgmt.renameFolder(folderId, newName)
return {"success": True, "folderId": folderId, "name": newName}
except ValueError as e:
raise HTTPException(status_code=400, detail=str(e))
except Exception as e:
logger.error(f"Error renaming folder: {e}")
raise HTTPException(status_code=500, detail=str(e))
@router.delete("/folders/{folderId}")
@limiter.limit("10/minute")
def delete_folder(
request: Request,
folderId: str = Path(...),
recursive: bool = Query(False, description="Delete folder contents recursively"),
currentUser: User = Depends(getCurrentUser),
context: RequestContext = Depends(getRequestContext)
) -> Dict[str, Any]:
"""Delete a folder. Use recursive=true to delete non-empty folders."""
try:
mgmt = interfaceDbManagement.getInterface(
currentUser,
mandateId=str(context.mandateId) if context.mandateId else None,
featureInstanceId=str(context.featureInstanceId) if context.featureInstanceId else None,
)
return mgmt.deleteFolder(folderId, recursive=recursive)
except ValueError as e:
raise HTTPException(status_code=400, detail=str(e))
except Exception as e:
logger.error(f"Error deleting folder: {e}")
raise HTTPException(status_code=500, detail=str(e))
@router.post("/folders/{folderId}/move")
@limiter.limit("10/minute")
def move_folder(
request: Request,
folderId: str = Path(...),
body: Dict[str, Any] = Body(...),
currentUser: User = Depends(getCurrentUser),
context: RequestContext = Depends(getRequestContext)
) -> Dict[str, Any]:
"""Move a folder to a new parent."""
targetParentId = body.get("targetParentId")
try:
mgmt = interfaceDbManagement.getInterface(
currentUser,
mandateId=str(context.mandateId) if context.mandateId else None,
featureInstanceId=str(context.featureInstanceId) if context.featureInstanceId else None,
)
mgmt.moveFolder(folderId, targetParentId)
return {"success": True, "folderId": folderId, "parentId": targetParentId}
except ValueError as e:
raise HTTPException(status_code=400, detail=str(e))
except Exception as e:
logger.error(f"Error moving folder: {e}")
raise HTTPException(status_code=500, detail=str(e))
@router.patch("/folders/{folderId}/scope")
@limiter.limit("10/minute")
def _updateFolderScope(
request: Request,
folderId: str = Path(..., description="ID of the folder"),
scope: str = Body(..., embed=True),
context: RequestContext = Depends(getRequestContext),
) -> Dict[str, Any]:
"""Update the scope of a folder. Propagates to all files inside (recursively). Global scope requires sysAdmin."""
validScopes = {"personal", "featureInstance", "mandate", "global"}
if scope not in validScopes:
raise HTTPException(status_code=400, detail=f"Invalid scope: {scope}. Must be one of {validScopes}")
if scope == "global" and not context.isSysAdmin:
raise HTTPException(status_code=403, detail=routeApiMsg("Only sysadmins can set global scope"))
try:
mgmt = interfaceDbManagement.getInterface(
context.user,
mandateId=str(context.mandateId) if context.mandateId else None,
featureInstanceId=str(context.featureInstanceId) if context.featureInstanceId else None,
)
folder = mgmt.getFolder(folderId)
if not folder:
raise HTTPException(status_code=404, detail=routeApiMsg("Folder not found"))
mgmt.updateFolder(folderId, {"scope": scope})
fileIds = _collectFolderFileIds(mgmt, folderId)
for fid in fileIds:
try:
mgmt.updateFile(fid, {"scope": scope})
except Exception as e:
logger.error("Folder scope propagation: failed to update file %s: %s", fid, e)
logger.info("Updated scope=%s for folder %s: %d files affected", scope, folderId, len(fileIds))
return {"folderId": folderId, "scope": scope, "filesUpdated": len(fileIds)}
except HTTPException:
raise
except Exception as e:
logger.error(f"Error updating folder scope: {e}")
raise HTTPException(status_code=500, detail=str(e))
@router.patch("/folders/{folderId}/neutralize")
@limiter.limit("10/minute")
def updateFolderNeutralize(
request: Request,
background_tasks: BackgroundTasks,
folderId: str = Path(..., description="ID of the folder"),
neutralize: bool = Body(..., embed=True),
context: RequestContext = Depends(getRequestContext),
) -> Dict[str, Any]:
"""Toggle neutralization on a folder. Propagates to all files inside (recursively).
When turning ON: all files in the folder get ``neutralize=True``, their
knowledge indexes are purged synchronously, and background re-indexing
is triggered.
When turning OFF: files revert to ``neutralize=False`` unless they were
individually marked (not implemented yet -- all are reverted).
"""
try:
mgmt = interfaceDbManagement.getInterface(
context.user,
mandateId=str(context.mandateId) if context.mandateId else None,
featureInstanceId=str(context.featureInstanceId) if context.featureInstanceId else None,
)
folder = mgmt.getFolder(folderId)
if not folder:
raise HTTPException(status_code=404, detail=routeApiMsg("Folder not found"))
mgmt.updateFolder(folderId, {"neutralize": neutralize})
fileIds = _collectFolderFileIds(mgmt, folderId)
logger.info("Folder neutralize toggle %s for folder %s: %d files affected", neutralize, folderId, len(fileIds))
from modules.interfaces.interfaceDbKnowledge import getInterface as getKnowledgeInterface
knowledgeDb = getKnowledgeInterface()
for fid in fileIds:
try:
mgmt.updateFile(fid, {"neutralize": neutralize})
if neutralize:
try:
knowledgeDb.deleteFileContentIndex(fid)
except Exception as e:
logger.warning("Folder neutralize: failed to purge index for file %s: %s", fid, e)
else:
try:
from modules.datamodels.datamodelKnowledge import FileContentIndex
indices = knowledgeDb.db.getRecordset(FileContentIndex, recordFilter={"id": fid})
for idx in indices:
idxId = idx.get("id") if isinstance(idx, dict) else getattr(idx, "id", None)
if idxId:
knowledgeDb.db.recordModify(FileContentIndex, idxId, {
"neutralizationStatus": "original",
"isNeutralized": False,
})
except Exception as e:
logger.warning("Folder neutralize OFF: metadata update failed for %s: %s", fid, e)
except Exception as e:
logger.error("Folder neutralize: failed to update file %s: %s", fid, e)
for fid in fileIds:
fileMeta = mgmt.getFile(fid)
if fileMeta:
fn = fileMeta.fileName if hasattr(fileMeta, "fileName") else fileMeta.get("fileName", "")
mt = fileMeta.mimeType if hasattr(fileMeta, "mimeType") else fileMeta.get("mimeType", "")
async def _reindex(fileId=fid, fileName=fn, mimeType=mt):
try:
await _autoIndexFile(fileId=fileId, fileName=fileName, mimeType=mimeType, user=context.user)
except Exception as ex:
logger.error("Folder neutralize re-index failed for %s: %s", fileId, ex)
background_tasks.add_task(_reindex)
return {"folderId": folderId, "neutralize": neutralize, "filesUpdated": len(fileIds)}
except HTTPException:
raise
except Exception as e:
logger.error(f"Error updating folder neutralize flag: {e}")
raise HTTPException(status_code=500, detail=str(e))
def _collectFolderFileIds(mgmt, folderId: str) -> List[str]:
"""Recursively collect all file IDs in a folder and its sub-folders."""
fileIds = []
try:
files = mgmt.listFiles(folderId=folderId)
if isinstance(files, dict):
files = files.get("files", [])
for f in (files or []):
fid = f.get("id") if isinstance(f, dict) else getattr(f, "id", None)
if fid:
fileIds.append(fid)
except Exception as e:
logger.warning("_collectFolderFileIds: listFiles failed for folder %s: %s", folderId, e)
try:
subFolders = mgmt.listFolders(parentId=folderId)
for sf in (subFolders or []):
sfId = sf.get("id") if isinstance(sf, dict) else getattr(sf, "id", None)
if sfId:
fileIds.extend(_collectFolderFileIds(mgmt, sfId))
except Exception as e:
logger.warning("_collectFolderFileIds: listFolders failed for folder %s: %s", folderId, e)
return fileIds
@router.get("/folders/{folderId}/download")
@limiter.limit("10/minute")
def download_folder(
request: Request,
folderId: str = Path(..., description="ID of the folder to download as ZIP"),
currentUser: User = Depends(getCurrentUser),
context: RequestContext = Depends(getRequestContext)
) -> Response:
"""Download a folder (including subfolders) as a ZIP archive."""
import io
import zipfile
import urllib.parse
try:
mgmt = interfaceDbManagement.getInterface(
currentUser,
mandateId=str(context.mandateId) if context.mandateId else None,
featureInstanceId=str(context.featureInstanceId) if context.featureInstanceId else None,
)
folder = mgmt.getFolder(folderId)
if not folder:
raise HTTPException(status_code=404, detail=f"Folder {folderId} not found")
folderName = folder.get("name", "download")
def _collectFiles(parentId: str, pathPrefix: str):
"""Recursively collect (zipPath, fileId) tuples."""
entries = []
for f in mgmt._getFilesByCurrentUser(recordFilter={"folderId": parentId}):
fname = f.get("fileName") or f.get("name") or f.get("id", "file")
entries.append((f"{pathPrefix}{fname}", f["id"]))
for sub in mgmt.listFolders(parentId=parentId):
subName = sub.get("name", sub["id"])
entries.extend(_collectFiles(sub["id"], f"{pathPrefix}{subName}/"))
return entries
fileEntries = _collectFiles(folderId, "")
if not fileEntries:
raise HTTPException(status_code=404, detail=routeApiMsg("Folder is empty"))
buf = io.BytesIO()
with zipfile.ZipFile(buf, "w", zipfile.ZIP_DEFLATED) as zf:
for zipPath, fileId in fileEntries:
data = mgmt.getFileData(fileId)
if data:
zf.writestr(zipPath, data)
buf.seek(0)
zipBytes = buf.getvalue()
encodedName = urllib.parse.quote(f"{folderName}.zip")
return Response(
content=zipBytes,
media_type="application/zip",
headers={
"Content-Disposition": f"attachment; filename*=UTF-8''{encodedName}"
}
)
except HTTPException:
raise
except Exception as e:
logger.error(f"Error downloading folder as ZIP: {e}")
raise HTTPException(status_code=500, detail=f"Error downloading folder: {str(e)}")
@router.post("/batch-delete")
@ -798,13 +499,11 @@ def batch_delete_items(
currentUser: User = Depends(getCurrentUser),
context: RequestContext = Depends(getRequestContext)
) -> Dict[str, Any]:
"""Batch delete files/folders with a single SQL-backed operation per type."""
"""Batch delete files."""
fileIds = body.get("fileIds") or []
folderIds = body.get("folderIds") or []
recursiveFolders = bool(body.get("recursiveFolders", True))
if not isinstance(fileIds, list) or not isinstance(folderIds, list):
raise HTTPException(status_code=400, detail=routeApiMsg("fileIds and folderIds must be arrays"))
if not isinstance(fileIds, list):
raise HTTPException(status_code=400, detail=routeApiMsg("fileIds must be an array"))
try:
mgmt = interfaceDbManagement.getInterface(
@ -813,17 +512,12 @@ def batch_delete_items(
featureInstanceId=str(context.featureInstanceId) if context.featureInstanceId else None,
)
result = {"deletedFiles": 0, "deletedFolders": 0}
result = {"deletedFiles": 0}
if fileIds:
fileResult = mgmt.deleteFilesBatch(fileIds)
result["deletedFiles"] += fileResult.get("deletedFiles", 0)
if folderIds:
folderResult = mgmt.deleteFoldersBatch(folderIds, recursive=recursiveFolders)
result["deletedFiles"] += folderResult.get("deletedFiles", 0)
result["deletedFolders"] += folderResult.get("deletedFolders", 0)
return result
except ValueError as e:
raise HTTPException(status_code=400, detail=str(e))
@ -832,45 +526,189 @@ def batch_delete_items(
raise HTTPException(status_code=500, detail=str(e))
@router.post("/batch-move")
@limiter.limit("10/minute")
def batch_move_items(
request: Request,
body: Dict[str, Any] = Body(...),
currentUser: User = Depends(getCurrentUser),
context: RequestContext = Depends(getRequestContext)
) -> Dict[str, Any]:
"""Batch move files/folders with a single SQL-backed operation per type."""
fileIds = body.get("fileIds") or []
folderIds = body.get("folderIds") or []
targetFolderId = body.get("targetFolderId")
targetParentId = body.get("targetParentId")
if not isinstance(fileIds, list) or not isinstance(folderIds, list):
raise HTTPException(status_code=400, detail=routeApiMsg("fileIds and folderIds must be arrays"))
# ── Group bulk endpoints ──────────────────────────────────────────────────────
def _get_group_item_ids(contextKey: str, groupId: str, appInterface) -> set:
"""Collect all file IDs in a group and its sub-groups from the stored groupTree."""
from modules.routes.routeHelpers import _collectItemIds
try:
mgmt = interfaceDbManagement.getInterface(
existing = appInterface.getTableGrouping(contextKey)
if not existing:
return set()
nodes = [n.model_dump() if hasattr(n, 'model_dump') else n for n in existing.rootGroups]
result = _collectItemIds(nodes, groupId)
return result or set()
except Exception as e:
logger.error(f"_get_group_item_ids failed for groupId={groupId}: {e}")
return set()
@router.patch("/groups/{groupId}/scope")
@limiter.limit("60/minute")
def patch_group_scope(
request: Request,
groupId: str = Path(..., description="Group ID"),
body: dict = Body(...),
currentUser: User = Depends(getCurrentUser),
context: RequestContext = Depends(getRequestContext),
):
"""Set scope for all files in a group (recursive)."""
scope = body.get("scope")
if not scope:
raise HTTPException(status_code=400, detail="scope is required")
try:
import modules.interfaces.interfaceDbApp as _appIface
managementInterface = interfaceDbManagement.getInterface(
currentUser,
mandateId=str(context.mandateId) if context.mandateId else None,
featureInstanceId=str(context.featureInstanceId) if context.featureInstanceId else None,
)
result = {"movedFiles": 0, "movedFolders": 0}
if fileIds:
fileResult = mgmt.moveFilesBatch(fileIds, targetFolderId=targetFolderId)
result["movedFiles"] += fileResult.get("movedFiles", 0)
if folderIds:
folderResult = mgmt.moveFoldersBatch(folderIds, targetParentId=targetParentId)
result["movedFolders"] += folderResult.get("movedFolders", 0)
return result
except ValueError as e:
raise HTTPException(status_code=400, detail=str(e))
appInterface = _appIface.getInterface(currentUser)
fileIds = _get_group_item_ids("files/list", groupId, appInterface)
updated = 0
for fid in fileIds:
try:
managementInterface.updateFile(fid, {"scope": scope})
updated += 1
except Exception as e:
logger.error(f"patch_group_scope: failed to update file {fid}: {e}")
return {"groupId": groupId, "scope": scope, "filesUpdated": updated}
except HTTPException:
raise
except Exception as e:
logger.error(f"Error in batch move: {e}")
logger.error(f"patch_group_scope error: {e}")
raise HTTPException(status_code=500, detail=str(e))
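# Illustrative call (router prefix omitted; the example scope value follows the
# removed folder-scope endpoint above, e.g. "personal", "mandate", "global"):
#   PATCH .../groups/{groupId}/scope   body: {"scope": "mandate"}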
@router.patch("/groups/{groupId}/neutralize")
@limiter.limit("60/minute")
def patch_group_neutralize(
request: Request,
groupId: str = Path(..., description="Group ID"),
body: dict = Body(...),
currentUser: User = Depends(getCurrentUser),
context: RequestContext = Depends(getRequestContext),
):
"""Toggle neutralize for all files in a group (recursive, incl. knowledge purge/reindex)."""
neutralize = body.get("neutralize")
if neutralize is None:
raise HTTPException(status_code=400, detail="neutralize is required")
try:
import modules.interfaces.interfaceDbApp as _appIface
managementInterface = interfaceDbManagement.getInterface(
currentUser,
mandateId=str(context.mandateId) if context.mandateId else None,
featureInstanceId=str(context.featureInstanceId) if context.featureInstanceId else None,
)
appInterface = _appIface.getInterface(currentUser)
fileIds = _get_group_item_ids("files/list", groupId, appInterface)
updated = 0
for fid in fileIds:
try:
managementInterface.updateFile(fid, {"neutralize": neutralize})
if not neutralize:
try:
from modules.interfaces import interfaceDbKnowledge
kIface = interfaceDbKnowledge.getInterface(currentUser)
kIface.purgeFileKnowledge(fid)
except Exception as ke:
logger.warning(f"patch_group_neutralize: knowledge purge failed for {fid}: {ke}")
updated += 1
except Exception as e:
logger.error(f"patch_group_neutralize: failed for file {fid}: {e}")
return {"groupId": groupId, "neutralize": neutralize, "filesUpdated": updated}
except HTTPException:
raise
except Exception as e:
logger.error(f"patch_group_neutralize error: {e}")
raise HTTPException(status_code=500, detail=str(e))
@router.get("/groups/{groupId}/download")
@limiter.limit("20/minute")
async def download_group_zip(
request: Request,
groupId: str = Path(..., description="Group ID"),
currentUser: User = Depends(getCurrentUser),
context: RequestContext = Depends(getRequestContext),
):
"""Download all files in a group as a ZIP archive."""
import io, zipfile
try:
import modules.interfaces.interfaceDbApp as _appIface
managementInterface = interfaceDbManagement.getInterface(
currentUser,
mandateId=str(context.mandateId) if context.mandateId else None,
featureInstanceId=str(context.featureInstanceId) if context.featureInstanceId else None,
)
appInterface = _appIface.getInterface(currentUser)
fileIds = _get_group_item_ids("files/list", groupId, appInterface)
if not fileIds:
raise HTTPException(status_code=404, detail="Group not found or empty")
buf = io.BytesIO()
with zipfile.ZipFile(buf, "w", zipfile.ZIP_DEFLATED) as zf:
for fid in fileIds:
try:
fileMeta = managementInterface.getFile(fid)
fileData = managementInterface.getFileData(fid)
if fileMeta and fileData:
name = (fileMeta.get("fileName") if isinstance(fileMeta, dict) else getattr(fileMeta, "fileName", fid)) or fid
zf.writestr(name, fileData)
except Exception as fe:
logger.warning(f"download_group_zip: skipping file {fid}: {fe}")
buf.seek(0)
from fastapi.responses import StreamingResponse
return StreamingResponse(
buf,
media_type="application/zip",
headers={"Content-Disposition": f'attachment; filename="group-{groupId}.zip"'},
)
except HTTPException:
raise
except Exception as e:
logger.error(f"download_group_zip error: {e}")
raise HTTPException(status_code=500, detail=str(e))
@router.delete("/groups/{groupId}")
@limiter.limit("30/minute")
def delete_group(
request: Request,
groupId: str = Path(..., description="Group ID"),
deleteItems: bool = Query(False, description="If true, also delete all files in the group"),
currentUser: User = Depends(getCurrentUser),
context: RequestContext = Depends(getRequestContext),
):
"""Remove a group from the groupTree. Optionally delete all its files."""
try:
import modules.interfaces.interfaceDbApp as _appIface
appInterface = _appIface.getInterface(currentUser)
fileIds = _get_group_item_ids("files/list", groupId, appInterface)
# Remove group from tree
existing = appInterface.getTableGrouping("files/list")
if existing:
from modules.routes.routeHelpers import _removeGroupFromTree
newRoots = _removeGroupFromTree([n.model_dump() if hasattr(n, 'model_dump') else n for n in existing.rootGroups], groupId)
appInterface.upsertTableGrouping("files/list", newRoots)
# Optionally delete files
deletedFiles = 0
if deleteItems:
managementInterface = interfaceDbManagement.getInterface(
currentUser,
mandateId=str(context.mandateId) if context.mandateId else None,
featureInstanceId=str(context.featureInstanceId) if context.featureInstanceId else None,
)
for fid in fileIds:
try:
managementInterface.deleteFile(fid)
deletedFiles += 1
except Exception as e:
logger.error(f"delete_group: failed to delete file {fid}: {e}")
return {"groupId": groupId, "deletedFiles": deletedFiles}
except HTTPException:
raise
except Exception as e:
logger.error(f"delete_group error: {e}")
raise HTTPException(status_code=500, detail=str(e))
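# Illustrative call (router prefix omitted):
#   DELETE .../groups/{groupId}?deleteItems=true
# removes the group node from the "files/list" tree and also deletes the files it
# referenced; with deleteItems=false only the grouping is removed.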
@ -1071,7 +909,7 @@ def update_file(
) -> FileItem:
"""Update file info"""
try:
_EDITABLE_FIELDS = {"fileName", "scope", "tags", "description", "folderId", "neutralize"}
_EDITABLE_FIELDS = {"fileName", "scope", "tags", "description", "neutralize"}
safeData = {k: v for k, v in file_info.items() if k in _EDITABLE_FIELDS}
if not safeData:
raise HTTPException(status_code=400, detail=routeApiMsg("No editable fields provided"))
@ -1226,37 +1064,3 @@ def preview_file(
)
@router.post("/{fileId}/move")
@limiter.limit("10/minute")
def move_file(
request: Request,
fileId: str = Path(...),
body: Dict[str, Any] = Body(...),
currentUser: User = Depends(getCurrentUser),
context: RequestContext = Depends(getRequestContext)
) -> Dict[str, Any]:
"""Move a file to a different folder."""
targetFolderId = body.get("targetFolderId")
try:
mgmt = interfaceDbManagement.getInterface(
currentUser,
mandateId=str(context.mandateId) if context.mandateId else None,
featureInstanceId=str(context.featureInstanceId) if context.featureInstanceId else None,
)
mgmt.updateFile(fileId, {"folderId": targetFolderId})
if targetFolderId:
try:
targetFolder = mgmt.getFolder(targetFolderId)
folderNeut = (targetFolder.get("neutralize") if isinstance(targetFolder, dict)
else getattr(targetFolder, "neutralize", False)) if targetFolder else False
if folderNeut:
mgmt.updateFile(fileId, {"neutralize": True})
logger.info("File %s moved to neutralized folder %s — inherited neutralize=True", fileId, targetFolderId)
except Exception as e:
logger.warning("File move: folder neutralize inheritance check failed for %s: %s", fileId, e)
return {"success": True, "fileId": fileId, "folderId": targetFolderId}
except Exception as e:
logger.error(f"Error moving file: {e}")
raise HTTPException(status_code=500, detail=str(e))

View file

@ -113,7 +113,7 @@ def get_mandates(
detail=routeApiMsg("Admin role required")
)
# Parse pagination parameter
# Parse pagination parameter early — needed for grouping in all modes
paginationParams = None
if pagination:
try:
@ -131,9 +131,19 @@ def get_mandates(
handleFilterValuesInMemory, handleIdsInMemory,
handleFilterValuesMode, handleIdsMode,
parseCrossFilterPagination,
handleGroupingInRequest, applyGroupScopeFilter,
)
appInterface = interfaceDbApp.getRootInterface()
groupCtx = handleGroupingInRequest(paginationParams, appInterface, "mandates")
def _mandateItemsForAdmin():
items = []
for mid in adminMandateIds:
m = appInterface.getMandate(mid)
if m and getattr(m, "enabled", True):
items.append(m.model_dump() if hasattr(m, 'model_dump') else m if isinstance(m, dict) else vars(m))
return items
if mode == "filterValues":
if not column:
@ -144,54 +154,42 @@ def get_mandates(
values = appInterface.db.getDistinctColumnValues(Mandate, column, crossPagination)
return JSONResponse(content=sorted(values, key=lambda v: str(v).lower()))
else:
mandateItems = []
for mid in adminMandateIds:
m = appInterface.getMandate(mid)
if m and getattr(m, "enabled", True):
mandateItems.append(m.model_dump() if hasattr(m, 'model_dump') else m if isinstance(m, dict) else vars(m))
mandateItems = applyGroupScopeFilter(_mandateItemsForAdmin(), groupCtx.itemIds)
return handleFilterValuesInMemory(mandateItems, column, pagination)
if mode == "ids":
if isPlatformAdmin:
return handleIdsMode(appInterface.db, Mandate, pagination)
else:
mandateItems = []
for mid in adminMandateIds:
m = appInterface.getMandate(mid)
if m and getattr(m, "enabled", True):
mandateItems.append(m.model_dump() if hasattr(m, 'model_dump') else m if isinstance(m, dict) else vars(m))
mandateItems = applyGroupScopeFilter(_mandateItemsForAdmin(), groupCtx.itemIds)
return handleIdsInMemory(mandateItems, pagination)
if isPlatformAdmin:
result = appInterface.getAllMandates(pagination=paginationParams)
else:
allMandates = []
for mandateId in adminMandateIds:
mandate = appInterface.getMandate(mandateId)
if mandate and getattr(mandate, "enabled", True):
mandateDict = mandate if isinstance(mandate, dict) else mandate.model_dump() if hasattr(mandate, 'model_dump') else vars(mandate)
allMandates.append(mandateDict)
result = allMandates
paginationParams = None
if paginationParams and hasattr(result, 'items'):
return PaginatedResponse(
items=result.items,
pagination=PaginationMetadata(
currentPage=paginationParams.page,
pageSize=paginationParams.pageSize,
totalItems=result.totalItems,
totalPages=result.totalPages,
sort=paginationParams.sort,
filters=paginationParams.filters
items = result.items if hasattr(result, 'items') else (result if isinstance(result, list) else [])
items = applyGroupScopeFilter(
[i.model_dump() if hasattr(i, 'model_dump') else (i if isinstance(i, dict) else vars(i)) for i in items],
groupCtx.itemIds,
)
if paginationParams and hasattr(result, 'items'):
return PaginatedResponse(
items=items,
pagination=PaginationMetadata(
currentPage=paginationParams.page,
pageSize=paginationParams.pageSize,
totalItems=result.totalItems,
totalPages=result.totalPages,
sort=paginationParams.sort,
filters=paginationParams.filters
),
groupTree=groupCtx.groupTree,
)
)
else:
return PaginatedResponse(items=items, pagination=None, groupTree=groupCtx.groupTree)
else:
items = result if isinstance(result, list) else (result.items if hasattr(result, 'items') else result)
return PaginatedResponse(
items=items,
pagination=None
)
mandateItems = applyGroupScopeFilter(_mandateItemsForAdmin(), groupCtx.itemIds)
return PaginatedResponse(items=mandateItems, pagination=None, groupTree=groupCtx.groupTree)
except HTTPException:
raise
except Exception as e:

View file

@ -44,27 +44,15 @@ def get_prompts(
- filterValues: distinct values for a column (cross-filtered)
- ids: all IDs matching current filters
"""
from modules.routes.routeHelpers import handleFilterValuesInMemory, handleIdsInMemory, enrichRowsWithFkLabels
from modules.routes.routeHelpers import (
handleFilterValuesInMemory, handleIdsInMemory, enrichRowsWithFkLabels,
handleGroupingInRequest, applyGroupScopeFilter,
)
from modules.interfaces.interfaceDbApp import getInterface as getAppInterface
def _promptsToEnrichedDicts(promptItems):
dicts = [r.model_dump() if hasattr(r, 'model_dump') else (dict(r) if not isinstance(r, dict) else r) for r in promptItems]
enrichRowsWithFkLabels(dicts, Prompt)
return dicts
if mode == "filterValues":
if not column:
raise HTTPException(status_code=400, detail="column parameter required for mode=filterValues")
managementInterface = interfaceDbManagement.getInterface(currentUser)
result = managementInterface.getAllPrompts(pagination=None)
items = _promptsToEnrichedDicts(result)
return handleFilterValuesInMemory(items, column, pagination)
if mode == "ids":
managementInterface = interfaceDbManagement.getInterface(currentUser)
result = managementInterface.getAllPrompts(pagination=None)
items = _promptsToEnrichedDicts(result)
return handleIdsInMemory(items, pagination)
CONTEXT_KEY = "prompts"
# Parse pagination params early — needed for grouping in all modes
paginationParams = None
if pagination:
try:
@ -75,11 +63,34 @@ def get_prompts(
except (json.JSONDecodeError, ValueError) as e:
raise HTTPException(status_code=400, detail=f"Invalid pagination parameter: {str(e)}")
appInterface = getAppInterface(currentUser)
groupCtx = handleGroupingInRequest(paginationParams, appInterface, CONTEXT_KEY)
def _promptsToEnrichedDicts(promptItems):
dicts = [r.model_dump() if hasattr(r, 'model_dump') else (dict(r) if not isinstance(r, dict) else r) for r in promptItems]
enrichRowsWithFkLabels(dicts, Prompt)
return dicts
managementInterface = interfaceDbManagement.getInterface(currentUser)
if mode == "filterValues":
if not column:
raise HTTPException(status_code=400, detail="column parameter required for mode=filterValues")
result = managementInterface.getAllPrompts(pagination=None)
items = _promptsToEnrichedDicts(result)
items = applyGroupScopeFilter(items, groupCtx.itemIds)
return handleFilterValuesInMemory(items, column, pagination)
if mode == "ids":
result = managementInterface.getAllPrompts(pagination=None)
items = _promptsToEnrichedDicts(result)
items = applyGroupScopeFilter(items, groupCtx.itemIds)
return handleIdsInMemory(items, pagination)
result = managementInterface.getAllPrompts(pagination=paginationParams)
if paginationParams:
items = _promptsToEnrichedDicts(result.items)
items = applyGroupScopeFilter(_promptsToEnrichedDicts(result.items), groupCtx.itemIds)
return {
"items": items,
"pagination": PaginationMetadata(
@ -90,12 +101,14 @@ def get_prompts(
sort=paginationParams.sort,
filters=paginationParams.filters
).model_dump(),
"groupTree": groupCtx.groupTree,
}
else:
items = _promptsToEnrichedDicts(result)
items = applyGroupScopeFilter(_promptsToEnrichedDicts(result), groupCtx.itemIds)
return {
"items": items,
"pagination": None,
"groupTree": groupCtx.groupTree,
}

View file

@ -208,6 +208,21 @@ def get_users(
- GET /api/users/ (no pagination - returns all users in mandate)
- GET /api/users/?pagination={"page":1,"pageSize":10,"sort":[]}
"""
# Parse pagination early — needed for grouping in all modes
_paginationParams = None
if pagination:
try:
_pd = json.loads(pagination)
if _pd:
_pd = normalize_pagination_dict(_pd)
_paginationParams = PaginationParams(**_pd)
except (json.JSONDecodeError, ValueError) as e:
raise HTTPException(status_code=400, detail=f"Invalid pagination parameter: {str(e)}")
from modules.routes.routeHelpers import handleGroupingInRequest as _handleGrouping, applyGroupScopeFilter as _applyGroupScope
_appInterfaceForGrouping = interfaceDbApp.getInterface(context.user, mandateId=context.mandateId)
_groupCtx = _handleGrouping(_paginationParams, _appInterfaceForGrouping, "users")
if mode == "filterValues":
if not column:
raise HTTPException(status_code=400, detail="column parameter required for mode=filterValues")
@ -217,27 +232,15 @@ def get_users(
return _getUserFilterOrIds(context, pagination, idsMode=True)
try:
paginationParams = None
if pagination:
try:
paginationDict = json.loads(pagination)
if paginationDict:
paginationDict = normalize_pagination_dict(paginationDict)
paginationParams = PaginationParams(**paginationDict)
except (json.JSONDecodeError, ValueError) as e:
raise HTTPException(
status_code=400,
detail=f"Invalid pagination parameter: {str(e)}"
)
appInterface = interfaceDbApp.getInterface(context.user, mandateId=context.mandateId)
paginationParams = _paginationParams
appInterface = _appInterfaceForGrouping
if context.mandateId:
# Get users for specific mandate using getUsersByMandate
result = appInterface.getUsersByMandate(str(context.mandateId), paginationParams)
if paginationParams and hasattr(result, 'items'):
enriched = enrichRowsWithFkLabels(_usersToDicts(result.items), User)
enriched = _applyGroupScope(enrichRowsWithFkLabels(_usersToDicts(result.items), User), _groupCtx.itemIds)
return {
"items": enriched,
"pagination": PaginationMetadata(
@ -248,17 +251,18 @@ def get_users(
sort=paginationParams.sort,
filters=paginationParams.filters
).model_dump(),
"groupTree": _groupCtx.groupTree,
}
else:
users = result if isinstance(result, list) else result.items if hasattr(result, 'items') else []
enriched = enrichRowsWithFkLabels(_usersToDicts(users), User)
return {"items": enriched, "pagination": None}
enriched = _applyGroupScope(enrichRowsWithFkLabels(_usersToDicts(users), User), _groupCtx.itemIds)
return {"items": enriched, "pagination": None, "groupTree": _groupCtx.groupTree}
elif context.isPlatformAdmin:
# PlatformAdmin without mandateId — DB-level pagination via interface
result = appInterface.getAllUsers(paginationParams)
if paginationParams and hasattr(result, 'items'):
enriched = enrichRowsWithFkLabels(_usersToDicts(result.items), User)
enriched = _applyGroupScope(enrichRowsWithFkLabels(_usersToDicts(result.items), User), _groupCtx.itemIds)
return {
"items": enriched,
"pagination": PaginationMetadata(
@ -269,11 +273,12 @@ def get_users(
sort=paginationParams.sort,
filters=paginationParams.filters
).model_dump(),
"groupTree": _groupCtx.groupTree,
}
else:
users = result if isinstance(result, list) else (result.items if hasattr(result, 'items') else [])
enriched = enrichRowsWithFkLabels(_usersToDicts(users), User)
return {"items": enriched, "pagination": None}
enriched = _applyGroupScope(enrichRowsWithFkLabels(_usersToDicts(users), User), _groupCtx.itemIds)
return {"items": enriched, "pagination": None, "groupTree": _groupCtx.groupTree}
else:
# Non-SysAdmin without mandateId: aggregate users across all admin mandates
rootInterface = getRootInterface()
@ -313,7 +318,7 @@ def get_users(
]
from modules.routes.routeHelpers import applyFiltersAndSort as _applyFiltersAndSortHelper
filteredUsers = _applyFiltersAndSortHelper(allUsers, paginationParams)
filteredUsers = _applyGroupScope(_applyFiltersAndSortHelper(allUsers, paginationParams), _groupCtx.itemIds)
enriched = enrichRowsWithFkLabels(filteredUsers, User)
if paginationParams:
@ -333,9 +338,10 @@ def get_users(
sort=paginationParams.sort,
filters=paginationParams.filters
).model_dump(),
"groupTree": _groupCtx.groupTree,
}
else:
return {"items": enriched, "pagination": None}
return {"items": enriched, "pagination": None, "groupTree": _groupCtx.groupTree}
except HTTPException:
raise
except Exception as e:

View file

@ -701,3 +701,157 @@ def paginateInMemory(
offset = (paginationParams.page - 1) * paginationParams.pageSize
pageItems = items[offset:offset + paginationParams.pageSize]
return pageItems, totalItems
# ---------------------------------------------------------------------------
# Table Grouping helpers
# ---------------------------------------------------------------------------
from dataclasses import dataclass, field as dc_field
@dataclass
class GroupingContext:
"""
Result of handleGroupingInRequest.
Carries the group tree for the response and the resolved item-ID set for
group-scope filtering (None = no active group scope).
"""
groupTree: Optional[list] # List[TableGroupNode] serialised as dicts — for response
itemIds: Optional[set] # Set[str] when groupId was set, else None
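# Illustrative group-node shape, as read by _collectItemIds/_collectAllIds below
# (field names taken from those accessors; values are examples only):
#   {"id": "g-reports", "itemIds": ["file-1", "file-2"], "subGroups": [...]}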
def _collectItemIds(nodes: list, groupId: str) -> Optional[set]:
"""
Recursively search *nodes* for a node whose id == groupId and collect
all itemIds from it and all its descendant subGroups.
Returns None if the group is not found.
"""
for node in nodes:
nodeId = node.get("id") if isinstance(node, dict) else getattr(node, "id", None)
if nodeId == groupId:
ids: set = set()
_collectAllIds(node, ids)
return ids
subGroups = node.get("subGroups", []) if isinstance(node, dict) else getattr(node, "subGroups", [])
result = _collectItemIds(subGroups, groupId)
if result is not None:
return result
return None
def _collectAllIds(node, ids: set) -> None:
"""Collect itemIds from a node and all its descendants into ids."""
nodeItemIds = node.get("itemIds", []) if isinstance(node, dict) else getattr(node, "itemIds", [])
for iid in nodeItemIds:
ids.add(str(iid))
subGroups = node.get("subGroups", []) if isinstance(node, dict) else getattr(node, "subGroups", [])
for child in subGroups:
_collectAllIds(child, ids)
def _removeGroupFromTree(nodes: list, groupId: str) -> list:
"""Remove a group node (and all descendants) from the tree by id."""
result = []
for node in nodes:
nodeId = node.get("id") if isinstance(node, dict) else getattr(node, "id", None)
if nodeId == groupId:
continue # skip this node (remove it)
subGroups = node.get("subGroups", []) if isinstance(node, dict) else getattr(node, "subGroups", [])
filtered_sub = _removeGroupFromTree(subGroups, groupId)
if isinstance(node, dict):
node = {**node, "subGroups": filtered_sub}
result.append(node)
return result
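# Illustrative behaviour (example values): removing "g2" from
#   [{"id": "g1", "itemIds": [], "subGroups": [{"id": "g2", "itemIds": ["f1"], "subGroups": []}]}]
# yields [{"id": "g1", "itemIds": [], "subGroups": []}]; the file "f1" itself is
# not deleted, it simply stops being referenced by any group.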
def handleGroupingInRequest(
paginationParams: Optional[PaginationParams],
interface,
contextKey: str,
) -> GroupingContext:
"""
Central grouping handler; call it at the start of every list route that
supports table grouping.
Steps (in order):
1. If paginationParams.saveGroupTree is set:
persist the new tree via interface.upsertTableGrouping, then clear
saveGroupTree from paginationParams so it is not treated as a filter.
2. Load the current group tree from the DB (used in step 3 and response).
3. If paginationParams.groupId is set:
resolve it to a Set[str] of itemIds (including all sub-groups),
then clear groupId from paginationParams so it is not treated as a
normal filter field.
4. Return a GroupingContext with groupTree (for the response) and itemIds
(for applyGroupScopeFilter).
The caller does NOT need to handle any grouping logic itself; just call
applyGroupScopeFilter(items, groupCtx.itemIds) and embed groupCtx.groupTree
in the response dict.
"""
from modules.datamodels.datamodelPagination import TableGroupNode
groupTree = None
itemIds = None
if paginationParams is None:
try:
existing = interface.getTableGrouping(contextKey)
if existing:
groupTree = [n.model_dump() if hasattr(n, "model_dump") else n for n in existing.rootGroups]
except Exception as e:
logger.warning(f"handleGroupingInRequest: getTableGrouping failed: {e}")
return GroupingContext(groupTree=groupTree, itemIds=None)
# Step 1: persist saveGroupTree if present
if paginationParams.saveGroupTree is not None:
try:
saved = interface.upsertTableGrouping(contextKey, paginationParams.saveGroupTree)
groupTree = [n.model_dump() if hasattr(n, "model_dump") else n for n in saved.rootGroups]
except Exception as e:
logger.error(f"handleGroupingInRequest: upsertTableGrouping failed: {e}")
paginationParams.saveGroupTree = None
# Step 2: load current tree (only if not already set from save above)
if groupTree is None:
try:
existing = interface.getTableGrouping(contextKey)
if existing:
groupTree = [n.model_dump() if hasattr(n, "model_dump") else n for n in existing.rootGroups]
except Exception as e:
logger.warning(f"handleGroupingInRequest: getTableGrouping failed: {e}")
# Step 3: resolve groupId to itemIds set
if paginationParams.groupId is not None:
targetGroupId = paginationParams.groupId
paginationParams.groupId = None # remove so it is not treated as a normal filter
if groupTree:
itemIds = _collectItemIds(groupTree, targetGroupId)
if itemIds is None:
logger.warning(
f"handleGroupingInRequest: groupId={targetGroupId!r} not found in tree "
f"for contextKey={contextKey!r} — returning empty set"
)
itemIds = set() # unknown group → show nothing rather than everything
else:
# groupId sent but no tree saved yet → return empty (nothing belongs to any group)
logger.warning(
f"handleGroupingInRequest: groupId={targetGroupId!r} set but no tree exists "
f"for contextKey={contextKey!r} — returning empty set"
)
itemIds = set()
return GroupingContext(groupTree=groupTree, itemIds=itemIds)
def applyGroupScopeFilter(items: List[Dict[str, Any]], itemIds: Optional[set]) -> List[Dict[str, Any]]:
"""
Filter items to those whose "id" field is in itemIds.
Returns items unchanged when itemIds is None (no active group scope).
Works for both normal list items and for mode=ids / mode=filterValues flows;
call it before handleIdsInMemory / handleFilterValuesInMemory.
"""
if itemIds is None:
return items
return [item for item in items if str(item.get("id", "")) in itemIds]
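# Minimal usage sketch (illustrative; mirrors the list routes above, so the
# context key "files/list" and the response keys follow those routes):
#   groupCtx = handleGroupingInRequest(paginationParams, appInterface, "files/list")
#   itemDicts = applyGroupScopeFilter(itemDicts, groupCtx.itemIds)
#   return {"items": itemDicts, "pagination": None, "groupTree": groupCtx.groupTree}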

View file

@ -241,6 +241,29 @@ async def auth_connect_callback(
)
interface.saveConnectionToken(token)
try:
from modules.shared.callbackRegistry import callbackRegistry
if connection.knowledgeIngestionEnabled:
callbackRegistry.trigger(
"connection.established",
connectionId=connection.id,
authority=str(getattr(connection.authority, "value", connection.authority) or "clickup"),
userId=str(user.id),
)
else:
logger.info(
"ingestion.connection.bootstrap.skipped — knowledge ingestion disabled by user",
extra={
"event": "ingestion.connection.bootstrap.skipped",
"connectionId": connection.id,
"authority": "clickup",
"reason": "consent_disabled",
},
)
except Exception as _cbErr:
logger.warning("connection.established callback failed for %s: %s", connection.id, _cbErr)
return HTMLResponse(
content=f"""
<html>

View file

@ -479,6 +479,29 @@ async def auth_connect_callback(
)
interface.saveConnectionToken(token)
try:
from modules.shared.callbackRegistry import callbackRegistry
if connection.knowledgeIngestionEnabled:
callbackRegistry.trigger(
"connection.established",
connectionId=connection.id,
authority=str(getattr(connection.authority, "value", connection.authority) or "google"),
userId=str(user.id),
)
else:
logger.info(
"ingestion.connection.bootstrap.skipped — knowledge ingestion disabled by user",
extra={
"event": "ingestion.connection.bootstrap.skipped",
"connectionId": connection.id,
"authority": "google",
"reason": "consent_disabled",
},
)
except Exception as _cbErr:
logger.warning("connection.established callback failed for %s: %s", connection.id, _cbErr)
return HTMLResponse(
content=f"""
<html>

View file

@ -420,6 +420,29 @@ async def auth_connect_callback(
)
interface.saveConnectionToken(token)
try:
from modules.shared.callbackRegistry import callbackRegistry
if connection.knowledgeIngestionEnabled:
callbackRegistry.trigger(
"connection.established",
connectionId=connection.id,
authority=str(getattr(connection.authority, "value", connection.authority) or "msft"),
userId=str(user.id),
)
else:
logger.info(
"ingestion.connection.bootstrap.skipped — knowledge ingestion disabled by user",
extra={
"event": "ingestion.connection.bootstrap.skipped",
"connectionId": connection.id,
"authority": "msft",
"reason": "consent_disabled",
},
)
except Exception as _cbErr:
logger.warning("connection.established callback failed for %s: %s", connection.id, _cbErr)
return HTMLResponse(
content=f"""
<html>

View file

@ -11,8 +11,6 @@ from modules.serviceCenter.services.serviceAgent.toolRegistry import ToolRegistr
from modules.serviceCenter.services.serviceAgent.coreTools._helpers import (
_getOrCreateTempFolder,
_looksLikeBinary,
_resolveFileScope,
_MAX_TOOL_RESULT_CHARS,
)
@ -392,65 +390,7 @@ def _registerDocumentTools(registry: ToolRegistry, services):
if chunkMime:
mimeType = chunkMime
# 2) File not yet indexed -> trigger extraction via ExtractionService, then retry
if not imageData and knowledgeService and not knowledgeService.isFileIndexed(fileId):
try:
chatService = services.chat
fileInfo = chatService.getFileInfo(fileId)
fileContent = chatService.getFileContent(fileId)
if fileContent and fileInfo:
rawData = fileContent.get("data", "")
if isinstance(rawData, str) and len(rawData) > 100:
rawBytes = _b64.b64decode(rawData)
elif isinstance(rawData, bytes):
rawBytes = rawData
else:
rawBytes = None
if rawBytes:
from modules.serviceCenter.services.serviceExtraction.subRegistry import ExtractorRegistry
from modules.serviceCenter.services.serviceExtraction.subPipeline import runExtraction
from modules.datamodels.datamodelExtraction import ExtractionOptions
fileMime = fileInfo.get("mimeType", "application/octet-stream")
fileName = fileInfo.get("fileName", fileId)
extracted = runExtraction(
ExtractorRegistry(), None,
rawBytes, fileName, fileMime, ExtractionOptions(),
)
contentObjects = []
for part in extracted.parts:
tg = (part.typeGroup or "").lower()
ct = "image" if tg == "image" else "text"
if not part.data or not part.data.strip():
continue
contentObjects.append({
"contentObjectId": part.id,
"contentType": ct,
"data": part.data,
"contextRef": {"containerPath": fileName, "location": part.label, **(part.metadata or {})},
})
if contentObjects:
_diFiId, _diMId = _resolveFileScope(fileId, context)
await knowledgeService.indexFile(
fileId=fileId, fileName=fileName, mimeType=fileMime,
userId=context.get("userId", ""), contentObjects=contentObjects,
featureInstanceId=_diFiId,
mandateId=_diMId,
)
chunks = knowledgeService._knowledgeDb.getContentChunks(fileId)
imageChunks = [c for c in (chunks or []) if c.get("contentType") == "image"]
if pageIndex is not None:
imageChunks = [c for c in imageChunks if c.get("contextRef", {}).get("pageIndex") == pageIndex]
if imageChunks:
imageData = imageChunks[0].get("data", "")
except Exception as extractErr:
logger.warning(f"describeImage: on-demand extraction failed: {extractErr}")
# 3) Direct image file (not a container) - use raw file data
# 2) Direct image file (not a container) - use raw file data
if not imageData:
chatService = services.chat
fileContent = chatService.getFileContent(fileId)
@ -460,7 +400,7 @@ def _registerDocumentTools(registry: ToolRegistry, services):
imageData = fileContent.get("data", "")
mimeType = fileMimeType
# 4) PDF page rendering: render the requested page as an image via PyMuPDF
# 3) PDF page rendering: render the requested page as an image via PyMuPDF
if not imageData:
chatService = services.chat
fileInfo = chatService.getFileInfo(fileId) if hasattr(chatService, "getFileInfo") else None

View file

@ -1,6 +1,6 @@
# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
"""Shared helpers for core agent tools (file scope, binary detection, temp folder)."""
"""Shared helpers for core agent tools (file scope, binary detection, group helpers)."""
import logging
import uuid
@ -46,39 +46,60 @@ def _looksLikeBinary(data: bytes, sampleSize: int = 1024) -> bool:
return nonPrintable / len(sample) > 0.10
def _getOrCreateInstanceFolder(chatService, featureInstanceId: str, mandateId: str = "") -> Optional[str]:
"""Return the folder ID for a feature instance, creating it on first use.
Delegates to interfaceDbManagement._ensureFeatureInstanceFolder.
AI tools call this when saving a file without an explicit folderId
so that instance-produced files land in a named folder automatically.
"""
try:
dbMgmt = chatService.interfaceDbComponent
return dbMgmt._ensureFeatureInstanceFolder(featureInstanceId, mandateId)
except Exception as e:
logger.warning(f"Could not get/create instance folder for {featureInstanceId}: {e}")
return None
def _getOrCreateTempFolder(chatService) -> Optional[str]:
"""Return the ID of the root-level 'Temp' folder, creating it if it doesn't exist."""
"""Deprecated stub: folder-based organisation has been replaced by grouping.
Returns None unconditionally so callers skip the (now removed) folderId
assignment. Remove callers incrementally and delete this stub afterwards.
"""
logger.debug("_getOrCreateTempFolder called; folder support removed, returning None")
return None
async def _getOrCreateInstanceGroup(
appInterface,
featureInstanceId: str,
contextKey: str = "files/list",
) -> Optional[str]:
"""Return groupId of the default group for a feature instance; create if needed."""
try:
allFolders = chatService.interfaceDbComponent.listFolders()
tempFolder = next(
(f for f in allFolders
if f.get("name") == "Temp" and not f.get("parentId")),
None,
)
if tempFolder:
return tempFolder.get("id")
newFolder = chatService.interfaceDbComponent.createFolder("Temp", parentId=None)
return newFolder.get("id") if newFolder else None
existing = appInterface.getTableGrouping(contextKey)
nodes = [
n.model_dump() if hasattr(n, "model_dump") else (n if isinstance(n, dict) else vars(n))
for n in (existing.rootGroups if existing else [])
]
def _find(nds):
for nd in nds:
meta = nd.get("meta", {}) if isinstance(nd, dict) else getattr(nd, "meta", {})
if (meta or {}).get("featureInstanceId") == featureInstanceId:
return nd.get("id") if isinstance(nd, dict) else getattr(nd, "id", None)
found = _find(nd.get("subGroups", []) if isinstance(nd, dict) else getattr(nd, "subGroups", []))
if found:
return found
return None
found = _find(nodes)
if found:
return found
newId = str(uuid.uuid4())
nodes.append({"id": newId, "name": featureInstanceId, "itemIds": [], "subGroups": [], "meta": {"featureInstanceId": featureInstanceId}})
appInterface.upsertTableGrouping(contextKey, nodes)
return newId
except Exception as e:
logger.warning(f"Could not get/create Temp folder: {e}")
logger.error(f"_getOrCreateInstanceGroup: {e}")
return None
async def _getOrCreateTempGroup(
appInterface,
sessionId: str,
contextKey: str = "files/list",
) -> Optional[str]:
"""Return groupId of a temporary group for a session; create if needed."""
return await _getOrCreateInstanceGroup(appInterface, f"_temp_{sessionId}", contextKey)
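# Illustrative sketch (not part of the diff): how a tool handler might resolve a
# target group with the helpers above, preferring the instance default group and
# falling back to a session-scoped temp group. `appInterface`, `featureInstanceId`
# and `sessionId` are assumed to come from the tool context.
async def _resolveTargetGroupExample(appInterface, featureInstanceId: str, sessionId: str) -> Optional[str]:
    groupId = None
    if featureInstanceId:
        groupId = await _getOrCreateInstanceGroup(appInterface, featureInstanceId)
    if groupId is None:
        # No instance context: park the file in a per-session temp group instead.
        groupId = await _getOrCreateTempGroup(appInterface, sessionId)
    return groupId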
def _attachFileAsChatDocument(
services: Any,
fileItem: Any,

View file

@ -11,10 +11,9 @@ from modules.serviceCenter.services.serviceAgent.toolRegistry import ToolRegistr
from modules.serviceCenter.services.serviceAgent.coreTools._helpers import (
_attachFileAsChatDocument,
_formatToolFileResult,
_getOrCreateInstanceFolder,
_getOrCreateTempFolder,
_getOrCreateInstanceGroup,
_getOrCreateTempGroup,
_looksLikeBinary,
_resolveFileScope,
_MAX_TOOL_RESULT_CHARS,
)
@ -50,6 +49,7 @@ def _registerWorkspaceTools(registry: ToolRegistry, services):
return ToolResult(toolCallId="", toolName="readFile", success=False, error="fileId is required")
try:
knowledgeService = services.getService("knowledge") if hasattr(services, "getService") else None
fileStatus = None
# 1) Knowledge Store: return already-extracted text chunks
if knowledgeService:
@ -77,7 +77,8 @@ def _registerWorkspaceTools(registry: ToolRegistry, services):
data=f"[File {fileId} is currently being processed (status: {fileStatus}). Try again shortly.]",
)
# 2) Not indexed yet: try on-demand extraction
# 2) Not indexed yet: inspect file type to decide how to serve the agent
# (binary -> instruct agent to wait / re-upload; text -> decode raw bytes inline)
chatService = services.chat
fileInfo = chatService.getFileInfo(fileId)
if not fileInfo:
@ -100,83 +101,14 @@ def _registerWorkspaceTools(registry: ToolRegistry, services):
isBinary = _looksLikeBinary(rawBytes)
if isBinary:
try:
from modules.serviceCenter.services.serviceExtraction.subRegistry import ExtractorRegistry, ChunkerRegistry
from modules.serviceCenter.services.serviceExtraction.subPipeline import runExtraction
from modules.datamodels.datamodelExtraction import ExtractionOptions
extracted = runExtraction(
ExtractorRegistry(), ChunkerRegistry(),
rawBytes, fileName, mimeType, ExtractionOptions(),
)
contentObjects = []
for part in extracted.parts:
tg = (part.typeGroup or "").lower()
ct = "image" if tg == "image" else "text"
if not part.data or not part.data.strip():
continue
contentObjects.append({
"contentObjectId": part.id,
"contentType": ct,
"data": part.data,
"contextRef": {
"containerPath": fileName,
"location": part.label or "file",
**(part.metadata or {}),
},
})
if contentObjects:
if knowledgeService:
try:
userId = context.get("userId", "")
_fiId, _mId = _resolveFileScope(fileId, context)
await knowledgeService.indexFile(
fileId=fileId, fileName=fileName, mimeType=mimeType,
userId=userId, contentObjects=contentObjects,
featureInstanceId=_fiId,
mandateId=_mId,
)
except Exception as e:
logger.warning(f"readFile: knowledge indexing failed for {fileId}: {e}")
joined = ""
if knowledgeService:
_chunks = knowledgeService._knowledgeDb.getContentChunks(fileId)
_textChunks = [
c for c in (_chunks or [])
if c.get("contentType") != "image" and c.get("data")
]
if _textChunks:
joined = "\n\n".join(c["data"] for c in _textChunks)
if not joined:
textParts = [o["data"] for o in contentObjects if o["contentType"] != "image"]
joined = "\n\n".join(textParts) if textParts else ""
if joined:
chunked = _applyOffsetLimit(joined, offset, limit)
if chunked is not None:
return ToolResult(toolCallId="", toolName="readFile", success=True, data=chunked)
if len(joined) > _MAX_TOOL_RESULT_CHARS:
joined = joined[:_MAX_TOOL_RESULT_CHARS] + f"\n\n[Truncated showing first {_MAX_TOOL_RESULT_CHARS} chars of {len(joined)}. Use offset/limit to read specific sections.]"
return ToolResult(
toolCallId="", toolName="readFile", success=True,
data=joined,
)
imgCount = sum(1 for o in contentObjects if o["contentType"] == "image")
return ToolResult(
toolCallId="", toolName="readFile", success=True,
data=f"[Extracted {len(contentObjects)} content objects from '{fileName}' "
f"({imgCount} images, no readable text). "
f"Use describeImage(fileId='{fileId}') to analyze visual content.]",
)
except Exception as extractErr:
logger.warning(f"readFile extraction failed for {fileId} ({fileName}): {extractErr}")
return ToolResult(
toolCallId="", toolName="readFile", success=True,
data=f"[Binary file: '{fileName}', type={mimeType}, size={len(rawBytes)} bytes. "
f"Text extraction not available. Use describeImage for images.]",
data=(
f"[File '{fileName}' ({mimeType}) is not yet indexed "
f"(status: {fileStatus or 'unknown'}). Indexing runs automatically "
f"on upload. Please wait a few seconds and retry, or re-upload the file. "
f"For visual content use describeImage(fileId='{fileId}').]"
),
)
# 3) Text file: decode raw bytes
@ -237,7 +169,6 @@ def _registerWorkspaceTools(registry: ToolRegistry, services):
try:
chatService = services.chat
files = chatService.listFiles(
folderId=args.get("folderId"),
tags=args.get("tags"),
search=args.get("search"),
)
@ -290,18 +221,6 @@ def _registerWorkspaceTools(registry: ToolRegistry, services):
except Exception as e:
return ToolResult(toolCallId="", toolName="searchInFileContent", success=False, error=str(e))
async def _listFolders(args: Dict[str, Any], context: Dict[str, Any]):
try:
chatService = services.chat
folders = chatService.listFolders(parentId=args.get("parentId"))
folderList = "\n".join(
f"- {f.get('name', 'unnamed')} (id: {f.get('id', '?')})"
for f in folders
) if folders else "No folders found."
return ToolResult(toolCallId="", toolName="listFolders", success=True, data=folderList)
except Exception as e:
return ToolResult(toolCallId="", toolName="listFolders", success=False, error=str(e))
async def _webSearch(args: Dict[str, Any], context: Dict[str, Any]):
query = args.get("query", "")
if not query:
@ -339,35 +258,6 @@ def _registerWorkspaceTools(registry: ToolRegistry, services):
except Exception as e:
return ToolResult(toolCallId="", toolName="tagFile", success=False, error=str(e))
async def _moveFile(args: Dict[str, Any], context: Dict[str, Any]):
fileId = args.get("fileId", "")
targetFolderId = args.get("targetFolderId")
if not fileId:
return ToolResult(toolCallId="", toolName="moveFile", success=False, error="fileId is required")
try:
chatService = services.chat
chatService.interfaceDbComponent.updateFile(fileId, {"folderId": targetFolderId})
return ToolResult(
toolCallId="", toolName="moveFile", success=True,
data=f"File {fileId} moved to folder {targetFolderId or 'root'}"
)
except Exception as e:
return ToolResult(toolCallId="", toolName="moveFile", success=False, error=str(e))
async def _createFolder(args: Dict[str, Any], context: Dict[str, Any]):
name = args.get("name", "")
if not name:
return ToolResult(toolCallId="", toolName="createFolder", success=False, error="name is required")
try:
chatService = services.chat
folder = chatService.createFolder(name=name, parentId=args.get("parentId"))
return ToolResult(
toolCallId="", toolName="createFolder", success=True,
data=f"Folder '{name}' created (id: {folder.get('id', '?')})"
)
except Exception as e:
return ToolResult(toolCallId="", toolName="createFolder", success=False, error=str(e))
async def _writeFile(args: Dict[str, Any], context: Dict[str, Any]):
content = args.get("content", "")
mode = args.get("mode", "create")
@ -422,12 +312,52 @@ def _registerWorkspaceTools(registry: ToolRegistry, services):
fiId = context.get("featureInstanceId") or (services.featureInstanceId if services else "")
if fiId:
dbMgmt.updateFile(fileItem.id, {"featureInstanceId": fiId})
if args.get("folderId"):
dbMgmt.updateFile(fileItem.id, {"folderId": args["folderId"]})
if args.get("groupId"):
try:
appIface = chatService.interfaceDbApp
existing = appIface.getTableGrouping("files/list")
nodes = [n.model_dump() if hasattr(n, "model_dump") else (n if isinstance(n, dict) else vars(n)) for n in (existing.rootGroups if existing else [])]
def _addToGroup(nds, gid, fid):
for nd in nds:
nid = nd.get("id") if isinstance(nd, dict) else getattr(nd, "id", None)
if nid == gid:
ids = list(nd.get("itemIds", []) if isinstance(nd, dict) else getattr(nd, "itemIds", []))
if fid not in ids:
ids.append(fid)
if isinstance(nd, dict):
nd["itemIds"] = ids
return True
if _addToGroup(nd.get("subGroups", []) if isinstance(nd, dict) else getattr(nd, "subGroups", []), gid, fid):
return True
return False
_addToGroup(nodes, args["groupId"], fileItem.id)
appIface.upsertTableGrouping("files/list", nodes)
except Exception as _ge:
logger.warning(f"writeFile: failed to add file to group {args['groupId']}: {_ge}")
elif fiId:
instanceFolderId = _getOrCreateInstanceFolder(chatService, fiId, context.get("mandateId", ""))
if instanceFolderId:
dbMgmt.updateFile(fileItem.id, {"folderId": instanceFolderId})
try:
appIface = chatService.interfaceDbApp
instanceGroupId = await _getOrCreateInstanceGroup(appIface, fiId)
if instanceGroupId:
existing = appIface.getTableGrouping("files/list")
nodes = [n.model_dump() if hasattr(n, "model_dump") else (n if isinstance(n, dict) else vars(n)) for n in (existing.rootGroups if existing else [])]
def _addToGroup2(nds, gid, fid):
for nd in nds:
nid = nd.get("id") if isinstance(nd, dict) else getattr(nd, "id", None)
if nid == gid:
ids = list(nd.get("itemIds", []) if isinstance(nd, dict) else getattr(nd, "itemIds", []))
if fid not in ids:
ids.append(fid)
if isinstance(nd, dict):
nd["itemIds"] = ids
return True
if _addToGroup2(nd.get("subGroups", []) if isinstance(nd, dict) else getattr(nd, "subGroups", []), gid, fid):
return True
return False
_addToGroup2(nodes, instanceGroupId, fileItem.id)
appIface.upsertTableGrouping("files/list", nodes)
except Exception as _ge:
logger.warning(f"writeFile: failed to add file to instance group for {fiId}: {_ge}")
if args.get("tags"):
dbMgmt.updateFile(fileItem.id, {"tags": args["tags"]})
@ -480,13 +410,13 @@ def _registerWorkspaceTools(registry: ToolRegistry, services):
registry.register(
"listFiles", _listFiles,
description=(
"List files in the local workspace. Filter by folder, tags, or search term. "
"List files in the local workspace. Filter by tags or search term. "
"To filter by group, use listItemsInGroup. "
"For external data sources, use browseDataSource instead."
),
parameters={
"type": "object",
"properties": {
"folderId": {"type": "string", "description": "Filter by folder ID"},
"tags": {"type": "array", "items": {"type": "string"}, "description": "Filter by tags (any match)"},
"search": {"type": "string", "description": "Search in file names and descriptions"},
}
@ -513,18 +443,6 @@ def _registerWorkspaceTools(registry: ToolRegistry, services):
readOnly=True
)
registry.register(
"listFolders", _listFolders,
description="List folders in the local workspace. For external data sources, use browseDataSource instead.",
parameters={
"type": "object",
"properties": {
"parentId": {"type": "string", "description": "Parent folder ID (omit for root)"},
}
},
readOnly=True
)
registry.register(
"webSearch", _webSearch,
description="Search the web for general information. Use readUrl to fetch content from a known URL instead.",
@ -550,34 +468,6 @@ def _registerWorkspaceTools(registry: ToolRegistry, services):
readOnly=False
)
registry.register(
"moveFile", _moveFile,
description="Move a file to a different folder in the local workspace.",
parameters={
"type": "object",
"properties": {
"fileId": {"type": "string", "description": "The file ID to move"},
"targetFolderId": {"type": "string", "description": "Target folder ID (null for root)"},
},
"required": ["fileId"]
},
readOnly=False
)
registry.register(
"createFolder", _createFolder,
description="Create a new folder in the local workspace.",
parameters={
"type": "object",
"properties": {
"name": {"type": "string", "description": "Folder name"},
"parentId": {"type": "string", "description": "Parent folder ID (omit for root)"},
},
"required": ["name"]
},
readOnly=False
)
registry.register(
"writeFile", _writeFile,
description=(
@ -598,7 +488,7 @@ def _registerWorkspaceTools(registry: ToolRegistry, services):
"content": {"type": "string", "description": "Content to write/append"},
"mode": {"type": "string", "enum": ["create", "append", "overwrite"], "description": "Write mode (default: create)"},
"fileId": {"type": "string", "description": "File ID (required for mode=append/overwrite)"},
"folderId": {"type": "string", "description": "Target folder ID (mode=create only)"},
"groupId": {"type": "string", "description": "Group ID to place the file in (mode=create only). Omit to use the instance default group."},
"tags": {"type": "array", "items": {"type": "string"}, "description": "Tags (mode=create only)"},
},
"required": ["content"]
@ -758,55 +648,7 @@ def _registerWorkspaceTools(registry: ToolRegistry, services):
readOnly=True
)
# ---- Phase 2: deleteFolder, renameFolder, moveFolder, copyFile, editFile ----
async def _deleteFolder(args: Dict[str, Any], context: Dict[str, Any]):
folderId = args.get("folderId", "")
recursive = args.get("recursive", False)
if not folderId:
return ToolResult(toolCallId="", toolName="deleteFolder", success=False, error="folderId is required")
try:
chatService = services.chat
result = chatService.interfaceDbComponent.deleteFolder(folderId, recursive=recursive)
summary = f"Deleted {result.get('deletedFolders', 1)} folder(s) and {result.get('deletedFiles', 0)} file(s)"
return ToolResult(
toolCallId="", toolName="deleteFolder", success=True, data=summary,
sideEvents=[{"type": "folderDeleted", "data": {"folderId": folderId, **result}}],
)
except Exception as e:
return ToolResult(toolCallId="", toolName="deleteFolder", success=False, error=str(e))
async def _renameFolder(args: Dict[str, Any], context: Dict[str, Any]):
folderId = args.get("folderId", "")
newName = args.get("newName", "")
if not folderId or not newName:
return ToolResult(toolCallId="", toolName="renameFolder", success=False, error="folderId and newName are required")
try:
chatService = services.chat
chatService.interfaceDbComponent.renameFolder(folderId, newName)
return ToolResult(
toolCallId="", toolName="renameFolder", success=True,
data=f"Folder {folderId} renamed to '{newName}'",
sideEvents=[{"type": "folderUpdated", "data": {"folderId": folderId, "name": newName}}],
)
except Exception as e:
return ToolResult(toolCallId="", toolName="renameFolder", success=False, error=str(e))
async def _moveFolder(args: Dict[str, Any], context: Dict[str, Any]):
folderId = args.get("folderId", "")
targetParentId = args.get("targetParentId")
if not folderId:
return ToolResult(toolCallId="", toolName="moveFolder", success=False, error="folderId is required")
try:
chatService = services.chat
chatService.interfaceDbComponent.moveFolder(folderId, targetParentId)
return ToolResult(
toolCallId="", toolName="moveFolder", success=True,
data=f"Folder {folderId} moved to {targetParentId or 'root'}",
sideEvents=[{"type": "folderUpdated", "data": {"folderId": folderId, "parentId": targetParentId}}],
)
except Exception as e:
return ToolResult(toolCallId="", toolName="moveFolder", success=False, error=str(e))
# ---- Phase 2: copyFile, editFile ----
async def _copyFile(args: Dict[str, Any], context: Dict[str, Any]):
fileId = args.get("fileId", "")
@ -816,7 +658,6 @@ def _registerWorkspaceTools(registry: ToolRegistry, services):
chatService = services.chat
copiedFile = chatService.interfaceDbComponent.copyFile(
fileId,
targetFolderId=args.get("targetFolderId"),
newFileName=args.get("newFileName"),
)
return ToolResult(
@ -891,48 +732,6 @@ def _registerWorkspaceTools(registry: ToolRegistry, services):
except Exception as e:
return ToolResult(toolCallId="", toolName="replaceInFile", success=False, error=str(e))
registry.register(
"deleteFolder", _deleteFolder,
description="Delete a folder from the local workspace. Set recursive=true to delete all contents.",
parameters={
"type": "object",
"properties": {
"folderId": {"type": "string", "description": "The folder ID to delete"},
"recursive": {"type": "boolean", "description": "If true, delete folder and all contents (files and subfolders). Default: false"},
},
"required": ["folderId"]
},
readOnly=False
)
registry.register(
"renameFolder", _renameFolder,
description="Rename a folder in the local workspace.",
parameters={
"type": "object",
"properties": {
"folderId": {"type": "string", "description": "The folder ID to rename"},
"newName": {"type": "string", "description": "New folder name"},
},
"required": ["folderId", "newName"]
},
readOnly=False
)
registry.register(
"moveFolder", _moveFolder,
description="Move a folder to a different parent in the local workspace.",
parameters={
"type": "object",
"properties": {
"folderId": {"type": "string", "description": "The folder ID to move"},
"targetParentId": {"type": "string", "description": "Target parent folder ID (null/omit for root)"},
},
"required": ["folderId"]
},
readOnly=False
)
registry.register(
"copyFile", _copyFile,
description="Create an independent copy of a file in the local workspace.",
@ -940,7 +739,6 @@ def _registerWorkspaceTools(registry: ToolRegistry, services):
"type": "object",
"properties": {
"fileId": {"type": "string", "description": "The file ID to copy"},
"targetFolderId": {"type": "string", "description": "Target folder for the copy (default: same folder)"},
"newFileName": {"type": "string", "description": "New file name (default: same name, auto-numbered if duplicate)"},
},
"required": ["fileId"]
@ -948,6 +746,137 @@ def _registerWorkspaceTools(registry: ToolRegistry, services):
readOnly=False
)
# ---- Group tools (replaces folder-based tools) ----
async def _listGroups(args: Dict[str, Any], context: Dict[str, Any]):
contextKey = args.get("contextKey", "files/list")
try:
chatService = services.chat
appInterface = chatService.interfaceDbApp
existing = appInterface.getTableGrouping(contextKey)
if not existing:
return ToolResult(toolCallId="", toolName="listGroups", success=True, data="No groups found.")
def _flatten(nodes, depth=0):
result = []
for n in nodes:
nd = n.model_dump() if hasattr(n, "model_dump") else (n if isinstance(n, dict) else vars(n))
result.append({"id": nd.get("id"), "name": nd.get("name"), "depth": depth, "itemCount": len(nd.get("itemIds", []))})
result.extend(_flatten(nd.get("subGroups", []), depth + 1))
return result
groups = _flatten(existing.rootGroups)
lines = "\n".join(
f"{' ' * g['depth']}- {g['name']} (id: {g['id']}, items: {g['itemCount']})"
for g in groups
) if groups else "No groups found."
return ToolResult(toolCallId="", toolName="listGroups", success=True, data=lines)
except Exception as e:
return ToolResult(toolCallId="", toolName="listGroups", success=False, error=str(e))
async def _listItemsInGroup(args: Dict[str, Any], context: Dict[str, Any]):
groupId = args.get("groupId", "")
contextKey = args.get("contextKey", "files/list")
if not groupId:
return ToolResult(toolCallId="", toolName="listItemsInGroup", success=False, error="groupId is required")
try:
from modules.routes.routeHelpers import _collectItemIds
chatService = services.chat
appInterface = chatService.interfaceDbApp
existing = appInterface.getTableGrouping(contextKey)
if not existing:
return ToolResult(toolCallId="", toolName="listItemsInGroup", success=True, data="No groups found.")
nodes = [n.model_dump() if hasattr(n, "model_dump") else (n if isinstance(n, dict) else vars(n)) for n in existing.rootGroups]
ids = _collectItemIds(nodes, groupId)
itemList = list(ids) if ids else []
return ToolResult(
toolCallId="", toolName="listItemsInGroup", success=True,
data="\n".join(f"- {fid}" for fid in itemList) if itemList else "No items in group.",
)
except Exception as e:
return ToolResult(toolCallId="", toolName="listItemsInGroup", success=False, error=str(e))
async def _addItemsToGroup(args: Dict[str, Any], context: Dict[str, Any]):
groupId = args.get("groupId", "")
itemIds = args.get("itemIds", [])
contextKey = args.get("contextKey", "files/list")
if not groupId:
return ToolResult(toolCallId="", toolName="addItemsToGroup", success=False, error="groupId is required")
if not itemIds:
return ToolResult(toolCallId="", toolName="addItemsToGroup", success=False, error="itemIds is required")
try:
chatService = services.chat
appInterface = chatService.interfaceDbApp
existing = appInterface.getTableGrouping(contextKey)
nodes = [n.model_dump() if hasattr(n, "model_dump") else (n if isinstance(n, dict) else vars(n)) for n in (existing.rootGroups if existing else [])]
def _add(nds):
for nd in nds:
nid = nd.get("id") if isinstance(nd, dict) else getattr(nd, "id", None)
if nid == groupId:
existing_ids = list(nd.get("itemIds", []) if isinstance(nd, dict) else getattr(nd, "itemIds", []))
for fid in itemIds:
if fid not in existing_ids:
existing_ids.append(fid)
if isinstance(nd, dict):
nd["itemIds"] = existing_ids
return True
if _add(nd.get("subGroups", []) if isinstance(nd, dict) else getattr(nd, "subGroups", [])):
return True
return False
found = _add(nodes)
if not found:
return ToolResult(toolCallId="", toolName="addItemsToGroup", success=False, error=f"Group {groupId} not found")
appInterface.upsertTableGrouping(contextKey, nodes)
return ToolResult(
toolCallId="", toolName="addItemsToGroup", success=True,
data=f"Added {len(itemIds)} item(s) to group {groupId}",
)
except Exception as e:
return ToolResult(toolCallId="", toolName="addItemsToGroup", success=False, error=str(e))
registry.register(
"listGroups", _listGroups,
description="List all groups in the file grouping tree. Groups replace folders for organising files.",
parameters={
"type": "object",
"properties": {
"contextKey": {"type": "string", "description": "Grouping context key (default: 'files/list')"},
}
},
readOnly=True
)
registry.register(
"listItemsInGroup", _listItemsInGroup,
description="List all file IDs assigned to a specific group (includes sub-groups recursively).",
parameters={
"type": "object",
"properties": {
"groupId": {"type": "string", "description": "The group ID to inspect"},
"contextKey": {"type": "string", "description": "Grouping context key (default: 'files/list')"},
},
"required": ["groupId"]
},
readOnly=True
)
registry.register(
"addItemsToGroup", _addItemsToGroup,
description="Add one or more file IDs to an existing group.",
parameters={
"type": "object",
"properties": {
"groupId": {"type": "string", "description": "The group ID to add files to"},
"itemIds": {"type": "array", "items": {"type": "string"}, "description": "List of file IDs to add"},
"contextKey": {"type": "string", "description": "Grouping context key (default: 'files/list')"},
},
"required": ["groupId", "itemIds"]
},
readOnly=False
)
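# Illustrative sketch (not part of the module): argument payloads an agent could
# send for the group tools registered above. The IDs are made-up placeholders;
# only the shapes follow the parameter schemas.
_exampleGroupToolCalls = [
    {"name": "listGroups", "arguments": {}},
    {"name": "listItemsInGroup", "arguments": {"groupId": "g-123"}},
    {"name": "addItemsToGroup", "arguments": {"groupId": "g-123", "itemIds": ["file-a", "file-b"]}},
]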
registry.register(
"replaceInFile", _replaceInFile,
description=(

View file

@ -268,24 +268,19 @@ class AgentService:
info = chatService.getFileInfo(fid)
if not info:
folderInfo = chatService.interfaceDbComponent.getFolder(fid)
if folderInfo:
folderName = folderInfo.get("name", fid)
folderFiles = chatService.listFiles(folderId=fid)
desc = f"### Folder: {folderName}\n - id: {fid}\n - type: folder\n - contains: {len(folderFiles)} file(s)"
if folderFiles:
desc += "\n - files:"
for ff in folderFiles[:30]:
ffName = ff.get("fileName", "?")
ffId = ff.get("id", "?")
ffMime = ff.get("mimeType", "?")
ffSize = ff.get("fileSize", ff.get("size", "?"))
desc += f"\n * {ffName} (id: {ffId}, type: {ffMime}, size: {ffSize} bytes)"
if len(folderFiles) > 30:
desc += f"\n ... and {len(folderFiles) - 30} more files"
desc += f'\nUse `listFiles(folderId="{fid}")` to get the full file list, then `readFile(fileId)` to read individual files.'
fileDescriptions.append(desc)
continue
# Check if fid is a group ID
try:
groupFileIds = chatService.listFilesInGroup(fid)
if groupFileIds:
allGroups = chatService.listGroups()
groupInfo = next((g for g in allGroups if g.get("id") == fid), None)
groupName = groupInfo.get("name", fid) if groupInfo else fid
desc = f"### Group: {groupName}\n - id: {fid}\n - type: group\n - contains: {len(groupFileIds)} file(s)"
desc += f'\nUse `listItemsInGroup(groupId="{fid}")` to get file IDs, then `readFile(fileId)` to read each.'
fileDescriptions.append(desc)
continue
except Exception:
pass
fileDescriptions.append(f"### File id: {fid}")
continue
@ -333,7 +328,7 @@ class AgentService:
"These files/folders have been uploaded and processed through the extraction pipeline.\n"
"Use `readFile(fileId)` to read text content, `readContentObjects(fileId)` for structured access, "
"or `describeImage(fileId)` for image analysis.\n"
"For folders, use `listFiles(folderId)` to get the files inside, then `readFile(fileId)` for each.\n"
"For groups, use `listItemsInGroup(groupId)` to get the file IDs inside, then `readFile(fileId)` for each.\n"
"For large PDFs/DOCX, avoid huge `renderDocument` tool JSON: build markdown with "
"`writeFile` (create + append), then `renderDocument(sourceFileId=that file id, outputFormat=...)`.\n"
"For small docs you may pass `content` inline. Embed images with `![alt](file:fileId)` in markdown.\n\n"

View file

@ -169,11 +169,28 @@ class AiService:
if request.options and request.options.operationType == OperationTypeEnum.SPEECH_TEAMS:
return await self._handleSpeechTeams(request)
# FAIL-SAFE: Pre-flight billing validation (like 0 CHF credit card check)
self._preflightBillingCheck()
_opType = request.options.operationType if request.options else None
_isNeutralizationCall = _opType in (
OperationTypeEnum.NEUTRALIZATION_TEXT,
OperationTypeEnum.NEUTRALIZATION_IMAGE,
)
# Balance & provider permission checks
await self._checkBillingBeforeAiCall()
if not _isNeutralizationCall:
# FAIL-SAFE: Pre-flight billing validation (like 0 CHF credit card check)
self._preflightBillingCheck()
# Balance & provider permission checks
await self._checkBillingBeforeAiCall()
else:
# Neutralization calls are system-level operations (connector anonymization).
# They run without a mandate context (e.g. personal-scope connections) and
# are billed the same way as embedding calls: best-effort, skipped when no
# billing settings exist for an empty mandate.
logger.debug(
"callAi: skipping billing preflight for neutralization call "
"(operationType=%s, user=%s)",
_opType,
getattr(getattr(self.services, 'user', None), 'id', 'unknown'),
)
# Calculate effective allowedProviders: RBAC ∩ Workflow
effectiveProviders = self._calculateEffectiveProviders()
@ -227,8 +244,15 @@ class AiService:
Rehydration happens on the final AiCallResponse (not on individual str deltas).
"""
await self.ensureAiObjectsInitialized()
self._preflightBillingCheck()
await self._checkBillingBeforeAiCall()
_streamOpType = request.options.operationType if request.options else None
_isNeutralizationStream = _streamOpType in (
OperationTypeEnum.NEUTRALIZATION_TEXT,
OperationTypeEnum.NEUTRALIZATION_IMAGE,
)
if not _isNeutralizationStream:
self._preflightBillingCheck()
await self._checkBillingBeforeAiCall()
effectiveProviders = self._calculateEffectiveProviders()
if effectiveProviders and request.options:

View file

@ -413,7 +413,7 @@ class ChatService:
return None
def getFileInfo(self, fileId: str) -> Dict[str, Any]:
"""Get file information including new fields (tags, folderId, description, status)."""
"""Get file information including new fields (tags, description, status)."""
fileItem = self.interfaceDbComponent.getFile(fileId)
if fileItem:
return {
@ -424,7 +424,6 @@ class ChatService:
"fileHash": fileItem.fileHash,
"creationDate": fileItem.sysCreatedAt,
"tags": getattr(fileItem, "tags", None),
"folderId": getattr(fileItem, "folderId", None),
"description": getattr(fileItem, "description", None),
"status": getattr(fileItem, "status", None),
}
@ -443,14 +442,12 @@ class ChatService:
def listFiles(
self,
folderId: str = None,
tags: List[str] = None,
search: str = None,
) -> List[Dict[str, Any]]:
"""List files for the current user with optional filters.
Args:
folderId: Filter by folder (None = root / all).
tags: Filter by tags (any match).
search: Search in fileName and description.
@ -463,10 +460,6 @@ class ChatService:
allFiles = self.interfaceDbComponent.getAllFiles()
results = []
for fileItem in allFiles:
if folderId is not None:
if fileItem.get("folderId") != folderId:
continue
if tags:
itemTags = fileItem.get("tags") or []
if not any(t in itemTags for t in tags):
@ -486,27 +479,40 @@ class ChatService:
"fileSize": fileItem.get("fileSize"),
"creationDate": fileItem.get("sysCreatedAt"),
"tags": fileItem.get("tags"),
"folderId": fileItem.get("folderId"),
"description": fileItem.get("description"),
"status": fileItem.get("status"),
})
return results
def listFolders(self, parentId: str = None) -> List[Dict[str, Any]]:
"""List file folders for the current user.
def listGroups(self, contextKey: str = "files/list") -> list:
"""List all groups in the groupTree for the current context."""
try:
existing = self.interfaceDbApp.getTableGrouping(contextKey)
if not existing:
return []
def _flatten(nodes, depth=0):
result = []
for n in nodes:
nd = n.model_dump() if hasattr(n, "model_dump") else (n if isinstance(n, dict) else vars(n))
result.append({"id": nd.get("id"), "name": nd.get("name"), "depth": depth, "itemCount": len(nd.get("itemIds", []))})
result.extend(_flatten(nd.get("subGroups", []), depth + 1))
return result
return _flatten(existing.rootGroups)
except Exception:
return []
Args:
parentId: Optional parent folder ID to filter by.
None = return ALL folders (for tree building).
Returns:
List of folder dicts.
"""
return self.interfaceDbComponent.listFolders(parentId=parentId)
def createFolder(self, name: str, parentId: str = None) -> Dict[str, Any]:
"""Create a new file folder with unique name validation."""
return self.interfaceDbComponent.createFolder(name=name, parentId=parentId)
def listFilesInGroup(self, groupId: str, contextKey: str = "files/list") -> list:
"""List file IDs in a specific group (recursive)."""
try:
from modules.routes.routeHelpers import _collectItemIds
existing = self.interfaceDbApp.getTableGrouping(contextKey)
if not existing:
return []
nodes = [n.model_dump() if hasattr(n, "model_dump") else (n if isinstance(n, dict) else vars(n)) for n in existing.rootGroups]
ids = _collectItemIds(nodes, groupId)
return list(ids) if ids else []
except Exception:
return []
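# Illustrative sketch (not part of the module): how a caller might combine the
# two helpers above to print the grouping tree. It relies on the `depth` and
# `itemCount` fields returned by listGroups; names and IDs are whatever the
# grouping table holds.
def _printGroupTreeExample(chatService) -> None:
    for group in chatService.listGroups():
        fileIds = chatService.listFilesInGroup(group["id"])
        print(f"{'  ' * group['depth']}{group['name']}: {len(fileIds)} file(s) incl. sub-groups")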
# ---- DataSource CRUD ----

View file

@ -2,9 +2,13 @@
# All rights reserved.
"""Knowledge service: 3-tier RAG with indexing, semantic search, and context building."""
import hashlib
import json
import logging
import re
from typing import Any, Callable, Dict, List, Optional
import time
from dataclasses import dataclass, field
from typing import Any, Callable, Dict, List, Optional, Union
from modules.datamodels.datamodelKnowledge import (
FileContentIndex, ContentChunk, WorkflowMemory,
@ -20,6 +24,68 @@ DEFAULT_CHUNK_TOKENS = 400
DEFAULT_CONTEXT_BUDGET = 12000
# =============================================================================
# Ingestion façade (P0 of unified-knowledge-indexing concept)
# =============================================================================
@dataclass
class IngestionJob:
"""One request to add or refresh content in the unified knowledge store.
Callers from any lane (routes, feature hooks, agent tools, connector sync)
describe the work they want done via this object; idempotency, scope
resolution, and embedding are handled by KnowledgeService.requestIngestion.
"""
sourceKind: str
sourceId: str
fileName: str
mimeType: str
userId: str
contentObjects: List[Dict[str, Any]] = field(default_factory=list)
featureInstanceId: str = ""
mandateId: str = ""
structure: Optional[Dict[str, Any]] = None
containerPath: Optional[str] = None
contentVersion: Optional[str] = None
provenance: Optional[Dict[str, Any]] = None
# Connector-driven neutralization: True when the user opted in via §2.6 preferences.
# For sourceKind == "file", _indexFileInternal resolves this from FileItem.neutralize instead.
neutralize: bool = False
@dataclass
class IngestionHandle:
"""Result of requestIngestion. Stable across in-process and future queue impls."""
jobId: str
status: str
contentHash: str
fileId: str
index: Optional[FileContentIndex] = None
error: Optional[str] = None
def _computeIngestionHash(contentObjects: List[Dict[str, Any]]) -> str:
"""Deterministic SHA256 over (contentType, data) tuples in extractor order.
`contentObjectId` is intentionally excluded because extractors generate
fresh UUIDs per run (`uuid.uuid4()`), which would make the hash unstable
across re-extractions of the same source, defeating idempotency.
Order is preserved (no sort) because two different documents can share the
same multiset of parts but differ in arrangement (e.g. swapped pages).
Text whitespace is preserved intentionally because chunk boundaries
depend on it.
"""
normalized = [
(
str(o.get("contentType", "text") or "text"),
o.get("data", "") or "",
)
for o in (contentObjects or [])
]
payload = json.dumps(normalized, ensure_ascii=False, separators=(",", ":"))
return hashlib.sha256(payload.encode("utf-8")).hexdigest()
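# Illustrative sketch (not part of the module): re-extraction produces fresh
# contentObjectId UUIDs but identical (contentType, data) tuples, so the hash
# stays stable; the sample payloads are made up.
def _ingestionHashStabilityExample() -> bool:
    firstRun = [{"contentObjectId": "uuid-1", "contentType": "text", "data": "page 1"}]
    secondRun = [{"contentObjectId": "uuid-2", "contentType": "text", "data": "page 1"}]
    return _computeIngestionHash(firstRun) == _computeIngestionHash(secondRun)  # True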
class KnowledgeService:
"""Service for Knowledge Store operations: indexing, retrieval, and context building."""
@ -46,6 +112,224 @@ class KnowledgeService:
results = await self._embed([text])
return results[0] if results else []
# =========================================================================
# Ingestion façade (single entry point for all lanes)
# =========================================================================
async def requestIngestion(self, job: IngestionJob) -> IngestionHandle:
"""Unified entry point for filling the knowledge corpus.
Applies idempotency based on a content hash (or caller-supplied
`contentVersion`) persisted in `FileContentIndex.structure._ingestion`.
Re-runs indexing only when the hash differs or the previous run did
not reach `indexed` state. Runs embedding synchronously for now
(callers already schedule background tasks where needed).
"""
jobId = f"{job.sourceKind}:{job.sourceId}"
startMs = time.time()
contentHash = job.contentVersion or _computeIngestionHash(job.contentObjects)
# 1. Check for duplicate via existing FileContentIndex row.
existing = None
try:
existing = self._knowledgeDb.getFileContentIndex(job.sourceId)
except Exception:
existing = None
if existing:
existingStructure = (
existing.get("structure") if isinstance(existing, dict)
else getattr(existing, "structure", {})
) or {}
existingMeta = existingStructure.get("_ingestion", {}) or {}
existingStatus = (
existing.get("status") if isinstance(existing, dict)
else getattr(existing, "status", "")
) or ""
if existingMeta.get("hash") == contentHash and existingStatus == "indexed":
logger.info(
"ingestion.skipped.duplicate sourceKind=%s sourceId=%s hash=%s",
job.sourceKind, job.sourceId, contentHash[:12],
extra={
"event": "ingestion.skipped.duplicate",
"jobId": jobId,
"sourceKind": job.sourceKind,
"sourceId": job.sourceId,
"hash": contentHash,
"durationMs": int((time.time() - startMs) * 1000),
},
)
return IngestionHandle(
jobId=jobId,
status="duplicate",
contentHash=contentHash,
fileId=job.sourceId,
index=None,
)
# 2. Prepare ingestion metadata; stays in structure._ingestion so
# later connector revoke/purge can filter chunks by sourceKind /
# provenance.connectionId without a schema migration.
ingestionMeta = {
"hash": contentHash,
"sourceKind": job.sourceKind,
"sourceId": job.sourceId,
"contentVersion": job.contentVersion,
"indexedAt": getUtcTimestamp(),
"provenance": dict(job.provenance or {}),
}
structure = dict(job.structure or {})
structure["_ingestion"] = ingestionMeta
logger.info(
"ingestion.queued sourceKind=%s sourceId=%s objects=%d hash=%s",
job.sourceKind, job.sourceId, len(job.contentObjects or []), contentHash[:12],
extra={
"event": "ingestion.queued",
"jobId": jobId,
"sourceKind": job.sourceKind,
"sourceId": job.sourceId,
"hash": contentHash,
"objectCount": len(job.contentObjects or []),
},
)
# 3. Run real indexing.
try:
index = await self._indexFileInternal(
fileId=job.sourceId,
fileName=job.fileName,
mimeType=job.mimeType,
userId=job.userId,
featureInstanceId=job.featureInstanceId,
mandateId=job.mandateId,
contentObjects=job.contentObjects or [],
structure=structure,
containerPath=job.containerPath,
sourceKind=job.sourceKind,
connectionId=(job.provenance or {}).get("connectionId"),
neutralize=job.neutralize,
)
except Exception as exc:
logger.error(
"ingestion.failed sourceKind=%s sourceId=%s error=%s",
job.sourceKind, job.sourceId, exc,
exc_info=True,
extra={
"event": "ingestion.failed",
"jobId": jobId,
"sourceKind": job.sourceKind,
"sourceId": job.sourceId,
"hash": contentHash,
"error": str(exc),
"durationMs": int((time.time() - startMs) * 1000),
},
)
try:
self._knowledgeDb.updateFileStatus(job.sourceId, "failed")
except Exception:
pass
return IngestionHandle(
jobId=jobId,
status="failed",
contentHash=contentHash,
fileId=job.sourceId,
index=None,
error=str(exc),
)
logger.info(
"ingestion.indexed sourceKind=%s sourceId=%s objects=%d durationMs=%d",
job.sourceKind, job.sourceId, len(job.contentObjects or []),
int((time.time() - startMs) * 1000),
extra={
"event": "ingestion.indexed",
"jobId": jobId,
"sourceKind": job.sourceKind,
"sourceId": job.sourceId,
"hash": contentHash,
"objectCount": len(job.contentObjects or []),
"durationMs": int((time.time() - startMs) * 1000),
},
)
return IngestionHandle(
jobId=jobId,
status="indexed",
contentHash=contentHash,
fileId=job.sourceId,
index=index,
)
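# Illustrative sketch (not part of the module): how a connector sync lane might
# hand one item to the facade and branch on the returned handle. All field
# values are placeholders; `knowledgeService` is an initialised KnowledgeService.
async def _ingestConnectorItemExample(knowledgeService: "KnowledgeService") -> str:
    job = IngestionJob(
        sourceKind="sharepoint_item",
        sourceId="sp-item-123",
        fileName="Quarterly Report.docx",
        mimeType="application/vnd.openxmlformats-officedocument.wordprocessingml.document",
        userId="user-1",
        contentObjects=[{"contentObjectId": "c1", "contentType": "text", "data": "Q3 revenue grew 12%."}],
        contentVersion="etag-abc",  # e.g. a Graph eTag, used instead of a computed content hash
        provenance={"connectionId": "conn-42"},
    )
    handle = await knowledgeService.requestIngestion(job)
    # "duplicate" means the stored hash matched and the row was already indexed.
    return handle.status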
def purgeConnection(self, connectionId: str) -> Dict[str, int]:
"""Delete every FileContentIndex + ContentChunk linked to a UserConnection.
Called on `connection.revoked` events so the knowledge corpus never
holds chunks the user has withdrawn access to. Returns deletion counts
for observability.
"""
if not connectionId:
return {"indexRows": 0, "chunks": 0}
startMs = time.time()
result = self._knowledgeDb.deleteFileContentIndexByConnectionId(connectionId)
logger.info(
"ingestion.connection.purged connectionId=%s rows=%d chunks=%d durationMs=%d",
connectionId, result["indexRows"], result["chunks"],
int((time.time() - startMs) * 1000),
extra={
"event": "ingestion.connection.purged",
"connectionId": connectionId,
"indexRows": result["indexRows"],
"chunks": result["chunks"],
"durationMs": int((time.time() - startMs) * 1000),
},
)
return result
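# Illustrative sketch (not part of the module): a revoke handler could surface
# the deletion counts returned by purgeConnection; the connection ID is a placeholder.
def _revokeCleanupExample(knowledgeService: "KnowledgeService") -> str:
    counts = knowledgeService.purgeConnection("conn-42")
    return f"removed {counts['indexRows']} index rows and {counts['chunks']} chunks"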
def getIngestionStatus(
self, handleOrJobId: Union[IngestionHandle, str]
) -> Dict[str, Any]:
"""Map a handle or `sourceKind:sourceId` jobId to a status snapshot."""
if isinstance(handleOrJobId, IngestionHandle):
sourceId = handleOrJobId.fileId
jobId = handleOrJobId.jobId
elif isinstance(handleOrJobId, str) and ":" in handleOrJobId:
jobId = handleOrJobId
sourceId = handleOrJobId.split(":", 1)[1]
else:
jobId = str(handleOrJobId)
sourceId = str(handleOrJobId)
row = None
try:
row = self._knowledgeDb.getFileContentIndex(sourceId)
except Exception:
row = None
if not row:
return {
"jobId": jobId,
"sourceId": sourceId,
"status": "unknown",
"contentHash": None,
}
structure = (
row.get("structure") if isinstance(row, dict)
else getattr(row, "structure", {})
) or {}
meta = structure.get("_ingestion", {}) or {}
status = (
row.get("status") if isinstance(row, dict)
else getattr(row, "status", "")
) or "unknown"
return {
"jobId": jobId,
"sourceId": sourceId,
"status": status,
"contentHash": meta.get("hash"),
"sourceKind": meta.get("sourceKind"),
"indexedAt": meta.get("indexedAt"),
}
# =========================================================================
# File Indexing (called after extraction, before embedding)
# =========================================================================
@ -61,6 +345,57 @@ class KnowledgeService:
contentObjects: List[Dict[str, Any]] = None,
structure: Dict[str, Any] = None,
containerPath: str = None,
) -> Optional[FileContentIndex]:
"""Backward-compatible wrapper delegating to requestIngestion.
Existing callers that still invoke `indexFile` directly automatically
participate in the idempotency/metrics layer. New callers should
prefer `requestIngestion` so they can pass `sourceKind` and
`provenance` for connector revoke/purge later.
"""
job = IngestionJob(
sourceKind="file",
sourceId=fileId,
fileName=fileName,
mimeType=mimeType,
userId=userId,
featureInstanceId=featureInstanceId,
mandateId=mandateId,
contentObjects=list(contentObjects or []),
structure=structure,
containerPath=containerPath,
)
handle = await self.requestIngestion(job)
if handle.index is not None:
return handle.index
if handle.status == "duplicate":
row = None
try:
row = self._knowledgeDb.getFileContentIndex(fileId)
except Exception:
row = None
if isinstance(row, dict):
try:
return FileContentIndex(**row)
except Exception:
return None
return row
return None
async def _indexFileInternal(
self,
fileId: str,
fileName: str,
mimeType: str,
userId: str,
featureInstanceId: str = "",
mandateId: str = "",
contentObjects: List[Dict[str, Any]] = None,
structure: Dict[str, Any] = None,
containerPath: str = None,
sourceKind: str = "file",
connectionId: Optional[str] = None,
neutralize: bool = False,
) -> FileContentIndex:
"""Index a file's content objects and create embeddings for text chunks.
@ -83,39 +418,41 @@ class KnowledgeService:
"""
contentObjects = contentObjects or []
# 1. Resolve scope fields from FileItem (Single Source of Truth)
# FileItem lives in poweron_management; its scope/mandateId/featureInstanceId
# are authoritative and must be mirrored onto the FileContentIndex.
# 1. Resolve scope fields from FileItem (Single Source of Truth) for
# uploaded files. Connector-sourced ingestion (sharepoint_item,
# outlook_message, ...) has no FileItem row — trust the caller's
# scope + ids directly.
resolvedScope = "personal"
resolvedMandateId = mandateId
resolvedFeatureInstanceId = featureInstanceId
resolvedUserId = userId
_shouldNeutralize = False
try:
from modules.datamodels.datamodelFiles import FileItem as _FileItem
_dbComponent = getattr(self._context, "interfaceDbComponent", None)
_fileRecords = _dbComponent.getRecordset(_FileItem, recordFilter={"id": fileId}) if _dbComponent else []
if not _fileRecords:
from modules.interfaces.interfaceDbManagement import ComponentObjects
_row = ComponentObjects().db._loadRecord(_FileItem, fileId)
if _row:
_fileRecords = [_row]
if _fileRecords:
_fileRecord = _fileRecords[0]
_get = (lambda k, d=None: _fileRecord.get(k, d)) if isinstance(_fileRecord, dict) else (lambda k, d=None: getattr(_fileRecord, k, d))
_shouldNeutralize = bool(_get("neutralize", False))
_fileScope = _get("scope")
if _fileScope:
resolvedScope = _fileScope
if not resolvedMandateId:
resolvedMandateId = str(_get("mandateId", "") or "")
if not resolvedFeatureInstanceId:
resolvedFeatureInstanceId = str(_get("featureInstanceId", "") or "")
_fileCreatedBy = _get("sysCreatedBy")
if _fileCreatedBy:
resolvedUserId = str(_fileCreatedBy)
except Exception:
pass
_shouldNeutralize = neutralize # caller-supplied flag (connector prefs / IngestionJob)
if sourceKind == "file":
try:
from modules.datamodels.datamodelFiles import FileItem as _FileItem
_dbComponent = getattr(self._context, "interfaceDbComponent", None)
_fileRecords = _dbComponent.getRecordset(_FileItem, recordFilter={"id": fileId}) if _dbComponent else []
if not _fileRecords:
from modules.interfaces.interfaceDbManagement import ComponentObjects
_row = ComponentObjects().db._loadRecord(_FileItem, fileId)
if _row:
_fileRecords = [_row]
if _fileRecords:
_fileRecord = _fileRecords[0]
_get = (lambda k, d=None: _fileRecord.get(k, d)) if isinstance(_fileRecord, dict) else (lambda k, d=None: getattr(_fileRecord, k, d))
_shouldNeutralize = bool(_get("neutralize", False)) # FileItem is authoritative for uploads
_fileScope = _get("scope")
if _fileScope:
resolvedScope = _fileScope
if not resolvedMandateId:
resolvedMandateId = str(_get("mandateId", "") or "")
if not resolvedFeatureInstanceId:
resolvedFeatureInstanceId = str(_get("featureInstanceId", "") or "")
_fileCreatedBy = _get("sysCreatedBy")
if _fileCreatedBy:
resolvedUserId = str(_fileCreatedBy)
except Exception:
pass
# 2. Create FileContentIndex with correct scope from the start
index = FileContentIndex(
@ -124,6 +461,8 @@ class KnowledgeService:
featureInstanceId=resolvedFeatureInstanceId,
mandateId=resolvedMandateId,
scope=resolvedScope,
sourceKind=sourceKind,
connectionId=connectionId,
fileName=fileName,
mimeType=mimeType,
containerPath=containerPath,
@ -300,7 +639,12 @@ class KnowledgeService:
Formatted context string for injection into the agent's system prompt.
"""
queryVector = await self._embedSingle(currentPrompt)
logger.debug(
"buildAgentContext.start userId=%s featureInstanceId=%s mandateId=%s isSysAdmin=%s prompt=%r",
userId, featureInstanceId, mandateId, isSysAdmin, (currentPrompt or "")[:120],
)
if not queryVector:
logger.debug("buildAgentContext.abort reason=no_query_vector")
return ""
builder = _ContextBuilder(budget=contextBudget)
@ -327,9 +671,14 @@ class KnowledgeService:
featureInstanceId=featureInstanceId,
mandateId=mandateId,
limit=15,
minScore=0.65,
minScore=0.35,
isSysAdmin=isSysAdmin,
)
logger.debug(
"buildAgentContext.layer1 instanceChunks=%d top_scores=%s",
len(instanceChunks),
[round(float(c.get("_score", 0) or 0), 3) for c in (instanceChunks or [])[:3]],
)
if instanceChunks:
builder.add(priority=1, label="Relevant Documents", items=instanceChunks, maxChars=4000)
@ -338,7 +687,7 @@ class KnowledgeService:
queryVector=queryVector,
workflowId=workflowId,
limit=10,
minScore=0.55,
minScore=0.35,
)
if roundMemories:
memItems = []
@ -376,7 +725,7 @@ class KnowledgeService:
scope="mandate",
mandateId=mandateId,
limit=10,
minScore=0.7,
minScore=0.35,
isSysAdmin=isSysAdmin,
)
if mandateChunks:
@ -392,7 +741,12 @@ class KnowledgeService:
maxChars=500,
)
return builder.build()
_result = builder.build()
logger.debug(
"buildAgentContext.done totalChars=%d userId=%s",
len(_result), userId,
)
return _result
# =========================================================================
# Workflow Memory

View file

@ -0,0 +1,334 @@
# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
"""Connection-lifecycle consumer bridging OAuth events to ingestion jobs.
Subscribes to `connection.established` and `connection.revoked` callbacks
emitted by the OAuth callbacks / connection management routes and dispatches:
- `connection.established` -> enqueue a `connection.bootstrap` BackgroundJob
that walks the connector and ingests all reachable items via
KnowledgeService.requestIngestion (file-like or virtual documents).
- `connection.revoked` -> run `KnowledgeService.purgeConnection` synchronously
so the knowledge corpus releases the data before the UI confirms the revoke.
The consumer is registered once at process boot (see `app.py` lifespan).
It intentionally does NOT hold a per-user service context; each callback
creates whatever context it needs from the UserConnection row itself.
"""
from __future__ import annotations
import asyncio
import logging
from typing import Any, Dict, Optional
from modules.interfaces.interfaceDbKnowledge import getInterface as getKnowledgeInterface
from modules.shared.callbackRegistry import callbackRegistry
from modules.serviceCenter.services.serviceBackgroundJobs import (
registerJobHandler,
startJob,
)
logger = logging.getLogger(__name__)
BOOTSTRAP_JOB_TYPE = "connection.bootstrap"
_registered = False
def _onConnectionEstablished(
*,
connectionId: str,
authority: str,
userId: Optional[str] = None,
**kwargs: Any,
) -> None:
"""Fire-and-forget bootstrap enqueue for a freshly connected UserConnection."""
if not connectionId:
logger.warning("connection.established without connectionId; ignoring")
return
payload: Dict[str, Any] = {
"connectionId": connectionId,
"authority": (authority or "").lower(),
"userId": userId,
}
logger.info(
"ingestion.connection.bootstrap.queued connectionId=%s authority=%s",
connectionId, authority,
extra={
"event": "ingestion.connection.bootstrap.queued",
"connectionId": connectionId,
"authority": authority,
},
)
async def _enqueue() -> None:
try:
await startJob(
BOOTSTRAP_JOB_TYPE,
payload,
triggeredBy=userId,
)
except Exception as exc:
logger.error(
"ingestion.connection.bootstrap.enqueue_failed connectionId=%s error=%s",
connectionId, exc, exc_info=True,
)
try:
loop = asyncio.get_event_loop()
if loop.is_running():
loop.create_task(_enqueue())
else:
loop.run_until_complete(_enqueue())
except RuntimeError:
asyncio.run(_enqueue())
def _onConnectionRevoked(
*,
connectionId: str,
authority: Optional[str] = None,
userId: Optional[str] = None,
reason: Optional[str] = None,
**kwargs: Any,
) -> None:
"""Run the knowledge purge synchronously so UI feedback is authoritative."""
if not connectionId:
logger.warning("connection.revoked without connectionId; ignoring")
return
try:
# Purge lives on the DB interface to avoid ServiceCenter/user-context
# plumbing here; the service method is a thin wrapper on top of this.
result = getKnowledgeInterface(None).deleteFileContentIndexByConnectionId(connectionId)
except Exception as exc:
logger.error(
"ingestion.connection.purged.failed connectionId=%s error=%s",
connectionId, exc, exc_info=True,
)
return
logger.info(
"ingestion.connection.purged connectionId=%s authority=%s reason=%s rows=%d chunks=%d",
connectionId, authority, reason,
result.get("indexRows", 0), result.get("chunks", 0),
extra={
"event": "ingestion.connection.purged",
"connectionId": connectionId,
"authority": authority,
"reason": reason,
"indexRows": result.get("indexRows", 0),
"chunks": result.get("chunks", 0),
},
)
async def _bootstrapJobHandler(
job: Dict[str, Any],
progressCb,
) -> Dict[str, Any]:
"""Dispatch bootstrap by authority. Each authority runs its own sub-bootstraps."""
payload = job.get("payload") or {}
connectionId = payload.get("connectionId")
authority = (payload.get("authority") or "").lower()
if not connectionId:
raise ValueError("connection.bootstrap requires payload.connectionId")
progressCb(5, f"resolving {authority} connection")
# Defensive consent check: if the connection has since disabled knowledge ingestion
# (e.g. user toggled setting after the job was enqueued), skip all walkers.
try:
from modules.interfaces.interfaceDbApp import getRootInterface
_root = getRootInterface()
_conn = _root.getUserConnectionById(connectionId)
if _conn and not getattr(_conn, "knowledgeIngestionEnabled", True):
logger.info(
"ingestion.connection.bootstrap.skipped — consent disabled connectionId=%s",
connectionId,
extra={
"event": "ingestion.connection.bootstrap.skipped",
"connectionId": connectionId,
"authority": authority,
"reason": "consent_disabled",
},
)
return {"connectionId": connectionId, "authority": authority, "skipped": True, "reason": "consent_disabled"}
except Exception as _guardErr:
logger.debug("Could not load connection for consent guard: %s", _guardErr)
def _normalize(res: Any, label: str) -> Dict[str, Any]:
if isinstance(res, Exception):
logger.error(
"ingestion.connection.bootstrap.failed part=%s connectionId=%s error=%s",
label, connectionId, res, exc_info=res,
)
return {"error": str(res)}
return res or {}
if authority == "msft":
from modules.serviceCenter.services.serviceKnowledge.subConnectorSyncSharepoint import (
bootstrapSharepoint,
)
from modules.serviceCenter.services.serviceKnowledge.subConnectorSyncOutlook import (
bootstrapOutlook,
)
progressCb(10, "sharepoint + outlook")
spResult, olResult = await asyncio.gather(
bootstrapSharepoint(connectionId=connectionId, progressCb=progressCb),
bootstrapOutlook(connectionId=connectionId, progressCb=progressCb),
return_exceptions=True,
)
return {
"connectionId": connectionId,
"authority": authority,
"sharepoint": _normalize(spResult, "sharepoint"),
"outlook": _normalize(olResult, "outlook"),
}
if authority == "google":
from modules.serviceCenter.services.serviceKnowledge.subConnectorSyncGdrive import (
bootstrapGdrive,
)
from modules.serviceCenter.services.serviceKnowledge.subConnectorSyncGmail import (
bootstrapGmail,
)
progressCb(10, "drive + gmail")
gdResult, gmResult = await asyncio.gather(
bootstrapGdrive(connectionId=connectionId, progressCb=progressCb),
bootstrapGmail(connectionId=connectionId, progressCb=progressCb),
return_exceptions=True,
)
return {
"connectionId": connectionId,
"authority": authority,
"drive": _normalize(gdResult, "gdrive"),
"gmail": _normalize(gmResult, "gmail"),
}
if authority == "clickup":
from modules.serviceCenter.services.serviceKnowledge.subConnectorSyncClickup import (
bootstrapClickup,
)
progressCb(10, "clickup tasks")
cuResult = await bootstrapClickup(connectionId=connectionId, progressCb=progressCb)
return {
"connectionId": connectionId,
"authority": authority,
"clickup": _normalize(cuResult, "clickup"),
}
logger.info(
"ingestion.connection.bootstrap.skipped reason=unsupported_authority authority=%s connectionId=%s",
authority, connectionId,
extra={
"event": "ingestion.connection.bootstrap.skipped",
"authority": authority,
"connectionId": connectionId,
"reason": "unsupported_authority",
},
)
return {
"connectionId": connectionId,
"authority": authority,
"skipped": True,
"reason": "unsupported_authority",
}
async def _scheduledDailyResync() -> None:
"""Enqueue a connection.bootstrap job for every active knowledge connection.
Runs once per day (default 2 AM Europe/Zurich). Each job re-walks the
connector and hands new / changed items to KnowledgeService.requestIngestion.
Unchanged items are deduplicated by content-hash and skipped automatically.
"""
try:
from modules.interfaces.interfaceDbApp import getRootInterface
rootInterface = getRootInterface()
connections = rootInterface.getActiveKnowledgeConnections()
except Exception as exc:
logger.error("knowledge.daily_resync: could not load connections: %s", exc, exc_info=True)
return
if not connections:
logger.info("knowledge.daily_resync: no active knowledge connections — nothing to do")
return
logger.info(
"knowledge.daily_resync: enqueuing bootstrap for %d connection(s)",
len(connections),
extra={"event": "knowledge.daily_resync.started", "count": len(connections)},
)
enqueued = 0
skipped = 0
for conn in connections:
connectionId = str(conn.id)
authority = conn.authority.value if hasattr(conn.authority, "value") else str(conn.authority)
userId = str(conn.userId)
payload: Dict[str, Any] = {
"connectionId": connectionId,
"authority": authority.lower(),
"userId": userId,
}
try:
await startJob(
BOOTSTRAP_JOB_TYPE,
payload,
triggeredBy="scheduler.daily_resync",
)
enqueued += 1
logger.debug(
"knowledge.daily_resync: queued connectionId=%s authority=%s",
connectionId, authority,
)
except Exception as exc:
skipped += 1
logger.error(
"knowledge.daily_resync: failed to enqueue connectionId=%s: %s",
connectionId, exc,
)
logger.info(
"knowledge.daily_resync: done — enqueued=%d skipped=%d",
enqueued, skipped,
extra={"event": "knowledge.daily_resync.done", "enqueued": enqueued, "skipped": skipped},
)
def registerDailyResyncScheduler(*, hour: int = 2, minute: int = 0) -> None:
"""Register the daily knowledge re-sync cron job. Idempotent.
Args:
hour: Hour of day to run (0-23, default 2 = 2 AM Europe/Zurich).
minute: Minute within the hour (default 0).
"""
try:
from modules.shared.eventManagement import eventManager
eventManager.registerCron(
jobId="knowledge.daily_resync",
func=_scheduledDailyResync,
cronKwargs={"hour": str(hour), "minute": str(minute)},
)
logger.info(
"knowledge.daily_resync scheduler registered (daily %02d:%02d Europe/Zurich)",
hour, minute,
)
except Exception as exc:
logger.warning("knowledge.daily_resync scheduler registration failed (non-critical): %s", exc)
def registerKnowledgeIngestionConsumer() -> None:
"""Register callback subscribers + background job handler. Idempotent."""
global _registered
if _registered:
return
callbackRegistry.register("connection.established", _onConnectionEstablished)
callbackRegistry.register("connection.revoked", _onConnectionRevoked)
registerJobHandler(BOOTSTRAP_JOB_TYPE, _bootstrapJobHandler)
registerDailyResyncScheduler()
_registered = True
logger.info("KnowledgeIngestionConsumer registered (established/revoked + %s handler + daily resync)", BOOTSTRAP_JOB_TYPE)

View file

@ -0,0 +1,101 @@
"""Per-connection knowledge ingestion preference helpers.
Walkers call `loadConnectionPrefs(connectionId)` once at bootstrap start and
receive a `ConnectionIngestionPrefs` dataclass they can pass down into their
inner loops. All fields have safe defaults so walkers stay backward-compatible
with connections that predate the §2.6 preference schema (knowledgePreferences
is None).
"""
from __future__ import annotations
import logging
from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional
logger = logging.getLogger(__name__)
_DEFAULT_MAX_AGE_DAYS = 90
_DEFAULT_MAIL_DEPTH = "full"
_DEFAULT_CLICKUP_SCOPE = "title_description"
@dataclass
class ConnectionIngestionPrefs:
"""Parsed per-connection preferences for knowledge ingestion walkers."""
# PII
neutralizeBeforeEmbed: bool = False
# Mail (Outlook + Gmail)
mailContentDepth: str = _DEFAULT_MAIL_DEPTH # "metadata" | "snippet" | "full"
mailIndexAttachments: bool = False
# Files (Drive / SharePoint / OneDrive)
filesIndexBinaries: bool = True
mimeAllowlist: List[str] = field(default_factory=list) # empty = all allowed
# ClickUp
clickupScope: str = _DEFAULT_CLICKUP_SCOPE # "titles" | "title_description" | "with_comments"
clickupIndexAttachments: bool = False
# Per-authority surface toggles (default everything on)
gmailEnabled: bool = True
driveEnabled: bool = True
sharepointEnabled: bool = True
outlookEnabled: bool = True
# Time window
maxAgeDays: int = _DEFAULT_MAX_AGE_DAYS # 0 = no limit
def loadConnectionPrefs(connectionId: str) -> ConnectionIngestionPrefs:
"""Load and parse per-connection preferences from the database.
Returns safe defaults for any missing or unparseable values so walkers
never fail due to missing preference data.
"""
try:
from modules.interfaces.interfaceDbApp import getRootInterface
root = getRootInterface()
conn = root.getUserConnectionById(connectionId)
if not conn:
logger.debug("loadConnectionPrefs: connection %s not found, using defaults", connectionId)
return ConnectionIngestionPrefs()
raw: Optional[Dict[str, Any]] = getattr(conn, "knowledgePreferences", None)
if not raw or not isinstance(raw, dict):
return ConnectionIngestionPrefs()
def _bool(key: str, default: bool) -> bool:
v = raw.get(key)
return bool(v) if isinstance(v, bool) else default
def _str(key: str, allowed: List[str], default: str) -> str:
v = raw.get(key)
return v if v in allowed else default
def _int(key: str, default: int) -> int:
v = raw.get(key)
return int(v) if isinstance(v, int) else default
surface = raw.get("surfaceToggles") or {}
google_surf = surface.get("google") or {}
msft_surf = surface.get("msft") or {}
return ConnectionIngestionPrefs(
neutralizeBeforeEmbed=_bool("neutralizeBeforeEmbed", False),
mailContentDepth=_str("mailContentDepth", ["metadata", "snippet", "full"], _DEFAULT_MAIL_DEPTH),
mailIndexAttachments=_bool("mailIndexAttachments", False),
filesIndexBinaries=_bool("filesIndexBinaries", True),
mimeAllowlist=list(raw.get("mimeAllowlist") or []),
clickupScope=_str("clickupScope", ["titles", "title_description", "with_comments"], _DEFAULT_CLICKUP_SCOPE),
clickupIndexAttachments=_bool("clickupIndexAttachments", False),
gmailEnabled=bool(google_surf.get("gmail", True)),
driveEnabled=bool(google_surf.get("drive", True)),
sharepointEnabled=bool(msft_surf.get("sharepoint", True)),
outlookEnabled=bool(msft_surf.get("outlook", True)),
maxAgeDays=_int("maxAgeDays", _DEFAULT_MAX_AGE_DAYS),
)
except Exception as exc:
logger.warning("loadConnectionPrefs failed for %s, using defaults: %s", connectionId, exc)
return ConnectionIngestionPrefs()
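
This is how the walkers later in this commit map the prefs onto their limits dataclasses (shown standalone using `bootstrapGmail`'s mapping; the connection id is a placeholder and the snippet assumes the application package is importable):

from modules.serviceCenter.services.serviceKnowledge.subConnectorPrefs import loadConnectionPrefs
from modules.serviceCenter.services.serviceKnowledge.subConnectorSyncGmail import GmailBootstrapLimits

prefs = loadConnectionPrefs("11111111-1111-1111-1111-111111111111")  # placeholder id

# maxAgeDays == 0 means "no limit", which the limits dataclasses express as None.
limits = GmailBootstrapLimits(
    includeAttachments=prefs.mailIndexAttachments,
    maxAgeDays=prefs.maxAgeDays if prefs.maxAgeDays > 0 else None,
    mailContentDepth=prefs.mailContentDepth,
    neutralize=prefs.neutralizeBeforeEmbed,
)
print(limits)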

View file

@ -0,0 +1,512 @@
# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
"""ClickUp bootstrap for the unified knowledge ingestion lane.
ClickUp tasks are ingested as *virtual documents* we never download file
bytes. Each task becomes a `sourceKind="clickup_task"` IngestionJob whose
`contentObjects` carry a summary header (name + status + metadata) and the
task description / text content so retrieval finds them without a live API
call.
Hierarchy traversal: workspace (team) -> spaces -> folders / folderless lists ->
tasks. We cap the fan-out with `maxWorkspaces` / `maxListsPerWorkspace` /
`maxTasks` and skip tasks older than `maxAgeDays` (default 180 d).
Idempotency: `date_updated` from the ClickUp task payload is a millisecond
timestamp and strictly monotonic per revision; we use it as `contentVersion`.
"""
from __future__ import annotations
import hashlib
import logging
import time
from dataclasses import dataclass, field
from datetime import datetime, timedelta, timezone
from typing import Any, Callable, Dict, List, Optional
logger = logging.getLogger(__name__)
MAX_TASKS_DEFAULT = 500
MAX_WORKSPACES_DEFAULT = 3
MAX_LISTS_PER_WORKSPACE_DEFAULT = 20
MAX_DESCRIPTION_CHARS_DEFAULT = 8000
MAX_AGE_DAYS_DEFAULT = 180
@dataclass
class ClickupBootstrapLimits:
maxTasks: int = MAX_TASKS_DEFAULT
maxWorkspaces: int = MAX_WORKSPACES_DEFAULT
maxListsPerWorkspace: int = MAX_LISTS_PER_WORKSPACE_DEFAULT
maxDescriptionChars: int = MAX_DESCRIPTION_CHARS_DEFAULT
# Only ingest tasks updated within the last N days. None disables filter.
maxAgeDays: Optional[int] = MAX_AGE_DAYS_DEFAULT
# Include closed/archived tasks if they still meet the recency filter.
# ClickUp `closed` tasks often carry the most useful RAG context
# ("why was this shipped the way it was?").
includeClosed: bool = True
# Pass-through to IngestionJob.neutralize
neutralize: bool = False
# Content scope: "titles" | "title_description" | "with_comments"
clickupScope: str = "title_description"
@dataclass
class ClickupBootstrapResult:
connectionId: str
indexed: int = 0
skippedDuplicate: int = 0
skippedPolicy: int = 0
failed: int = 0
workspaces: int = 0
lists: int = 0
errors: List[str] = field(default_factory=list)
def _syntheticTaskId(connectionId: str, taskId: str) -> str:
token = hashlib.sha256(f"{connectionId}:{taskId}".encode("utf-8")).hexdigest()[:16]
return f"cu:{connectionId[:8]}:{token}"
def _truncate(value: Any, limit: int) -> str:
text = str(value or "").strip()
if not text:
return ""
if len(text) <= limit:
return text
return text[:limit].rstrip() + "\n[truncated]"
def _isRecent(dateUpdatedMs: Any, maxAgeDays: Optional[int]) -> bool:
if not maxAgeDays:
return True
if not dateUpdatedMs:
return True
try:
ts = datetime.fromtimestamp(int(dateUpdatedMs) / 1000.0, tz=timezone.utc)
except Exception:
return True
cutoff = datetime.now(timezone.utc) - timedelta(days=maxAgeDays)
return ts >= cutoff
def _buildContentObjects(task: Dict[str, Any], limits: ClickupBootstrapLimits) -> List[Dict[str, Any]]:
"""Header (name/status/metadata) + optional description + text_content.
`limits.clickupScope` controls how much is embedded:
- "titles": task name + status metadata only
- "title_description": header + description / text_content (default)
- "with_comments": header + description + text_content
(comments themselves are not yet fetched in v1)
"""
name = task.get("name") or f"Task {task.get('id', '')}"
status = ((task.get("status") or {}).get("status")) or ""
assignees = ", ".join(
filter(None, [
(a.get("username") or a.get("email") or "")
for a in (task.get("assignees") or [])
])
)
tags = ", ".join(filter(None, [t.get("name", "") for t in (task.get("tags") or [])]))
listInfo = task.get("list") or {}
folderInfo = task.get("folder") or {}
spaceInfo = task.get("space") or {}
dueMs = task.get("due_date")
dueIso = ""
if dueMs:
try:
dueIso = datetime.fromtimestamp(int(dueMs) / 1000.0, tz=timezone.utc).strftime("%Y-%m-%d")
except Exception:
dueIso = ""
headerLines = [
f"Task: {name}",
f"Status: {status}" if status else "",
f"List: {listInfo.get('name', '')}" if listInfo else "",
f"Folder: {folderInfo.get('name', '')}" if folderInfo else "",
f"Space: {spaceInfo.get('name', '')}" if spaceInfo else "",
f"Assignees: {assignees}" if assignees else "",
f"Tags: {tags}" if tags else "",
f"Due: {dueIso}" if dueIso else "",
f"Url: {task.get('url', '')}" if task.get("url") else "",
]
header = "\n".join(line for line in headerLines if line)
parts: List[Dict[str, Any]] = [{
"contentObjectId": "header",
"contentType": "text",
"data": header,
"contextRef": {"part": "header"},
}]
scope = getattr(limits, "clickupScope", "title_description")
if scope in ("title_description", "with_comments"):
description = _truncate(task.get("description"), limits.maxDescriptionChars)
if description:
parts.append({
"contentObjectId": "description",
"contentType": "text",
"data": description,
"contextRef": {"part": "description"},
})
# text_content is ClickUp's rendered-markdown version; include if it adds
# something beyond the plain description (common for bullet lists, checklists).
textContent = _truncate(task.get("text_content"), limits.maxDescriptionChars)
if textContent and textContent != description:
parts.append({
"contentObjectId": "text_content",
"contentType": "text",
"data": textContent,
"contextRef": {"part": "text_content"},
})
return parts
async def bootstrapClickup(
connectionId: str,
*,
progressCb: Optional[Callable[[int, Optional[str]], None]] = None,
adapter: Any = None,
connection: Any = None,
knowledgeService: Any = None,
limits: Optional[ClickupBootstrapLimits] = None,
) -> Dict[str, Any]:
"""Walk workspaces → lists → tasks and ingest each task as a virtual doc."""
from modules.serviceCenter.services.serviceKnowledge.subConnectorPrefs import loadConnectionPrefs
prefs = loadConnectionPrefs(connectionId)
if not limits:
limits = ClickupBootstrapLimits(
maxAgeDays=prefs.maxAgeDays if prefs.maxAgeDays > 0 else None,
neutralize=prefs.neutralizeBeforeEmbed,
clickupScope=prefs.clickupScope,
)
startMs = time.time()
result = ClickupBootstrapResult(connectionId=connectionId)
logger.info(
"ingestion.connection.bootstrap.started part=clickup connectionId=%s",
connectionId,
extra={
"event": "ingestion.connection.bootstrap.started",
"part": "clickup",
"connectionId": connectionId,
},
)
if adapter is None or knowledgeService is None or connection is None:
adapter, connection, knowledgeService = await _resolveDependencies(connectionId)
mandateId = str(getattr(connection, "mandateId", "") or "") if connection is not None else ""
userId = str(getattr(connection, "userId", "") or "") if connection is not None else ""
svc = getattr(adapter, "_svc", None)
if svc is None:
result.errors.append("adapter missing _svc instance")
return _finalizeResult(connectionId, result, startMs)
try:
teamsResp = await svc.getAuthorizedTeams()
except Exception as exc:
logger.error("clickup team discovery failed for %s: %s", connectionId, exc, exc_info=True)
result.errors.append(f"teams: {exc}")
return _finalizeResult(connectionId, result, startMs)
teams = (teamsResp or {}).get("teams") or []
for team in teams[: limits.maxWorkspaces]:
if result.indexed + result.skippedDuplicate >= limits.maxTasks:
break
teamId = str(team.get("id", "") or "")
if not teamId:
continue
result.workspaces += 1
try:
await _walkTeam(
svc=svc,
knowledgeService=knowledgeService,
connectionId=connectionId,
mandateId=mandateId,
userId=userId,
team=team,
limits=limits,
result=result,
progressCb=progressCb,
)
except Exception as exc:
logger.error("clickup team %s walk failed: %s", teamId, exc, exc_info=True)
result.errors.append(f"team({teamId}): {exc}")
return _finalizeResult(connectionId, result, startMs)
async def _resolveDependencies(connectionId: str):
from modules.interfaces.interfaceDbApp import getRootInterface
from modules.auth import TokenManager
from modules.connectors.providerClickup.connectorClickup import ClickupConnector
from modules.serviceCenter import getService
from modules.serviceCenter.context import ServiceCenterContext
from modules.security.rootAccess import getRootUser
rootInterface = getRootInterface()
connection = rootInterface.getUserConnectionById(connectionId)
if connection is None:
raise ValueError(f"UserConnection not found: {connectionId}")
token = TokenManager().getFreshToken(connectionId)
if not token or not token.tokenAccess:
raise ValueError(f"No valid token for connection {connectionId}")
provider = ClickupConnector(connection, token.tokenAccess)
adapter = provider.getServiceAdapter("clickup")
rootUser = getRootUser()
ctx = ServiceCenterContext(
user=rootUser,
mandate_id=str(getattr(connection, "mandateId", "") or ""),
)
knowledgeService = getService("knowledge", ctx)
return adapter, connection, knowledgeService
async def _walkTeam(
*,
svc,
knowledgeService,
connectionId: str,
mandateId: str,
userId: str,
team: Dict[str, Any],
limits: ClickupBootstrapLimits,
result: ClickupBootstrapResult,
progressCb: Optional[Callable[[int, Optional[str]], None]],
) -> None:
teamId = str(team.get("id", "") or "")
spacesResp = await svc.getSpaces(teamId)
spaces = (spacesResp or {}).get("spaces") or []
listsCollected: List[Dict[str, Any]] = []
for space in spaces:
if len(listsCollected) >= limits.maxListsPerWorkspace:
break
spaceId = str(space.get("id", "") or "")
if not spaceId:
continue
# Folderless lists directly under the space
folderless = await svc.getFolderlessLists(spaceId)
for lst in (folderless or {}).get("lists") or []:
if len(listsCollected) >= limits.maxListsPerWorkspace:
break
listsCollected.append({**lst, "_space": space})
# Lists inside folders
foldersResp = await svc.getFolders(spaceId)
for folder in (foldersResp or {}).get("folders") or []:
if len(listsCollected) >= limits.maxListsPerWorkspace:
break
folderId = str(folder.get("id", "") or "")
if not folderId:
continue
folderLists = await svc.getListsInFolder(folderId)
for lst in (folderLists or {}).get("lists") or []:
if len(listsCollected) >= limits.maxListsPerWorkspace:
break
listsCollected.append({**lst, "_space": space, "_folder": folder})
for lst in listsCollected:
if result.indexed + result.skippedDuplicate >= limits.maxTasks:
return
result.lists += 1
await _walkList(
svc=svc,
knowledgeService=knowledgeService,
connectionId=connectionId,
mandateId=mandateId,
userId=userId,
teamId=teamId,
lst=lst,
limits=limits,
result=result,
progressCb=progressCb,
)
async def _walkList(
*,
svc,
knowledgeService,
connectionId: str,
mandateId: str,
userId: str,
teamId: str,
lst: Dict[str, Any],
limits: ClickupBootstrapLimits,
result: ClickupBootstrapResult,
progressCb: Optional[Callable[[int, Optional[str]], None]],
) -> None:
listId = str(lst.get("id", "") or "")
if not listId:
return
page = 0
while result.indexed + result.skippedDuplicate < limits.maxTasks:
resp = await svc.getTasksInList(
listId,
page=page,
include_closed=limits.includeClosed,
subtasks=True,
)
if isinstance(resp, dict) and resp.get("error"):
logger.warning("clickup tasks list=%s page=%d error: %s", listId, page, resp.get("error"))
result.errors.append(f"list({listId}): {resp.get('error')}")
return
tasks = (resp or {}).get("tasks") or []
if not tasks:
return
for task in tasks:
if result.indexed + result.skippedDuplicate >= limits.maxTasks:
return
if not _isRecent(task.get("date_updated"), limits.maxAgeDays):
result.skippedPolicy += 1
continue
# Inject the list/folder/space metadata we already loaded.
task["list"] = task.get("list") or {"id": listId, "name": lst.get("name")}
task["folder"] = task.get("folder") or lst.get("_folder") or {}
task["space"] = task.get("space") or lst.get("_space") or {}
await _ingestTask(
knowledgeService=knowledgeService,
connectionId=connectionId,
mandateId=mandateId,
userId=userId,
teamId=teamId,
task=task,
limits=limits,
result=result,
progressCb=progressCb,
)
if len(tasks) < 100: # ClickUp page-size hint: fewer than 100 => last page
return
page += 1
async def _ingestTask(
*,
knowledgeService,
connectionId: str,
mandateId: str,
userId: str,
teamId: str,
task: Dict[str, Any],
limits: ClickupBootstrapLimits,
result: ClickupBootstrapResult,
progressCb: Optional[Callable[[int, Optional[str]], None]],
) -> None:
from modules.serviceCenter.services.serviceKnowledge.mainServiceKnowledge import IngestionJob
taskId = str(task.get("id", "") or "")
if not taskId:
result.skippedPolicy += 1
return
revision = str(task.get("date_updated") or task.get("date_created") or "")
name = task.get("name") or f"Task {taskId}"
syntheticId = _syntheticTaskId(connectionId, taskId)
fileName = f"{name[:80].strip() or taskId}.task.json"
contentObjects = _buildContentObjects(task, limits)
try:
handle = await knowledgeService.requestIngestion(
IngestionJob(
sourceKind="clickup_task",
sourceId=syntheticId,
fileName=fileName,
mimeType="application/vnd.clickup.task+json",
userId=userId,
mandateId=mandateId,
contentObjects=contentObjects,
contentVersion=revision or None,
neutralize=limits.neutralize,
provenance={
"connectionId": connectionId,
"authority": "clickup",
"service": "clickup",
"externalItemId": taskId,
"teamId": teamId,
"listId": ((task.get("list") or {}).get("id")),
"spaceId": ((task.get("space") or {}).get("id")),
"url": task.get("url"),
"status": ((task.get("status") or {}).get("status")),
"tier": limits.clickupScope,
},
)
)
except Exception as exc:
logger.error("clickup ingestion %s failed: %s", taskId, exc, exc_info=True)
result.failed += 1
result.errors.append(f"ingest({taskId}): {exc}")
return
if handle.status == "duplicate":
result.skippedDuplicate += 1
elif handle.status == "indexed":
result.indexed += 1
else:
result.failed += 1
if progressCb is not None and (result.indexed + result.skippedDuplicate) % 50 == 0:
processed = result.indexed + result.skippedDuplicate
try:
progressCb(
min(90, 10 + int(80 * processed / max(1, limits.maxTasks))),
f"clickup processed={processed}",
)
except Exception:
pass
logger.info(
"ingestion.connection.bootstrap.progress part=clickup processed=%d skippedDup=%d failed=%d",
processed, result.skippedDuplicate, result.failed,
extra={
"event": "ingestion.connection.bootstrap.progress",
"part": "clickup",
"connectionId": connectionId,
"processed": processed,
"skippedDup": result.skippedDuplicate,
"failed": result.failed,
},
)
def _finalizeResult(connectionId: str, result: ClickupBootstrapResult, startMs: float) -> Dict[str, Any]:
durationMs = int((time.time() - startMs) * 1000)
logger.info(
"ingestion.connection.bootstrap.done part=clickup connectionId=%s indexed=%d skippedDup=%d skippedPolicy=%d failed=%d workspaces=%d lists=%d durationMs=%d",
connectionId,
result.indexed, result.skippedDuplicate, result.skippedPolicy,
result.failed, result.workspaces, result.lists, durationMs,
extra={
"event": "ingestion.connection.bootstrap.done",
"part": "clickup",
"connectionId": connectionId,
"indexed": result.indexed,
"skippedDup": result.skippedDuplicate,
"skippedPolicy": result.skippedPolicy,
"failed": result.failed,
"workspaces": result.workspaces,
"lists": result.lists,
"durationMs": durationMs,
},
)
return {
"connectionId": result.connectionId,
"indexed": result.indexed,
"skippedDuplicate": result.skippedDuplicate,
"skippedPolicy": result.skippedPolicy,
"failed": result.failed,
"workspaces": result.workspaces,
"lists": result.lists,
"durationMs": durationMs,
"errors": result.errors[:20],
}
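
The synthetic `sourceId` scheme is deterministic, which is what lets reruns land on the same knowledge row (and, together with `contentVersion = date_updated`, dedupe unchanged tasks). A standalone sketch with made-up ids:

import hashlib

connectionId = "3f2a9c10-0000-0000-0000-000000000000"  # made-up connection id
taskId = "86czkq123"                                    # made-up ClickUp task id

token = hashlib.sha256(f"{connectionId}:{taskId}".encode("utf-8")).hexdigest()[:16]
sourceId = f"cu:{connectionId[:8]}:{token}"
print(sourceId)  # cu:3f2a9c10:<first 16 hex chars of the sha256>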

View file

@ -0,0 +1,443 @@
# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
"""Google Drive bootstrap for the unified knowledge ingestion lane.
Mirrors the SharePoint pilot (see subConnectorSyncSharepoint.py). Walks the
user's *My Drive* tree from the virtual `root` folder, downloads each
file-like item via `DriveAdapter.download` (which handles native Google docs
via export), runs the standard extraction pipeline and routes results through
`KnowledgeService.requestIngestion` with `sourceKind="gdrive_item"` and
`contentVersion = modifiedTime` (monotonic per-revision).
"""
from __future__ import annotations
import hashlib
import logging
import time
from dataclasses import dataclass, field
from datetime import datetime, timedelta, timezone
from typing import Any, Callable, Dict, List, Optional
from modules.datamodels.datamodelExtraction import ExtractionOptions
logger = logging.getLogger(__name__)
MAX_ITEMS_DEFAULT = 500
MAX_BYTES_DEFAULT = 200 * 1024 * 1024
MAX_FILE_SIZE_DEFAULT = 25 * 1024 * 1024
SKIP_MIME_PREFIXES_DEFAULT = ("video/", "audio/")
MAX_DEPTH_DEFAULT = 4
MAX_AGE_DAYS_DEFAULT = 365
# Google Drive uses virtual mime-types for folders and non-downloadable assets.
FOLDER_MIME = "application/vnd.google-apps.folder"
@dataclass
class GdriveBootstrapLimits:
maxItems: int = MAX_ITEMS_DEFAULT
maxBytes: int = MAX_BYTES_DEFAULT
maxFileSize: int = MAX_FILE_SIZE_DEFAULT
skipMimePrefixes: tuple = SKIP_MIME_PREFIXES_DEFAULT
maxDepth: int = MAX_DEPTH_DEFAULT
# Only ingest files modified within the last N days. None disables filter.
maxAgeDays: Optional[int] = MAX_AGE_DAYS_DEFAULT
# Pass-through to IngestionJob.neutralize
neutralize: bool = False
# Whether to skip binary/non-text files
filesIndexBinaries: bool = True
@dataclass
class GdriveBootstrapResult:
connectionId: str
indexed: int = 0
skippedDuplicate: int = 0
skippedPolicy: int = 0
failed: int = 0
bytesProcessed: int = 0
errors: List[str] = field(default_factory=list)
def _syntheticFileId(connectionId: str, externalItemId: str) -> str:
token = hashlib.sha256(f"{connectionId}:{externalItemId}".encode("utf-8")).hexdigest()[:16]
return f"gd:{connectionId[:8]}:{token}"
def _toContentObjects(extracted, fileName: str) -> List[Dict[str, Any]]:
parts = getattr(extracted, "parts", None) or []
out: List[Dict[str, Any]] = []
for part in parts:
data = getattr(part, "data", None) or ""
if not data or not str(data).strip():
continue
typeGroup = getattr(part, "typeGroup", "text") or "text"
contentType = "text"
if typeGroup == "image":
contentType = "image"
elif typeGroup in ("binary", "container"):
contentType = "other"
out.append({
"contentObjectId": getattr(part, "id", ""),
"contentType": contentType,
"data": data,
"contextRef": {
"containerPath": fileName,
"location": getattr(part, "label", None) or "file",
**(getattr(part, "metadata", None) or {}),
},
})
return out
def _isRecent(modifiedIso: Optional[str], maxAgeDays: Optional[int]) -> bool:
if not maxAgeDays:
return True
if not modifiedIso:
# No timestamp -> be permissive (Drive native docs sometimes omit it on export).
return True
try:
# Google returns RFC 3339 with `Z` or offset; python 3.11+ parses both.
ts = datetime.fromisoformat(modifiedIso.replace("Z", "+00:00"))
except Exception:
return True
cutoff = datetime.now(timezone.utc) - timedelta(days=maxAgeDays)
if ts.tzinfo is None:
ts = ts.replace(tzinfo=timezone.utc)
return ts >= cutoff
async def bootstrapGdrive(
connectionId: str,
*,
progressCb: Optional[Callable[[int, Optional[str]], None]] = None,
adapter: Any = None,
connection: Any = None,
knowledgeService: Any = None,
limits: Optional[GdriveBootstrapLimits] = None,
runExtractionFn: Optional[Callable[..., Any]] = None,
) -> Dict[str, Any]:
"""Walk My Drive starting from the virtual root folder."""
from modules.serviceCenter.services.serviceKnowledge.subConnectorPrefs import loadConnectionPrefs
prefs = loadConnectionPrefs(connectionId)
if not limits:
limits = GdriveBootstrapLimits(
maxAgeDays=prefs.maxAgeDays if prefs.maxAgeDays > 0 else None,
neutralize=prefs.neutralizeBeforeEmbed,
filesIndexBinaries=prefs.filesIndexBinaries,
)
startMs = time.time()
result = GdriveBootstrapResult(connectionId=connectionId)
logger.info(
"ingestion.connection.bootstrap.started part=gdrive connectionId=%s",
connectionId,
extra={
"event": "ingestion.connection.bootstrap.started",
"part": "gdrive",
"connectionId": connectionId,
},
)
if adapter is None or knowledgeService is None or connection is None:
adapter, connection, knowledgeService = await _resolveDependencies(connectionId)
if runExtractionFn is None:
from modules.serviceCenter.services.serviceExtraction.subPipeline import runExtraction
from modules.serviceCenter.services.serviceExtraction.subRegistry import (
ExtractorRegistry, ChunkerRegistry,
)
extractorRegistry = ExtractorRegistry()
chunkerRegistry = ChunkerRegistry()
def runExtractionFn(bytesData, name, mime, options): # type: ignore[no-redef]
return runExtraction(extractorRegistry, chunkerRegistry, bytesData, name, mime, options)
mandateId = str(getattr(connection, "mandateId", "") or "") if connection is not None else ""
userId = str(getattr(connection, "userId", "") or "") if connection is not None else ""
try:
await _walkFolder(
adapter=adapter,
knowledgeService=knowledgeService,
runExtractionFn=runExtractionFn,
connectionId=connectionId,
mandateId=mandateId,
userId=userId,
folderPath="/", # DriveAdapter.browse maps "" / "/" -> "root"
depth=0,
limits=limits,
result=result,
progressCb=progressCb,
)
except Exception as exc:
logger.error("gdrive walk failed for %s: %s", connectionId, exc, exc_info=True)
result.errors.append(f"walk: {exc}")
return _finalizeResult(connectionId, result, startMs)
async def _resolveDependencies(connectionId: str):
from modules.interfaces.interfaceDbApp import getRootInterface
from modules.auth import TokenManager
from modules.connectors.providerGoogle.connectorGoogle import GoogleConnector
from modules.serviceCenter import getService
from modules.serviceCenter.context import ServiceCenterContext
from modules.security.rootAccess import getRootUser
rootInterface = getRootInterface()
connection = rootInterface.getUserConnectionById(connectionId)
if connection is None:
raise ValueError(f"UserConnection not found: {connectionId}")
token = TokenManager().getFreshToken(connectionId)
if not token or not token.tokenAccess:
raise ValueError(f"No valid token for connection {connectionId}")
provider = GoogleConnector(connection, token.tokenAccess)
adapter = provider.getServiceAdapter("drive")
rootUser = getRootUser()
ctx = ServiceCenterContext(
user=rootUser,
mandate_id=str(getattr(connection, "mandateId", "") or ""),
)
knowledgeService = getService("knowledge", ctx)
return adapter, connection, knowledgeService
async def _walkFolder(
*,
adapter,
knowledgeService,
runExtractionFn,
connectionId: str,
mandateId: str,
userId: str,
folderPath: str,
depth: int,
limits: GdriveBootstrapLimits,
result: GdriveBootstrapResult,
progressCb: Optional[Callable[[int, Optional[str]], None]],
) -> None:
if depth > limits.maxDepth:
return
try:
entries = await adapter.browse(folderPath)
except Exception as exc:
logger.warning("gdrive browse %s failed: %s", folderPath, exc)
result.errors.append(f"browse({folderPath}): {exc}")
return
for entry in entries:
if result.indexed + result.skippedDuplicate >= limits.maxItems:
return
if result.bytesProcessed >= limits.maxBytes:
return
entryPath = getattr(entry, "path", "") or ""
metadata = getattr(entry, "metadata", {}) or {}
mimeType = getattr(entry, "mimeType", None) or metadata.get("mimeType")
if getattr(entry, "isFolder", False) or mimeType == FOLDER_MIME:
await _walkFolder(
adapter=adapter,
knowledgeService=knowledgeService,
runExtractionFn=runExtractionFn,
connectionId=connectionId,
mandateId=mandateId,
userId=userId,
folderPath=entryPath,
depth=depth + 1,
limits=limits,
result=result,
progressCb=progressCb,
)
continue
effectiveMime = mimeType or "application/octet-stream"
if any(effectiveMime.startswith(prefix) for prefix in limits.skipMimePrefixes):
result.skippedPolicy += 1
continue
size = int(getattr(entry, "size", 0) or 0)
if size and size > limits.maxFileSize:
result.skippedPolicy += 1
continue
modifiedTime = metadata.get("modifiedTime")
if not _isRecent(modifiedTime, limits.maxAgeDays):
result.skippedPolicy += 1
continue
externalItemId = metadata.get("id") or entryPath
revision = modifiedTime
await _ingestOne(
adapter=adapter,
knowledgeService=knowledgeService,
runExtractionFn=runExtractionFn,
connectionId=connectionId,
mandateId=mandateId,
userId=userId,
entry=entry,
entryPath=entryPath,
mimeType=effectiveMime,
externalItemId=externalItemId,
revision=revision,
limits=limits,
result=result,
progressCb=progressCb,
)
async def _ingestOne(
*,
adapter,
knowledgeService,
runExtractionFn,
connectionId: str,
mandateId: str,
userId: str,
entry,
entryPath: str,
mimeType: str,
externalItemId: str,
revision: Optional[str],
limits: GdriveBootstrapLimits,
result: GdriveBootstrapResult,
progressCb: Optional[Callable[[int, Optional[str]], None]],
) -> None:
from modules.serviceCenter.services.serviceKnowledge.mainServiceKnowledge import IngestionJob
syntheticFileId = _syntheticFileId(connectionId, externalItemId)
fileName = getattr(entry, "name", "") or externalItemId
try:
downloaded = await adapter.download(entryPath)
except Exception as exc:
logger.warning("gdrive download %s failed: %s", entryPath, exc)
result.failed += 1
result.errors.append(f"download({entryPath}): {exc}")
return
# Adapter.download returns raw bytes today; guard DownloadResult shape too.
fileBytes: bytes
if isinstance(downloaded, (bytes, bytearray)):
fileBytes = bytes(downloaded)
else:
fileBytes = bytes(getattr(downloaded, "data", b"") or b"")
if getattr(downloaded, "mimeType", None):
mimeType = downloaded.mimeType # export may have changed the type
if not fileBytes:
result.failed += 1
return
if len(fileBytes) > limits.maxFileSize:
result.skippedPolicy += 1
return
result.bytesProcessed += len(fileBytes)
try:
extracted = runExtractionFn(
fileBytes, fileName, mimeType,
ExtractionOptions(mergeStrategy=None),
)
except Exception as exc:
logger.warning("gdrive extraction %s failed: %s", entryPath, exc)
result.failed += 1
result.errors.append(f"extract({entryPath}): {exc}")
return
contentObjects = _toContentObjects(extracted, fileName)
if not contentObjects:
result.skippedPolicy += 1
return
try:
handle = await knowledgeService.requestIngestion(
IngestionJob(
sourceKind="gdrive_item",
sourceId=syntheticFileId,
fileName=fileName,
mimeType=mimeType,
userId=userId,
mandateId=mandateId,
contentObjects=contentObjects,
contentVersion=revision,
neutralize=limits.neutralize,
provenance={
"connectionId": connectionId,
"authority": "google",
"service": "drive",
"externalItemId": externalItemId,
"entryPath": entryPath,
"tier": "body",
},
)
)
except Exception as exc:
logger.error("gdrive ingestion %s failed: %s", entryPath, exc, exc_info=True)
result.failed += 1
result.errors.append(f"ingest({entryPath}): {exc}")
return
if handle.status == "duplicate":
result.skippedDuplicate += 1
elif handle.status == "indexed":
result.indexed += 1
else:
result.failed += 1
if progressCb is not None and (result.indexed + result.skippedDuplicate) % 50 == 0:
processed = result.indexed + result.skippedDuplicate
try:
progressCb(
min(90, 10 + int(80 * processed / max(1, limits.maxItems))),
f"gdrive processed={processed}",
)
except Exception:
pass
logger.info(
"ingestion.connection.bootstrap.progress part=gdrive processed=%d skippedDup=%d failed=%d",
processed, result.skippedDuplicate, result.failed,
extra={
"event": "ingestion.connection.bootstrap.progress",
"part": "gdrive",
"connectionId": connectionId,
"processed": processed,
"skippedDup": result.skippedDuplicate,
"failed": result.failed,
},
)
def _finalizeResult(connectionId: str, result: GdriveBootstrapResult, startMs: float) -> Dict[str, Any]:
durationMs = int((time.time() - startMs) * 1000)
logger.info(
"ingestion.connection.bootstrap.done part=gdrive connectionId=%s indexed=%d skippedDup=%d skippedPolicy=%d failed=%d bytes=%d durationMs=%d",
connectionId,
result.indexed, result.skippedDuplicate, result.skippedPolicy,
result.failed, result.bytesProcessed, durationMs,
extra={
"event": "ingestion.connection.bootstrap.done",
"part": "gdrive",
"connectionId": connectionId,
"indexed": result.indexed,
"skippedDup": result.skippedDuplicate,
"skippedPolicy": result.skippedPolicy,
"failed": result.failed,
"bytes": result.bytesProcessed,
"durationMs": durationMs,
},
)
return {
"connectionId": result.connectionId,
"indexed": result.indexed,
"skippedDuplicate": result.skippedDuplicate,
"skippedPolicy": result.skippedPolicy,
"failed": result.failed,
"bytesProcessed": result.bytesProcessed,
"durationMs": durationMs,
"errors": result.errors[:20],
}
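
The Drive recency filter normalises the trailing `Z` before calling `datetime.fromisoformat`, so it also works on Python versions older than 3.11. A standalone sketch with a made-up `modifiedTime`:

from datetime import datetime, timedelta, timezone

modifiedIso = "2025-11-03T09:30:00Z"  # made-up Drive modifiedTime
maxAgeDays = 365

ts = datetime.fromisoformat(modifiedIso.replace("Z", "+00:00"))
if ts.tzinfo is None:
    ts = ts.replace(tzinfo=timezone.utc)
cutoff = datetime.now(timezone.utc) - timedelta(days=maxAgeDays)
print("recent enough:", ts >= cutoff)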

View file

@ -0,0 +1,606 @@
# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
"""Gmail bootstrap for the unified knowledge ingestion lane.
Mirrors the Outlook pilot (see subConnectorSyncOutlook.py) but talks to Google
Mail's REST API. Messages become `sourceKind="gmail_message"` virtual documents
with header / snippet / cleaned body content-objects; attachments are optional
child jobs with `sourceKind="gmail_attachment"`.
Idempotency: Gmail's stable `historyId` (or `internalDate` as fallback) is
passed as `contentVersion`, so rerunning the bootstrap yields
`ingestion.skipped.duplicate` for unchanged messages.
"""
from __future__ import annotations
import asyncio
import base64
import hashlib
import logging
import time
from dataclasses import dataclass, field
from datetime import datetime, timedelta, timezone
from typing import Any, Callable, Dict, List, Optional
from modules.serviceCenter.services.serviceKnowledge.subTextClean import cleanEmailBody
logger = logging.getLogger(__name__)
MAX_MESSAGES_DEFAULT = 500
MAX_BODY_CHARS_DEFAULT = 8000
MAX_ATTACHMENT_BYTES_DEFAULT = 10 * 1024 * 1024
DEFAULT_LABELS = ("INBOX", "SENT")
@dataclass
class GmailBootstrapLimits:
maxMessages: int = MAX_MESSAGES_DEFAULT
labels: tuple = DEFAULT_LABELS
maxBodyChars: int = MAX_BODY_CHARS_DEFAULT
includeAttachments: bool = False
maxAttachmentBytes: int = MAX_ATTACHMENT_BYTES_DEFAULT
# Only fetch messages newer than N days. None disables filter.
maxAgeDays: Optional[int] = 90
# Content depth: "metadata" | "snippet" | "full"
mailContentDepth: str = "full"
# Pass-through to IngestionJob.neutralize
neutralize: bool = False
@dataclass
class GmailBootstrapResult:
connectionId: str
indexed: int = 0
skippedDuplicate: int = 0
skippedPolicy: int = 0
failed: int = 0
attachmentsIndexed: int = 0
errors: List[str] = field(default_factory=list)
def _syntheticMessageId(connectionId: str, messageId: str) -> str:
token = hashlib.sha256(f"{connectionId}:{messageId}".encode("utf-8")).hexdigest()[:16]
return f"gm:{connectionId[:8]}:{token}"
def _syntheticAttachmentId(connectionId: str, messageId: str, attachmentId: str) -> str:
token = hashlib.sha256(
f"{connectionId}:{messageId}:{attachmentId}".encode("utf-8")
).hexdigest()[:16]
return f"ga:{connectionId[:8]}:{token}"
def _decodeBase64Url(data: str) -> bytes:
if not data:
return b""
# Gmail uses URL-safe base64 without padding.
padding = 4 - (len(data) % 4)
if padding != 4:
data = data + ("=" * padding)
try:
return base64.urlsafe_b64decode(data)
except Exception:
return b""
def _walkPayloadForBody(payload: Dict[str, Any]) -> Dict[str, str]:
"""Return {"text": ..., "html": ...} by walking MIME parts.
Gmail `payload` is a tree of parts. We prefer `text/plain` for the cleaned
body, but capture `text/html` as a fallback so `cleanEmailBody` can strip
markup if plain is missing.
"""
found: Dict[str, str] = {"text": "", "html": ""}
def _walk(part: Dict[str, Any]) -> None:
mime = (part.get("mimeType") or "").lower()
body = part.get("body") or {}
raw = body.get("data") or ""
if raw and mime.startswith("text/"):
decoded = _decodeBase64Url(raw).decode("utf-8", errors="replace")
key = "text" if mime == "text/plain" else ("html" if mime == "text/html" else "")
if key and not found[key]:
found[key] = decoded
for sub in part.get("parts") or []:
_walk(sub)
_walk(payload or {})
return found
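# Illustrative (made-up) payload shape this walker handles: a multipart/alternative
# message whose text/plain part is preferred over the text/html fallback.
#   {"mimeType": "multipart/alternative", "parts": [
#       {"mimeType": "text/plain", "body": {"data": "<base64url>"}},
#       {"mimeType": "text/html",  "body": {"data": "<base64url>"}}]}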
def _headerMap(payload: Dict[str, Any]) -> Dict[str, str]:
return {
(h.get("name") or "").lower(): (h.get("value") or "")
for h in (payload.get("headers") or [])
}
def _buildContentObjects(
message: Dict[str, Any],
maxBodyChars: int,
mailContentDepth: str = "full",
) -> List[Dict[str, Any]]:
"""Build content objects for a Gmail message.
`mailContentDepth` controls how much is embedded:
- "metadata": header only (subject, from, to, date)
- "snippet": header + Gmail snippet (~155 chars, no full body)
- "full": header + snippet + cleaned full body (default)
"""
payload = message.get("payload") or {}
headers = _headerMap(payload)
subject = headers.get("subject") or "(no subject)"
fromAddr = headers.get("from") or ""
toAddr = headers.get("to") or ""
ccAddr = headers.get("cc") or ""
date = headers.get("date") or ""
snippet = message.get("snippet") or ""
parts: List[Dict[str, Any]] = []
header = (
f"Subject: {subject}\n"
f"From: {fromAddr}\n"
f"To: {toAddr}\n"
+ (f"Cc: {ccAddr}\n" if ccAddr else "")
+ f"Date: {date}"
)
parts.append({
"contentObjectId": "header",
"contentType": "text",
"data": header,
"contextRef": {"part": "header"},
})
if mailContentDepth in ("snippet", "full") and snippet:
parts.append({
"contentObjectId": "snippet",
"contentType": "text",
"data": snippet,
"contextRef": {"part": "snippet"},
})
if mailContentDepth == "full":
bodies = _walkPayloadForBody(payload)
rawBody = bodies["text"] or bodies["html"]
cleanedBody = cleanEmailBody(rawBody, maxChars=maxBodyChars) if rawBody else ""
if cleanedBody:
parts.append({
"contentObjectId": "body",
"contentType": "text",
"data": cleanedBody,
"contextRef": {"part": "body"},
})
return parts
async def bootstrapGmail(
connectionId: str,
*,
progressCb: Optional[Callable[[int, Optional[str]], None]] = None,
adapter: Any = None,
connection: Any = None,
knowledgeService: Any = None,
limits: Optional[GmailBootstrapLimits] = None,
googleGetFn: Optional[Callable[..., Any]] = None,
) -> Dict[str, Any]:
"""Enumerate Gmail labels (INBOX + SENT default) and ingest messages."""
from modules.serviceCenter.services.serviceKnowledge.subConnectorPrefs import loadConnectionPrefs
prefs = loadConnectionPrefs(connectionId)
if not limits:
limits = GmailBootstrapLimits(
includeAttachments=prefs.mailIndexAttachments,
maxAgeDays=prefs.maxAgeDays if prefs.maxAgeDays > 0 else None,
mailContentDepth=prefs.mailContentDepth,
neutralize=prefs.neutralizeBeforeEmbed,
)
startMs = time.time()
result = GmailBootstrapResult(connectionId=connectionId)
logger.info(
"ingestion.connection.bootstrap.started part=gmail connectionId=%s",
connectionId,
extra={
"event": "ingestion.connection.bootstrap.started",
"part": "gmail",
"connectionId": connectionId,
},
)
if adapter is None or knowledgeService is None or connection is None:
adapter, connection, knowledgeService = await _resolveDependencies(connectionId)
if googleGetFn is None:
from modules.connectors.providerGoogle.connectorGoogle import _googleGet as _defaultGet
token = getattr(adapter, "_token", "")
async def googleGetFn(url: str) -> Dict[str, Any]: # type: ignore[no-redef]
return await _defaultGet(token, url)
mandateId = str(getattr(connection, "mandateId", "") or "") if connection is not None else ""
userId = str(getattr(connection, "userId", "") or "") if connection is not None else ""
for labelId in limits.labels:
if result.indexed + result.skippedDuplicate >= limits.maxMessages:
break
try:
await _ingestLabel(
googleGetFn=googleGetFn,
knowledgeService=knowledgeService,
connectionId=connectionId,
mandateId=mandateId,
userId=userId,
labelId=labelId,
limits=limits,
result=result,
progressCb=progressCb,
)
except Exception as exc:
logger.error("gmail ingestion label %s failed: %s", labelId, exc, exc_info=True)
result.errors.append(f"label({labelId}): {exc}")
return _finalizeResult(connectionId, result, startMs)
async def _resolveDependencies(connectionId: str):
from modules.interfaces.interfaceDbApp import getRootInterface
from modules.auth import TokenManager
from modules.connectors.providerGoogle.connectorGoogle import GoogleConnector
from modules.serviceCenter import getService
from modules.serviceCenter.context import ServiceCenterContext
from modules.security.rootAccess import getRootUser
rootInterface = getRootInterface()
connection = rootInterface.getUserConnectionById(connectionId)
if connection is None:
raise ValueError(f"UserConnection not found: {connectionId}")
token = TokenManager().getFreshToken(connectionId)
if not token or not token.tokenAccess:
raise ValueError(f"No valid token for connection {connectionId}")
provider = GoogleConnector(connection, token.tokenAccess)
adapter = provider.getServiceAdapter("gmail")
rootUser = getRootUser()
ctx = ServiceCenterContext(
user=rootUser,
mandate_id=str(getattr(connection, "mandateId", "") or ""),
)
knowledgeService = getService("knowledge", ctx)
return adapter, connection, knowledgeService
async def _ingestLabel(
*,
googleGetFn,
knowledgeService,
connectionId: str,
mandateId: str,
userId: str,
labelId: str,
limits: GmailBootstrapLimits,
result: GmailBootstrapResult,
progressCb: Optional[Callable[[int, Optional[str]], None]],
) -> None:
remaining = limits.maxMessages - (result.indexed + result.skippedDuplicate)
if remaining <= 0:
return
pageSize = min(100, remaining)
query = ""
if limits.maxAgeDays:
cutoff = datetime.now(timezone.utc) - timedelta(days=limits.maxAgeDays)
# Gmail uses YYYY/MM/DD.
query = f"after:{cutoff.strftime('%Y/%m/%d')}"
baseUrl = (
"https://gmail.googleapis.com/gmail/v1/users/me/messages"
f"?labelIds={labelId}&maxResults={pageSize}"
)
if query:
baseUrl = f"{baseUrl}&q={query}"
nextPageToken: Optional[str] = None
while (result.indexed + result.skippedDuplicate) < limits.maxMessages:
url = baseUrl if not nextPageToken else f"{baseUrl}&pageToken={nextPageToken}"
page = await googleGetFn(url)
if not isinstance(page, dict) or "error" in page:
err = (page or {}).get("error") if isinstance(page, dict) else "unknown"
logger.warning("gmail list page error for label %s: %s", labelId, err)
result.errors.append(f"list({labelId}): {err}")
return
messageStubs = page.get("messages") or []
for stub in messageStubs:
if result.indexed + result.skippedDuplicate >= limits.maxMessages:
break
msgId = stub.get("id")
if not msgId:
continue
detailUrl = (
f"https://gmail.googleapis.com/gmail/v1/users/me/messages/{msgId}?format=full"
)
detail = await googleGetFn(detailUrl)
if not isinstance(detail, dict) or "error" in detail:
result.failed += 1
continue
await _ingestMessage(
googleGetFn=googleGetFn,
knowledgeService=knowledgeService,
connectionId=connectionId,
mandateId=mandateId,
userId=userId,
labelId=labelId,
message=detail,
limits=limits,
result=result,
progressCb=progressCb,
)
nextPageToken = page.get("nextPageToken")
if not nextPageToken:
break
async def _ingestMessage(
*,
googleGetFn,
knowledgeService,
connectionId: str,
mandateId: str,
userId: str,
labelId: str,
message: Dict[str, Any],
limits: GmailBootstrapLimits,
result: GmailBootstrapResult,
progressCb: Optional[Callable[[int, Optional[str]], None]],
) -> None:
from modules.serviceCenter.services.serviceKnowledge.mainServiceKnowledge import IngestionJob
messageId = message.get("id")
if not messageId:
result.skippedPolicy += 1
return
revision = message.get("historyId") or message.get("internalDate")
headers = _headerMap(message.get("payload") or {})
subject = headers.get("subject") or "(no subject)"
syntheticId = _syntheticMessageId(connectionId, messageId)
fileName = f"{subject[:80].strip()}.eml" if subject else f"{messageId}.eml"
contentObjects = _buildContentObjects(
message, limits.maxBodyChars, mailContentDepth=limits.mailContentDepth
)
try:
handle = await knowledgeService.requestIngestion(
IngestionJob(
sourceKind="gmail_message",
sourceId=syntheticId,
fileName=fileName,
mimeType="message/rfc822",
userId=userId,
mandateId=mandateId,
contentObjects=contentObjects,
contentVersion=str(revision) if revision else None,
neutralize=limits.neutralize,
provenance={
"connectionId": connectionId,
"authority": "google",
"service": "gmail",
"externalItemId": messageId,
"label": labelId,
"threadId": message.get("threadId"),
"tier": limits.mailContentDepth,
},
)
)
except Exception as exc:
logger.error("gmail ingestion %s failed: %s", messageId, exc, exc_info=True)
result.failed += 1
result.errors.append(f"ingest({messageId}): {exc}")
return
if handle.status == "duplicate":
result.skippedDuplicate += 1
elif handle.status == "indexed":
result.indexed += 1
else:
result.failed += 1
if limits.includeAttachments:
try:
await _ingestAttachments(
googleGetFn=googleGetFn,
knowledgeService=knowledgeService,
connectionId=connectionId,
mandateId=mandateId,
userId=userId,
message=message,
parentSyntheticId=syntheticId,
limits=limits,
result=result,
)
except Exception as exc:
logger.warning("gmail attachments %s failed: %s", messageId, exc)
result.errors.append(f"attachments({messageId}): {exc}")
if progressCb is not None and (result.indexed + result.skippedDuplicate) % 50 == 0:
processed = result.indexed + result.skippedDuplicate
try:
progressCb(
min(90, 10 + int(80 * processed / max(1, limits.maxMessages))),
f"gmail processed={processed}",
)
except Exception:
pass
logger.info(
"ingestion.connection.bootstrap.progress part=gmail processed=%d skippedDup=%d failed=%d",
processed, result.skippedDuplicate, result.failed,
extra={
"event": "ingestion.connection.bootstrap.progress",
"part": "gmail",
"connectionId": connectionId,
"processed": processed,
"skippedDup": result.skippedDuplicate,
"failed": result.failed,
},
)
await asyncio.sleep(0)
async def _ingestAttachments(
*,
googleGetFn,
knowledgeService,
connectionId: str,
mandateId: str,
userId: str,
message: Dict[str, Any],
parentSyntheticId: str,
limits: GmailBootstrapLimits,
result: GmailBootstrapResult,
) -> None:
"""Child ingestion jobs for file attachments. Skips inline images (cid: refs)."""
from modules.serviceCenter.services.serviceKnowledge.mainServiceKnowledge import IngestionJob
from modules.datamodels.datamodelExtraction import ExtractionOptions
from modules.serviceCenter.services.serviceExtraction.subPipeline import runExtraction
from modules.serviceCenter.services.serviceExtraction.subRegistry import (
ExtractorRegistry, ChunkerRegistry,
)
messageId = message.get("id") or ""
def _collectAttachmentStubs(part: Dict[str, Any], acc: List[Dict[str, Any]]) -> None:
filename = part.get("filename") or ""
body = part.get("body") or {}
attId = body.get("attachmentId")
if filename and attId:
acc.append({
"filename": filename,
"mimeType": part.get("mimeType") or "application/octet-stream",
"attachmentId": attId,
"size": int(body.get("size") or 0),
})
for sub in part.get("parts") or []:
_collectAttachmentStubs(sub, acc)
stubs: List[Dict[str, Any]] = []
_collectAttachmentStubs(message.get("payload") or {}, stubs)
if not stubs:
return
extractorRegistry = ExtractorRegistry()
chunkerRegistry = ChunkerRegistry()
for stub in stubs:
if stub["size"] and stub["size"] > limits.maxAttachmentBytes:
result.skippedPolicy += 1
continue
attUrl = (
f"https://gmail.googleapis.com/gmail/v1/users/me/messages/{messageId}"
f"/attachments/{stub['attachmentId']}"
)
detail = await googleGetFn(attUrl)
if not isinstance(detail, dict) or "error" in detail:
result.failed += 1
continue
rawBytes = _decodeBase64Url(detail.get("data") or "")
if not rawBytes:
continue
fileName = stub["filename"]
mimeType = stub["mimeType"]
syntheticId = _syntheticAttachmentId(connectionId, messageId, stub["attachmentId"])
try:
extracted = runExtraction(
extractorRegistry, chunkerRegistry,
rawBytes, fileName, mimeType,
ExtractionOptions(mergeStrategy=None),
)
except Exception as exc:
logger.warning("gmail attachment extract %s failed: %s", stub["attachmentId"], exc)
result.failed += 1
continue
contentObjects: List[Dict[str, Any]] = []
for part in getattr(extracted, "parts", None) or []:
data = getattr(part, "data", None) or ""
if not data or not str(data).strip():
continue
typeGroup = getattr(part, "typeGroup", "text") or "text"
contentType = "text"
if typeGroup == "image":
contentType = "image"
elif typeGroup in ("binary", "container"):
contentType = "other"
contentObjects.append({
"contentObjectId": getattr(part, "id", ""),
"contentType": contentType,
"data": data,
"contextRef": {
"containerPath": fileName,
"location": getattr(part, "label", None) or "attachment",
**(getattr(part, "metadata", None) or {}),
},
})
if not contentObjects:
result.skippedPolicy += 1
continue
try:
await knowledgeService.requestIngestion(
IngestionJob(
sourceKind="gmail_attachment",
sourceId=syntheticId,
fileName=fileName,
mimeType=mimeType,
userId=userId,
mandateId=mandateId,
contentObjects=contentObjects,
provenance={
"connectionId": connectionId,
"authority": "google",
"service": "gmail",
"parentId": parentSyntheticId,
"externalItemId": stub["attachmentId"],
"parentMessageId": messageId,
},
)
)
result.attachmentsIndexed += 1
except Exception as exc:
logger.warning("gmail attachment ingest %s failed: %s", stub["attachmentId"], exc)
result.failed += 1
def _finalizeResult(connectionId: str, result: GmailBootstrapResult, startMs: float) -> Dict[str, Any]:
durationMs = int((time.time() - startMs) * 1000)
logger.info(
"ingestion.connection.bootstrap.done part=gmail connectionId=%s indexed=%d skippedDup=%d skippedPolicy=%d attachments=%d failed=%d durationMs=%d",
connectionId,
result.indexed, result.skippedDuplicate, result.skippedPolicy,
result.attachmentsIndexed, result.failed, durationMs,
extra={
"event": "ingestion.connection.bootstrap.done",
"part": "gmail",
"connectionId": connectionId,
"indexed": result.indexed,
"skippedDup": result.skippedDuplicate,
"skippedPolicy": result.skippedPolicy,
"attachmentsIndexed": result.attachmentsIndexed,
"failed": result.failed,
"durationMs": durationMs,
},
)
return {
"connectionId": result.connectionId,
"indexed": result.indexed,
"skippedDuplicate": result.skippedDuplicate,
"skippedPolicy": result.skippedPolicy,
"attachmentsIndexed": result.attachmentsIndexed,
"failed": result.failed,
"durationMs": durationMs,
"errors": result.errors[:20],
}
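
Gmail returns body and attachment bytes as URL-safe base64 without padding; `_decodeBase64Url` simply re-pads before decoding. A standalone sketch with a made-up payload:

import base64

data = "SGVsbG8gR21haWw"  # made-up, unpadded URL-safe base64 of b"Hello Gmail"
padding = 4 - (len(data) % 4)
if padding != 4:
    data = data + ("=" * padding)
print(base64.urlsafe_b64decode(data))  # b'Hello Gmail'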

View file

@ -0,0 +1,576 @@
# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
"""Outlook bootstrap for the unified knowledge ingestion lane.
Unlike SharePoint, Outlook messages are "virtual documents" we never persist
file bytes in the store. Each message becomes a `sourceKind="outlook_message"`
IngestionJob whose `contentObjects` carry the header, snippet and cleaned body
so retrieval can show a compact answer without fetching Graph again.
Attachments are optional (`includeAttachments` limit flag) and enqueued as
child jobs with `sourceKind="outlook_attachment"` + `provenance.parentId`.
"""
from __future__ import annotations
import asyncio
import hashlib
import logging
import time
from dataclasses import dataclass, field
from typing import Any, Callable, Dict, List, Optional
from modules.serviceCenter.services.serviceKnowledge.subTextClean import cleanEmailBody
logger = logging.getLogger(__name__)
MAX_MESSAGES_DEFAULT = 500
MAX_FOLDERS_DEFAULT = 5
MAX_BODY_CHARS_DEFAULT = 8000
MAX_ATTACHMENT_BYTES_DEFAULT = 10 * 1024 * 1024
WELL_KNOWN_FOLDERS = ("inbox", "sentitems")
@dataclass
class OutlookBootstrapLimits:
maxMessages: int = MAX_MESSAGES_DEFAULT
maxFolders: int = MAX_FOLDERS_DEFAULT
maxBodyChars: int = MAX_BODY_CHARS_DEFAULT
includeAttachments: bool = False
maxAttachmentBytes: int = MAX_ATTACHMENT_BYTES_DEFAULT
# Only fetch messages newer than N days. None disables filter.
maxAgeDays: Optional[int] = 90
# Content depth: "metadata" | "snippet" | "full"
mailContentDepth: str = "full"
# Pass-through to IngestionJob.neutralize
neutralize: bool = False
@dataclass
class OutlookBootstrapResult:
connectionId: str
indexed: int = 0
skippedDuplicate: int = 0
skippedPolicy: int = 0
failed: int = 0
attachmentsIndexed: int = 0
errors: List[str] = field(default_factory=list)
def _syntheticMessageId(connectionId: str, messageId: str) -> str:
token = hashlib.sha256(f"{connectionId}:{messageId}".encode("utf-8")).hexdigest()[:16]
return f"om:{connectionId[:8]}:{token}"
def _syntheticAttachmentId(connectionId: str, messageId: str, attachmentId: str) -> str:
token = hashlib.sha256(
f"{connectionId}:{messageId}:{attachmentId}".encode("utf-8")
).hexdigest()[:16]
return f"oa:{connectionId[:8]}:{token}"
def _extractRecipient(recipient: Dict[str, Any]) -> str:
email = (recipient or {}).get("emailAddress") or {}
name = email.get("name") or ""
addr = email.get("address") or ""
if name and addr:
return f"{name} <{addr}>"
return addr or name
def _joinRecipients(recipients: List[Dict[str, Any]]) -> str:
return ", ".join(filter(None, [_extractRecipient(r) for r in recipients or []]))
def _buildContentObjects(
message: Dict[str, Any],
maxBodyChars: int,
mailContentDepth: str = "full",
) -> List[Dict[str, Any]]:
"""Build content objects for an Outlook message.
`mailContentDepth` mirrors the Gmail walker:
- "metadata": header only
- "snippet": header + bodyPreview (~255 chars)
- "full": header + snippet + cleaned body (default)
"""
subject = message.get("subject") or "(no subject)"
fromAddr = _extractRecipient(message.get("from") or {})
toAddr = _joinRecipients(message.get("toRecipients") or [])
ccAddr = _joinRecipients(message.get("ccRecipients") or [])
received = message.get("receivedDateTime") or ""
snippet = message.get("bodyPreview") or ""
parts: List[Dict[str, Any]] = []
header = (
f"Subject: {subject}\n"
f"From: {fromAddr}\n"
f"To: {toAddr}\n"
+ (f"Cc: {ccAddr}\n" if ccAddr else "")
+ f"Date: {received}"
)
parts.append({
"contentObjectId": "header",
"contentType": "text",
"data": header,
"contextRef": {"part": "header"},
})
if mailContentDepth in ("snippet", "full") and snippet:
parts.append({
"contentObjectId": "snippet",
"contentType": "text",
"data": snippet,
"contextRef": {"part": "snippet"},
})
if mailContentDepth == "full":
body = message.get("body") or {}
bodyContent = body.get("content") or ""
cleanedBody = cleanEmailBody(bodyContent, maxChars=maxBodyChars) if bodyContent else ""
if cleanedBody:
parts.append({
"contentObjectId": "body",
"contentType": "text",
"data": cleanedBody,
"contextRef": {"part": "body"},
})
return parts
async def bootstrapOutlook(
connectionId: str,
*,
progressCb: Optional[Callable[[int, Optional[str]], None]] = None,
adapter: Any = None,
connection: Any = None,
knowledgeService: Any = None,
limits: Optional[OutlookBootstrapLimits] = None,
) -> Dict[str, Any]:
"""Enumerate Outlook folders (inbox + sent by default) and ingest messages."""
from modules.serviceCenter.services.serviceKnowledge.subConnectorPrefs import loadConnectionPrefs
prefs = loadConnectionPrefs(connectionId)
if not limits:
limits = OutlookBootstrapLimits(
includeAttachments=prefs.mailIndexAttachments,
maxAgeDays=prefs.maxAgeDays if prefs.maxAgeDays > 0 else None,
mailContentDepth=prefs.mailContentDepth,
neutralize=prefs.neutralizeBeforeEmbed,
)
startMs = time.time()
result = OutlookBootstrapResult(connectionId=connectionId)
logger.info(
"ingestion.connection.bootstrap.started part=outlook connectionId=%s",
connectionId,
extra={
"event": "ingestion.connection.bootstrap.started",
"part": "outlook",
"connectionId": connectionId,
},
)
if adapter is None or knowledgeService is None or connection is None:
adapter, connection, knowledgeService = await _resolveDependencies(connectionId)
mandateId = str(getattr(connection, "mandateId", "") or "") if connection is not None else ""
userId = str(getattr(connection, "userId", "") or "") if connection is not None else ""
folderIds = await _selectFolderIds(adapter, limits)
for folderId in folderIds:
if result.indexed + result.skippedDuplicate >= limits.maxMessages:
break
try:
await _ingestFolder(
adapter=adapter,
knowledgeService=knowledgeService,
connectionId=connectionId,
mandateId=mandateId,
userId=userId,
folderId=folderId,
limits=limits,
result=result,
progressCb=progressCb,
)
except Exception as exc:
logger.error("outlook ingestion folder %s failed: %s", folderId, exc, exc_info=True)
result.errors.append(f"folder({folderId}): {exc}")
return _finalizeResult(connectionId, result, startMs)
async def _resolveDependencies(connectionId: str):
from modules.interfaces.interfaceDbApp import getRootInterface
from modules.auth import TokenManager
from modules.connectors.providerMsft.connectorMsft import MsftConnector
from modules.serviceCenter import getService
from modules.serviceCenter.context import ServiceCenterContext
from modules.security.rootAccess import getRootUser
rootInterface = getRootInterface()
connection = rootInterface.getUserConnectionById(connectionId)
if connection is None:
raise ValueError(f"UserConnection not found: {connectionId}")
token = TokenManager().getFreshToken(connectionId)
if not token or not token.tokenAccess:
raise ValueError(f"No valid token for connection {connectionId}")
provider = MsftConnector(connection, token.tokenAccess)
adapter = provider.getServiceAdapter("outlook")
rootUser = getRootUser()
ctx = ServiceCenterContext(
user=rootUser,
mandate_id=str(getattr(connection, "mandateId", "") or ""),
)
knowledgeService = getService("knowledge", ctx)
return adapter, connection, knowledgeService
async def _selectFolderIds(adapter, limits: OutlookBootstrapLimits) -> List[str]:
"""Prefer well-known folders (inbox, sentitems); fall back to browse()."""
folderIds: List[str] = []
for wellKnown in WELL_KNOWN_FOLDERS:
if len(folderIds) >= limits.maxFolders:
break
try:
row = await adapter._graphGet(f"me/mailFolders/{wellKnown}")
except Exception:
row = None
if isinstance(row, dict) and "error" not in row and row.get("id"):
folderIds.append(row["id"])
if len(folderIds) < limits.maxFolders:
try:
entries = await adapter.browse("/")
except Exception:
entries = []
for entry in entries:
metadata = getattr(entry, "metadata", {}) or {}
fid = metadata.get("id")
if fid and fid not in folderIds:
folderIds.append(fid)
if len(folderIds) >= limits.maxFolders:
break
return folderIds
async def _ingestFolder(
*,
adapter,
knowledgeService,
connectionId: str,
mandateId: str,
userId: str,
folderId: str,
limits: OutlookBootstrapLimits,
result: OutlookBootstrapResult,
progressCb: Optional[Callable[[int, Optional[str]], None]],
) -> None:
remaining = limits.maxMessages - (result.indexed + result.skippedDuplicate)
if remaining <= 0:
return
pageSize = min(100, remaining)
select = (
"id,subject,from,toRecipients,ccRecipients,receivedDateTime,"
"bodyPreview,body,internetMessageId,hasAttachments,changeKey"
)
endpoint: Optional[str] = (
f"me/mailFolders/{folderId}/messages"
f"?$top={pageSize}&$orderby=receivedDateTime desc&$select={select}"
)
# Apply the age filter in the Graph query itself so we never download old
# messages that would only be discarded client-side.
if limits.maxAgeDays:
from datetime import datetime, timezone, timedelta
cutoff = datetime.now(timezone.utc) - timedelta(days=limits.maxAgeDays)
cutoffIso = cutoff.strftime("%Y-%m-%dT%H:%M:%SZ")
endpoint = f"{endpoint}&$filter=receivedDateTime ge {cutoffIso}"
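# With the defaults above and maxAgeDays set, the first relative endpoint looks
# roughly like this (values illustrative, $select abbreviated):
#   me/mailFolders/{folderId}/messages?$top=100&$orderby=receivedDateTime desc
#     &$select=id,subject,...&$filter=receivedDateTime ge 2025-01-01T00:00:00Z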
while endpoint and (result.indexed + result.skippedDuplicate) < limits.maxMessages:
try:
page = await adapter._graphGet(endpoint)
except Exception as exc:
logger.warning("outlook graph page failed for folder %s: %s", folderId, exc)
result.errors.append(f"graph({folderId}): {exc}")
return
if not isinstance(page, dict) or "error" in page:
err = (page or {}).get("error") if isinstance(page, dict) else "unknown"
logger.warning("outlook graph page error for folder %s: %s", folderId, err)
result.errors.append(f"graph({folderId}): {err}")
return
for message in page.get("value", []) or []:
if result.indexed + result.skippedDuplicate >= limits.maxMessages:
break
await _ingestMessage(
adapter=adapter,
knowledgeService=knowledgeService,
connectionId=connectionId,
mandateId=mandateId,
userId=userId,
message=message,
limits=limits,
result=result,
progressCb=progressCb,
)
nextLink = page.get("@odata.nextLink")
if not nextLink:
break
# Strip Graph base so adapter._graphGet accepts the relative path.
from modules.connectors.providerMsft.connectorMsft import _stripGraphBase
endpoint = _stripGraphBase(nextLink)
async def _ingestMessage(
*,
adapter,
knowledgeService,
connectionId: str,
mandateId: str,
userId: str,
message: Dict[str, Any],
limits: OutlookBootstrapLimits,
result: OutlookBootstrapResult,
progressCb: Optional[Callable[[int, Optional[str]], None]],
) -> None:
from modules.serviceCenter.services.serviceKnowledge.mainServiceKnowledge import IngestionJob
messageId = message.get("id")
if not messageId:
result.skippedPolicy += 1
return
revision = message.get("changeKey") or message.get("internetMessageId")
subject = message.get("subject") or "(no subject)"
syntheticId = _syntheticMessageId(connectionId, messageId)
fileName = f"{subject[:80].strip()}.eml" if subject else f"{messageId}.eml"
contentObjects = _buildContentObjects(
message, limits.maxBodyChars, mailContentDepth=limits.mailContentDepth
)
# Always at least the header is emitted, so `contentObjects` is non-empty.
try:
handle = await knowledgeService.requestIngestion(
IngestionJob(
sourceKind="outlook_message",
sourceId=syntheticId,
fileName=fileName,
mimeType="message/rfc822",
userId=userId,
mandateId=mandateId,
contentObjects=contentObjects,
contentVersion=revision,
neutralize=limits.neutralize,
provenance={
"connectionId": connectionId,
"authority": "msft",
"service": "outlook",
"externalItemId": messageId,
"internetMessageId": message.get("internetMessageId"),
"tier": limits.mailContentDepth,
},
)
)
except Exception as exc:
logger.error("outlook ingestion %s failed: %s", messageId, exc, exc_info=True)
result.failed += 1
result.errors.append(f"ingest({messageId}): {exc}")
return
if handle.status == "duplicate":
result.skippedDuplicate += 1
elif handle.status == "indexed":
result.indexed += 1
else:
result.failed += 1
if limits.includeAttachments and message.get("hasAttachments"):
try:
await _ingestAttachments(
adapter=adapter,
knowledgeService=knowledgeService,
connectionId=connectionId,
mandateId=mandateId,
userId=userId,
messageId=messageId,
parentSyntheticId=syntheticId,
limits=limits,
result=result,
)
except Exception as exc:
logger.warning("outlook attachments %s failed: %s", messageId, exc)
result.errors.append(f"attachments({messageId}): {exc}")
if progressCb is not None and (result.indexed + result.skippedDuplicate) % 50 == 0:
processed = result.indexed + result.skippedDuplicate
try:
progressCb(
min(90, 10 + int(80 * processed / max(1, limits.maxMessages))),
f"outlook processed={processed}",
)
except Exception:
pass
logger.info(
"ingestion.connection.bootstrap.progress part=outlook processed=%d skippedDup=%d failed=%d",
processed, result.skippedDuplicate, result.failed,
extra={
"event": "ingestion.connection.bootstrap.progress",
"part": "outlook",
"connectionId": connectionId,
"processed": processed,
"skippedDup": result.skippedDuplicate,
"failed": result.failed,
},
)
await asyncio.sleep(0)
async def _ingestAttachments(
*,
adapter,
knowledgeService,
connectionId: str,
mandateId: str,
userId: str,
messageId: str,
parentSyntheticId: str,
limits: OutlookBootstrapLimits,
result: OutlookBootstrapResult,
) -> None:
"""Child ingestion jobs for file attachments (skip inline & oversized)."""
from modules.serviceCenter.services.serviceKnowledge.mainServiceKnowledge import IngestionJob
from modules.datamodels.datamodelExtraction import ExtractionOptions
from modules.serviceCenter.services.serviceExtraction.subPipeline import runExtraction
from modules.serviceCenter.services.serviceExtraction.subRegistry import (
ExtractorRegistry, ChunkerRegistry,
)
import base64
page = await adapter._graphGet(f"me/messages/{messageId}/attachments")
if not isinstance(page, dict) or "error" in page:
return
extractorRegistry = ExtractorRegistry()
chunkerRegistry = ChunkerRegistry()
for attachment in page.get("value", []) or []:
if attachment.get("@odata.type") != "#microsoft.graph.fileAttachment":
continue
if attachment.get("isInline"):
continue
size = int(attachment.get("size") or 0)
if size and size > limits.maxAttachmentBytes:
result.skippedPolicy += 1
continue
contentBytesB64 = attachment.get("contentBytes")
if not contentBytesB64:
continue
try:
rawBytes = base64.b64decode(contentBytesB64)
except Exception:
result.skippedPolicy += 1
continue
fileName = attachment.get("name") or "attachment"
mimeType = attachment.get("contentType") or "application/octet-stream"
attachmentId = attachment.get("id") or fileName
syntheticId = _syntheticAttachmentId(connectionId, messageId, attachmentId)
try:
extracted = runExtraction(
extractorRegistry, chunkerRegistry,
rawBytes, fileName, mimeType,
ExtractionOptions(mergeStrategy=None),
)
except Exception as exc:
logger.warning("outlook attachment extract %s failed: %s", attachmentId, exc)
result.failed += 1
continue
contentObjects: List[Dict[str, Any]] = []
for part in getattr(extracted, "parts", None) or []:
data = getattr(part, "data", None) or ""
if not data or not str(data).strip():
continue
typeGroup = getattr(part, "typeGroup", "text") or "text"
contentType = "text"
if typeGroup == "image":
contentType = "image"
elif typeGroup in ("binary", "container"):
contentType = "other"
contentObjects.append({
"contentObjectId": getattr(part, "id", ""),
"contentType": contentType,
"data": data,
"contextRef": {
"containerPath": fileName,
"location": getattr(part, "label", None) or "attachment",
**(getattr(part, "metadata", None) or {}),
},
})
if not contentObjects:
result.skippedPolicy += 1
continue
try:
await knowledgeService.requestIngestion(
IngestionJob(
sourceKind="outlook_attachment",
sourceId=syntheticId,
fileName=fileName,
mimeType=mimeType,
userId=userId,
mandateId=mandateId,
contentObjects=contentObjects,
neutralize=limits.neutralize,
provenance={
"connectionId": connectionId,
"authority": "msft",
"service": "outlook",
"parentId": parentSyntheticId,
"externalItemId": attachmentId,
"parentMessageId": messageId,
},
)
)
result.attachmentsIndexed += 1
except Exception as exc:
logger.warning("outlook attachment ingest %s failed: %s", attachmentId, exc)
result.failed += 1
def _finalizeResult(connectionId: str, result: OutlookBootstrapResult, startMs: float) -> Dict[str, Any]:
durationMs = int((time.time() - startMs) * 1000)
logger.info(
"ingestion.connection.bootstrap.done part=outlook connectionId=%s indexed=%d skippedDup=%d skippedPolicy=%d attachments=%d failed=%d durationMs=%d",
connectionId,
result.indexed, result.skippedDuplicate, result.skippedPolicy,
result.attachmentsIndexed, result.failed, durationMs,
extra={
"event": "ingestion.connection.bootstrap.done",
"part": "outlook",
"connectionId": connectionId,
"indexed": result.indexed,
"skippedDup": result.skippedDuplicate,
"skippedPolicy": result.skippedPolicy,
"attachmentsIndexed": result.attachmentsIndexed,
"failed": result.failed,
"durationMs": durationMs,
},
)
return {
"connectionId": result.connectionId,
"indexed": result.indexed,
"skippedDuplicate": result.skippedDuplicate,
"skippedPolicy": result.skippedPolicy,
"attachmentsIndexed": result.attachmentsIndexed,
"failed": result.failed,
"durationMs": durationMs,
"errors": result.errors[:20],
}
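# Manual usage sketch (illustrative only; the connection id below is a placeholder
# and this is not part of the production call path).
if __name__ == "__main__":
    import asyncio as _asyncio
    import sys as _sys

    _connId = _sys.argv[1] if len(_sys.argv) > 1 else "demo-connection-id"
    _summary = _asyncio.run(
        bootstrapOutlook(
            _connId,
            limits=OutlookBootstrapLimits(maxMessages=50, mailContentDepth="snippet"),
        )
    )
    print(_summary)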

View file

@ -0,0 +1,433 @@
# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
"""SharePoint bootstrap for the unified knowledge ingestion lane.
Walks the SharePoint drive(s) reachable via a UserConnection, downloads each
file-like item, runs the standard content extraction pipeline and hands the
result to `KnowledgeService.requestIngestion`. Idempotency is provided by the
ingestion façade itself; repeat bootstraps therefore produce
`ingestion.skipped.duplicate` for every unchanged item because we pass the
Graph `eTag` as `contentVersion`.
"""
from __future__ import annotations
import asyncio
import hashlib
import logging
import time
from dataclasses import dataclass, field
from typing import Any, Callable, Dict, List, Optional
from modules.datamodels.datamodelExtraction import ExtractionOptions
logger = logging.getLogger(__name__)
MAX_ITEMS_DEFAULT = 500
MAX_BYTES_DEFAULT = 200 * 1024 * 1024
MAX_FILE_SIZE_DEFAULT = 25 * 1024 * 1024
SKIP_MIME_PREFIXES_DEFAULT = ("video/", "audio/")
MAX_DEPTH_DEFAULT = 4
MAX_SITES_DEFAULT = 3
@dataclass
class SharepointBootstrapLimits:
maxItems: int = MAX_ITEMS_DEFAULT
maxBytes: int = MAX_BYTES_DEFAULT
maxFileSize: int = MAX_FILE_SIZE_DEFAULT
skipMimePrefixes: tuple = SKIP_MIME_PREFIXES_DEFAULT
maxDepth: int = MAX_DEPTH_DEFAULT
maxSites: int = MAX_SITES_DEFAULT
# Pass-through to IngestionJob.neutralize
neutralize: bool = False
@dataclass
class SharepointBootstrapResult:
connectionId: str
indexed: int = 0
skippedDuplicate: int = 0
skippedPolicy: int = 0
failed: int = 0
bytesProcessed: int = 0
errors: List[str] = field(default_factory=list)
def _syntheticFileId(connectionId: str, externalItemId: str) -> str:
"""Deterministic synthetic FileContentIndex id for a SharePoint item.
Stable across bootstraps → idempotency works; independent of file name so
moves/renames don't duplicate chunks.
"""
token = hashlib.sha256(f"{connectionId}:{externalItemId}".encode("utf-8")).hexdigest()[:16]
return f"sp:{connectionId[:8]}:{token}"
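# e.g. _syntheticFileId("conn-1234-abcd", "item-01") (illustrative values) returns
# "sp:conn-123:" followed by the first 16 hex chars of sha256("conn-1234-abcd:item-01").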
def _toContentObjects(extracted, fileName: str) -> List[Dict[str, Any]]:
"""Translate ExtractionResult → content objects accepted by requestIngestion."""
parts = getattr(extracted, "parts", None) or []
out: List[Dict[str, Any]] = []
for part in parts:
data = getattr(part, "data", None) or ""
if not data or not str(data).strip():
continue
typeGroup = getattr(part, "typeGroup", "text") or "text"
contentType = "text"
if typeGroup == "image":
contentType = "image"
elif typeGroup in ("binary", "container"):
contentType = "other"
out.append({
"contentObjectId": getattr(part, "id", ""),
"contentType": contentType,
"data": data,
"contextRef": {
"containerPath": fileName,
"location": getattr(part, "label", None) or "file",
**(getattr(part, "metadata", None) or {}),
},
})
return out
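# Mapping sketch (illustrative): an extraction part with typeGroup="text" becomes
# {"contentObjectId": "<part id>", "contentType": "text", "data": "...",
#  "contextRef": {"containerPath": "<fileName>", "location": "page:1", ...part metadata}};
# typeGroup "image" maps to contentType "image", "binary"/"container" map to "other",
# and empty/whitespace-only parts are dropped.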
async def bootstrapSharepoint(
connectionId: str,
*,
progressCb: Optional[Callable[[int, Optional[str]], None]] = None,
adapter: Any = None,
connection: Any = None,
knowledgeService: Any = None,
limits: Optional[SharepointBootstrapLimits] = None,
runExtractionFn: Optional[Callable[..., Any]] = None,
) -> Dict[str, Any]:
"""Enumerate SharePoint drives and ingest every reachable file via the façade.
Parameters allow injection for tests; production callers pass only
`connectionId` (and optionally a progressCb) and everything else is
resolved against the registered services.
"""
from modules.serviceCenter.services.serviceKnowledge.subConnectorPrefs import loadConnectionPrefs
prefs = loadConnectionPrefs(connectionId)
if not limits:
limits = SharepointBootstrapLimits(neutralize=prefs.neutralizeBeforeEmbed)
startMs = time.time()
result = SharepointBootstrapResult(connectionId=connectionId)
logger.info(
"ingestion.connection.bootstrap.started part=sharepoint connectionId=%s",
connectionId,
extra={
"event": "ingestion.connection.bootstrap.started",
"part": "sharepoint",
"connectionId": connectionId,
},
)
if adapter is None or knowledgeService is None or connection is None:
adapter, connection, knowledgeService = await _resolveDependencies(connectionId)
if runExtractionFn is None:
from modules.serviceCenter.services.serviceExtraction.subPipeline import runExtraction
from modules.serviceCenter.services.serviceExtraction.subRegistry import (
ExtractorRegistry, ChunkerRegistry,
)
extractorRegistry = ExtractorRegistry()
chunkerRegistry = ChunkerRegistry()
def runExtractionFn(bytesData, name, mime, options): # type: ignore[no-redef]
return runExtraction(extractorRegistry, chunkerRegistry, bytesData, name, mime, options)
mandateId = str(getattr(connection, "mandateId", "") or "") if connection is not None else ""
userId = str(getattr(connection, "userId", "") or "") if connection is not None else ""
try:
sites = await adapter.browse("/", limit=limits.maxSites)
except Exception as exc:
logger.error("sharepoint site discovery failed for %s: %s", connectionId, exc, exc_info=True)
result.errors.append(f"site_discovery: {exc}")
return _finalizeResult(connectionId, result, startMs)
for site in sites[: limits.maxSites]:
if result.indexed + result.skippedDuplicate >= limits.maxItems:
break
sitePath = getattr(site, "path", "") or ""
try:
await _walkFolder(
adapter=adapter,
knowledgeService=knowledgeService,
runExtractionFn=runExtractionFn,
connectionId=connectionId,
mandateId=mandateId,
userId=userId,
folderPath=sitePath,
depth=0,
limits=limits,
result=result,
progressCb=progressCb,
)
except Exception as exc:
logger.error("sharepoint walk failed for site %s: %s", sitePath, exc, exc_info=True)
result.errors.append(f"walk({sitePath}): {exc}")
return _finalizeResult(connectionId, result, startMs)
async def _resolveDependencies(connectionId: str):
"""Load connection, instantiate SharepointAdapter, and build a KnowledgeService.
Runs with root privileges: bootstrap is a system operation triggered by an
authenticated user via callback; it must not be gated by a per-user
service-center context.
"""
from modules.interfaces.interfaceDbApp import getRootInterface
from modules.auth import TokenManager
from modules.connectors.providerMsft.connectorMsft import MsftConnector
from modules.serviceCenter import getService
from modules.serviceCenter.context import ServiceCenterContext
from modules.security.rootAccess import getRootUser
rootInterface = getRootInterface()
connection = rootInterface.getUserConnectionById(connectionId)
if connection is None:
raise ValueError(f"UserConnection not found: {connectionId}")
token = TokenManager().getFreshToken(connectionId)
if not token or not token.tokenAccess:
raise ValueError(f"No valid token for connection {connectionId}")
provider = MsftConnector(connection, token.tokenAccess)
adapter = provider.getServiceAdapter("sharepoint")
rootUser = getRootUser()
ctx = ServiceCenterContext(
user=rootUser,
mandate_id=str(getattr(connection, "mandateId", "") or ""),
)
knowledgeService = getService("knowledge", ctx)
return adapter, connection, knowledgeService
async def _walkFolder(
*,
adapter,
knowledgeService,
runExtractionFn,
connectionId: str,
mandateId: str,
userId: str,
folderPath: str,
depth: int,
limits: SharepointBootstrapLimits,
result: SharepointBootstrapResult,
progressCb: Optional[Callable[[int, Optional[str]], None]],
) -> None:
if depth > limits.maxDepth:
return
try:
entries = await adapter.browse(folderPath)
except Exception as exc:
logger.warning("sharepoint browse %s failed: %s", folderPath, exc)
result.errors.append(f"browse({folderPath}): {exc}")
return
for entry in entries:
if result.indexed + result.skippedDuplicate >= limits.maxItems:
return
if result.bytesProcessed >= limits.maxBytes:
return
entryPath = getattr(entry, "path", "") or ""
if getattr(entry, "isFolder", False):
await _walkFolder(
adapter=adapter,
knowledgeService=knowledgeService,
runExtractionFn=runExtractionFn,
connectionId=connectionId,
mandateId=mandateId,
userId=userId,
folderPath=entryPath,
depth=depth + 1,
limits=limits,
result=result,
progressCb=progressCb,
)
continue
mimeType = getattr(entry, "mimeType", None) or "application/octet-stream"
if any(mimeType.startswith(prefix) for prefix in limits.skipMimePrefixes):
result.skippedPolicy += 1
continue
size = int(getattr(entry, "size", 0) or 0)
if size and size > limits.maxFileSize:
result.skippedPolicy += 1
continue
metadata = getattr(entry, "metadata", {}) or {}
externalItemId = metadata.get("id") or entryPath
revision = metadata.get("revision") or metadata.get("lastModifiedDateTime")
await _ingestOne(
adapter=adapter,
knowledgeService=knowledgeService,
runExtractionFn=runExtractionFn,
connectionId=connectionId,
mandateId=mandateId,
userId=userId,
entry=entry,
entryPath=entryPath,
mimeType=mimeType,
externalItemId=externalItemId,
revision=revision,
limits=limits,
result=result,
progressCb=progressCb,
)
async def _ingestOne(
*,
adapter,
knowledgeService,
runExtractionFn,
connectionId: str,
mandateId: str,
userId: str,
entry,
entryPath: str,
mimeType: str,
externalItemId: str,
revision: Optional[str],
limits: SharepointBootstrapLimits,
result: SharepointBootstrapResult,
progressCb: Optional[Callable[[int, Optional[str]], None]],
) -> None:
from modules.serviceCenter.services.serviceKnowledge.mainServiceKnowledge import IngestionJob
syntheticFileId = _syntheticFileId(connectionId, externalItemId)
fileName = getattr(entry, "name", "") or externalItemId
try:
fileBytes = await adapter.download(entryPath)
except Exception as exc:
logger.warning("sharepoint download %s failed: %s", entryPath, exc)
result.failed += 1
result.errors.append(f"download({entryPath}): {exc}")
return
if not fileBytes:
result.failed += 1
return
result.bytesProcessed += len(fileBytes)
try:
extracted = runExtractionFn(
fileBytes, fileName, mimeType,
ExtractionOptions(mergeStrategy=None),
)
except Exception as exc:
logger.warning("sharepoint extraction %s failed: %s", entryPath, exc)
result.failed += 1
result.errors.append(f"extract({entryPath}): {exc}")
return
contentObjects = _toContentObjects(extracted, fileName)
if not contentObjects:
result.skippedPolicy += 1
return
provenance: Dict[str, Any] = {
"connectionId": connectionId,
"authority": "msft",
"service": "sharepoint",
"externalItemId": externalItemId,
"externalPath": entryPath,
"revision": revision,
}
try:
handle = await knowledgeService.requestIngestion(
IngestionJob(
sourceKind="sharepoint_item",
sourceId=syntheticFileId,
fileName=fileName,
mimeType=mimeType,
userId=userId,
mandateId=mandateId,
contentObjects=contentObjects,
contentVersion=revision,
neutralize=limits.neutralize,
provenance=provenance,
)
)
except Exception as exc:
logger.error("sharepoint ingestion %s failed: %s", entryPath, exc, exc_info=True)
result.failed += 1
result.errors.append(f"ingest({entryPath}): {exc}")
return
if handle.status == "duplicate":
result.skippedDuplicate += 1
elif handle.status == "indexed":
result.indexed += 1
else:
result.failed += 1
if handle.error:
result.errors.append(f"ingest({entryPath}): {handle.error}")
if progressCb is not None and (result.indexed + result.skippedDuplicate) % 50 == 0:
processed = result.indexed + result.skippedDuplicate
try:
progressCb(
min(90, 10 + int(80 * processed / max(1, limits.maxItems))),
f"sharepoint processed={processed}",
)
except Exception:
pass
logger.info(
"ingestion.connection.bootstrap.progress part=sharepoint processed=%d skippedDup=%d failed=%d",
processed, result.skippedDuplicate, result.failed,
extra={
"event": "ingestion.connection.bootstrap.progress",
"part": "sharepoint",
"connectionId": connectionId,
"processed": processed,
"skippedDup": result.skippedDuplicate,
"failed": result.failed,
},
)
# Yield so the event loop can interleave other tasks (download/extract are
# CPU-ish and extraction uses sync libs; cooperative scheduling prevents
# starving other workers).
await asyncio.sleep(0)
def _finalizeResult(connectionId: str, result: SharepointBootstrapResult, startMs: float) -> Dict[str, Any]:
durationMs = int((time.time() - startMs) * 1000)
logger.info(
"ingestion.connection.bootstrap.done part=sharepoint connectionId=%s indexed=%d skippedDup=%d skippedPolicy=%d failed=%d durationMs=%d",
connectionId,
result.indexed, result.skippedDuplicate, result.skippedPolicy, result.failed,
durationMs,
extra={
"event": "ingestion.connection.bootstrap.done",
"part": "sharepoint",
"connectionId": connectionId,
"indexed": result.indexed,
"skippedDup": result.skippedDuplicate,
"skippedPolicy": result.skippedPolicy,
"failed": result.failed,
"durationMs": durationMs,
},
)
return {
"connectionId": result.connectionId,
"indexed": result.indexed,
"skippedDuplicate": result.skippedDuplicate,
"skippedPolicy": result.skippedPolicy,
"failed": result.failed,
"bytesProcessed": result.bytesProcessed,
"durationMs": durationMs,
"errors": result.errors[:20],
}
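# Manual usage sketch (illustrative only; the connection id is a placeholder and,
# as the docstring notes, production callers pass only connectionId/progressCb).
if __name__ == "__main__":
    import asyncio as _asyncio

    _summary = _asyncio.run(
        bootstrapSharepoint(
            "demo-connection-id",
            limits=SharepointBootstrapLimits(maxItems=100, maxDepth=2),
        )
    )
    print(_summary)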

View file

@ -0,0 +1,107 @@
# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
"""Text normalisation utilities used by knowledge ingestion.
The email body cleaning logic is intentionally regex-based and works on plain
text after an HTML→text pass so we never store unsanitised HTML/JS in the
knowledge store and retrieval stays robust (no extraneous markup tokens
eating embedding budget).
"""
from __future__ import annotations
import re
from typing import Optional
DEFAULT_MAX_CHARS = 8000
_QUOTE_MARKER_PATTERNS = [
re.compile(r"^\s*(?:On\s.+?\swrote:)\s*$", re.MULTILINE | re.IGNORECASE),
re.compile(r"^\s*(?:Am\s.+?\sschrieb.+?:)\s*$", re.MULTILINE | re.IGNORECASE),
re.compile(r"^\s*-{2,}\s*Original\s*Message\s*-{2,}\s*$", re.MULTILINE | re.IGNORECASE),
re.compile(r"^\s*-{2,}\s*Urspr.+Nachricht\s*-{2,}\s*$", re.MULTILINE | re.IGNORECASE),
re.compile(r"^\s*From:\s+.+$", re.MULTILINE | re.IGNORECASE),
re.compile(r"^\s*Von:\s+.+$", re.MULTILINE | re.IGNORECASE),
re.compile(r"^\s*Sent:\s+.+$", re.MULTILINE | re.IGNORECASE),
re.compile(r"^\s*Gesendet:\s+.+$", re.MULTILINE | re.IGNORECASE),
]
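# Examples of lines the markers above match (illustrative):
#   "On Mon, 1 Jan 2024 at 10:00, Alice <a@x.com> wrote:"
#   "Am 01.01.2024 um 10:00 schrieb Max Muster:"
#   "-- Original Message --" / "-- Ursprüngliche Nachricht --"
#   "From: Alice <a@x.com>" / "Von: ..." / "Sent: ..." / "Gesendet: ..."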
_SIGNATURE_MARKERS = [
re.compile(r"^\s*-{2,}\s*$", re.MULTILINE),
re.compile(r"^\s*—\s*$", re.MULTILINE),
re.compile(r"^\s*Best regards\b.*$", re.MULTILINE | re.IGNORECASE),
re.compile(r"^\s*Kind regards\b.*$", re.MULTILINE | re.IGNORECASE),
re.compile(r"^\s*Mit freundlichen Gr[üu]ßen\b.*$", re.MULTILINE | re.IGNORECASE),
re.compile(r"^\s*Viele Gr[üu]ße\b.*$", re.MULTILINE | re.IGNORECASE),
re.compile(r"^\s*Best,\s*$", re.MULTILINE | re.IGNORECASE),
]
def _htmlToText(html: str) -> str:
"""Prefer BeautifulSoup when available, fall back to regex."""
try:
from bs4 import BeautifulSoup # type: ignore
soup = BeautifulSoup(html, "html.parser")
for tag in soup(["script", "style", "head"]):
tag.decompose()
for br in soup.find_all(["br"]):
br.replace_with("\n")
for p in soup.find_all(["p", "div", "li", "tr"]):
p.append("\n")
text = soup.get_text()
except Exception:
# Minimal fallback: strip tags crudely.
text = re.sub(r"<br\s*/?>", "\n", html, flags=re.IGNORECASE)
text = re.sub(r"</(?:p|div|li|tr)>", "\n", text, flags=re.IGNORECASE)
text = re.sub(r"<[^>]+>", "", text)
# Collapse non-breaking + zero-width whitespace.
text = text.replace("\u00a0", " ").replace("\u200b", "")
return text
def _stripQuotedThread(text: str) -> str:
"""Remove reply-chain content so only the author's own contribution remains."""
earliest = len(text)
for pattern in _QUOTE_MARKER_PATTERNS:
match = pattern.search(text)
if match and match.start() < earliest:
earliest = match.start()
# Drop any block starting with "> " quoted lines (often Gmail/Thunderbird).
quotedBlock = re.search(r"^(?:\s*>.*\n?)+", text, re.MULTILINE)
if quotedBlock and quotedBlock.start() < earliest:
earliest = quotedBlock.start()
return text[:earliest].rstrip()
def _stripSignature(text: str) -> str:
earliest = len(text)
for pattern in _SIGNATURE_MARKERS:
match = pattern.search(text)
if match and match.start() < earliest:
earliest = match.start()
return text[:earliest].rstrip()
def _collapseWhitespace(text: str) -> str:
text = re.sub(r"[ \t]+", " ", text)
text = re.sub(r"\n{3,}", "\n\n", text)
return text.strip()
def cleanEmailBody(html: str, maxChars: Optional[int] = DEFAULT_MAX_CHARS) -> str:
"""Return a compact plain-text view of an email body suitable for embedding.
Steps: HTML→text, remove quoted reply chain, remove signature, collapse
whitespace, truncate to maxChars. Always returns a string (possibly empty).
"""
if not html:
return ""
text = _htmlToText(html) if "<" in html and ">" in html else html
text = _stripQuotedThread(text)
text = _stripSignature(text)
text = _collapseWhitespace(text)
if maxChars and len(text) > maxChars:
text = text[:maxChars].rstrip() + "…"
return text
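# Illustrative self-check (values are placeholders; the real coverage lives in the
# unit tests for this module):
if __name__ == "__main__":
    _sample = (
        "<p>Thanks, will do.</p>"
        "<p>On Mon, 1 Jan 2024 at 10:00, Bob &lt;b@x.com&gt; wrote:</p>"
        "<p>&gt; original question</p>"
    )
    print(repr(cleanEmailBody(_sample)))  # expected to keep only "Thanks, will do."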

View file

@ -0,0 +1,203 @@
#!/usr/bin/env python3
# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
"""Bootstrap ClickUp tests with a fake service + knowledge service.
Verifies:
- Teams → spaces → lists (folderless + folder-based) → tasks traversal.
- Each task produces a `requestIngestion` call with `sourceKind="clickup_task"`
and header + description content-objects.
- `date_updated` is forwarded as contentVersion → idempotency.
- Recency filter drops tasks older than `maxAgeDays`.
- maxWorkspaces / maxListsPerWorkspace / maxTasks caps are respected.
"""
import asyncio
import os
import sys
import time
from types import SimpleNamespace
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "../../.."))
from modules.serviceCenter.services.serviceKnowledge.subConnectorSyncClickup import (
bootstrapClickup,
ClickupBootstrapLimits,
_syntheticTaskId,
)
def _nowMs(offsetDays: int = 0) -> str:
return str(int((time.time() + offsetDays * 86400) * 1000))
class _FakeClickupService:
"""Records API calls; serves a canned 1-team / 1-space / 1-list / 2-task layout."""
def __init__(self, taskCount=2, oldTask=False):
self._taskCount = taskCount
self._oldTask = oldTask # when True, the second task is 400 days old
self.calls = []
async def getAuthorizedTeams(self):
self.calls.append(("getAuthorizedTeams",))
return {"teams": [{"id": "team-1", "name": "Acme"}]}
async def getSpaces(self, team_id: str):
self.calls.append(("getSpaces", team_id))
return {"spaces": [{"id": "space-1", "name": "Engineering"}]}
async def getFolderlessLists(self, space_id: str):
self.calls.append(("getFolderlessLists", space_id))
return {"lists": [{"id": "list-1", "name": "Sprint 1"}]}
async def getFolders(self, space_id: str):
self.calls.append(("getFolders", space_id))
return {"folders": [{"id": "folder-1", "name": "Subproject"}]}
async def getListsInFolder(self, folder_id: str):
self.calls.append(("getListsInFolder", folder_id))
return {"lists": [{"id": "list-2", "name": "Sub-tasks"}]}
async def getTasksInList(self, list_id: str, *, page=0, include_closed=False, subtasks=True):
self.calls.append(("getTasksInList", list_id, page, include_closed))
if page > 0:
return {"tasks": []}
tasks = []
for i in range(self._taskCount):
tid = f"{list_id}-task-{i}"
offsetDays = -400 if (self._oldTask and i == 1) else 0
tasks.append({
"id": tid,
"name": f"Task {i} of {list_id}",
"description": f"Plain description for task {i}",
"text_content": f"Rich content for task {i}",
"status": {"status": "open" if i == 0 else "closed"},
"assignees": [{"username": "alice"}],
"tags": [{"name": "urgent"}],
"date_updated": _nowMs(offsetDays),
"date_created": _nowMs(-1),
"url": f"https://app.clickup.com/t/{tid}",
})
return {"tasks": tasks}
class _FakeKnowledgeService:
def __init__(self, duplicateIds=None):
self.calls = []
self._duplicates = duplicateIds or set()
async def requestIngestion(self, job):
self.calls.append(job)
status = "duplicate" if job.sourceId in self._duplicates else "indexed"
return SimpleNamespace(
jobId=job.sourceId, status=status, contentHash="h",
fileId=job.sourceId, index=None, error=None,
)
def _adapter(svc):
return SimpleNamespace(_svc=svc)
def test_bootstrap_walks_team_space_lists_and_tasks():
svc = _FakeClickupService(taskCount=2)
knowledge = _FakeKnowledgeService()
connection = SimpleNamespace(mandateId="m1", userId="u1")
async def _run():
return await bootstrapClickup(
connectionId="c1",
adapter=_adapter(svc),
connection=connection,
knowledgeService=knowledge,
limits=ClickupBootstrapLimits(maxAgeDays=None),
)
result = asyncio.run(_run())
# 2 lists (folderless list-1 + folder's list-2) × 2 tasks each = 4 tasks
assert result["indexed"] == 4
assert result["workspaces"] == 1
assert result["lists"] == 2
sourceIds = {c.sourceId for c in knowledge.calls}
assert len(sourceIds) == 4
for job in knowledge.calls:
assert job.sourceKind == "clickup_task"
assert job.mimeType == "application/vnd.clickup.task+json"
assert job.mandateId == "m1"
assert job.provenance["connectionId"] == "c1"
assert job.provenance["authority"] == "clickup"
assert job.provenance["teamId"] == "team-1"
assert job.contentVersion # numeric millisecond string
# At least the header content-object is present.
ids = [co["contentObjectId"] for co in job.contentObjects]
assert "header" in ids
def test_bootstrap_reports_duplicates_on_second_run():
svc = _FakeClickupService(taskCount=1)
duplicates = {
_syntheticTaskId("c1", "list-1-task-0"),
_syntheticTaskId("c1", "list-2-task-0"),
}
knowledge = _FakeKnowledgeService(duplicateIds=duplicates)
connection = SimpleNamespace(mandateId="m1", userId="u1")
async def _run():
return await bootstrapClickup(
connectionId="c1",
adapter=_adapter(svc),
connection=connection,
knowledgeService=knowledge,
limits=ClickupBootstrapLimits(maxAgeDays=None),
)
result = asyncio.run(_run())
assert result["indexed"] == 0
assert result["skippedDuplicate"] == 2
def test_bootstrap_skips_tasks_older_than_maxAgeDays():
svc = _FakeClickupService(taskCount=2, oldTask=True)
knowledge = _FakeKnowledgeService()
connection = SimpleNamespace(mandateId="m1", userId="u1")
async def _run():
return await bootstrapClickup(
connectionId="c1",
adapter=_adapter(svc),
connection=connection,
knowledgeService=knowledge,
limits=ClickupBootstrapLimits(maxAgeDays=180),
)
result = asyncio.run(_run())
# 2 lists × (1 recent + 1 skipped old) = 2 indexed + 2 skippedPolicy
assert result["indexed"] == 2
assert result["skippedPolicy"] == 2
def test_bootstrap_maxTasks_caps_ingestion():
svc = _FakeClickupService(taskCount=2)
knowledge = _FakeKnowledgeService()
connection = SimpleNamespace(mandateId="m1", userId="u1")
async def _run():
return await bootstrapClickup(
connectionId="c1",
adapter=_adapter(svc),
connection=connection,
knowledgeService=knowledge,
limits=ClickupBootstrapLimits(maxAgeDays=None, maxTasks=3),
)
result = asyncio.run(_run())
assert result["indexed"] == 3
if __name__ == "__main__":
test_bootstrap_walks_team_space_lists_and_tasks()
test_bootstrap_reports_duplicates_on_second_run()
test_bootstrap_skips_tasks_older_than_maxAgeDays()
test_bootstrap_maxTasks_caps_ingestion()
print("OK — bootstrapClickup tests passed")

View file

@ -0,0 +1,225 @@
#!/usr/bin/env python3
# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
"""Bootstrap Google Drive tests with a fake adapter + knowledge service.
Verifies:
- Drive walk traverses root subfolders, respecting `maxDepth`.
- Every file triggers `requestIngestion` with `sourceKind="gdrive_item"`.
- Duplicate runs (same modifiedTime revision) report `skippedDuplicate`.
- Provenance carries `authority="google"` and the Drive file id.
- Recency filter skips files older than `maxAgeDays`.
"""
import asyncio
import os
import sys
from dataclasses import dataclass
from datetime import datetime, timedelta, timezone
from types import SimpleNamespace
from typing import Any, Dict, List, Optional
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "../../.."))
from modules.serviceCenter.services.serviceKnowledge.subConnectorSyncGdrive import (
bootstrapGdrive,
GdriveBootstrapLimits,
_syntheticFileId,
)
@dataclass
class _ExtEntry:
name: str
path: str
isFolder: bool = False
size: Optional[int] = None
mimeType: Optional[str] = None
metadata: Optional[Dict[str, Any]] = None
def _today_iso(offsetDays: int = 0) -> str:
return (datetime.now(timezone.utc) + timedelta(days=offsetDays)).strftime("%Y-%m-%dT%H:%M:%SZ")
class _FakeDriveAdapter:
"""Minimal DriveAdapter stand-in.
Layout:
"/" (root) → 2 files + 1 folder (sub)
"/sub_id" → 1 file
"""
def __init__(self, recent_only: bool = True):
self.downloaded: List[str] = []
self._recent = _today_iso(0)
self._old = _today_iso(-400)
self._recent_only = recent_only
async def browse(self, path: str, filter=None, limit=None):
if path in ("/", "", "root"):
return [
_ExtEntry(
name="f1.txt", path="/f1", size=20,
mimeType="text/plain",
metadata={"id": "f1", "modifiedTime": self._recent},
),
_ExtEntry(
name="f2.txt", path="/f2", size=20,
mimeType="text/plain",
metadata={"id": "f2", "modifiedTime": self._recent if self._recent_only else self._old},
),
_ExtEntry(
name="Subfolder", path="/sub_id", isFolder=True,
mimeType="application/vnd.google-apps.folder",
metadata={"id": "sub_id", "modifiedTime": self._recent},
),
]
if path == "/sub_id":
return [
_ExtEntry(
name="f3.txt", path="/f3", size=20,
mimeType="text/plain",
metadata={"id": "f3", "modifiedTime": self._recent},
),
]
return []
async def download(self, path: str) -> bytes:
self.downloaded.append(path)
return path.encode("utf-8")
class _FakeKnowledgeService:
def __init__(self, duplicateIds=None):
self.calls: List[SimpleNamespace] = []
self._duplicateIds = duplicateIds or set()
async def requestIngestion(self, job):
self.calls.append(job)
status = "duplicate" if job.sourceId in self._duplicateIds else "indexed"
return SimpleNamespace(
jobId=f"{job.sourceKind}:{job.sourceId}",
status=status, contentHash="h",
fileId=job.sourceId, index=None, error=None,
)
def _fakeRunExtraction(data, name, mime, options):
return SimpleNamespace(
parts=[
SimpleNamespace(
id="p1",
data=data.decode("utf-8") if isinstance(data, bytes) else str(data),
typeGroup="text",
label="page:1",
metadata={"pageIndex": 0},
)
]
)
def test_bootstrap_walks_drive_and_subfolders():
adapter = _FakeDriveAdapter()
knowledge = _FakeKnowledgeService()
connection = SimpleNamespace(mandateId="m1", userId="u1")
async def _run():
return await bootstrapGdrive(
connectionId="c1",
adapter=adapter,
connection=connection,
knowledgeService=knowledge,
runExtractionFn=_fakeRunExtraction,
limits=GdriveBootstrapLimits(maxAgeDays=None),
)
result = asyncio.run(_run())
assert len(knowledge.calls) == 3
sourceIds = {c.sourceId for c in knowledge.calls}
assert sourceIds == {
_syntheticFileId("c1", "f1"),
_syntheticFileId("c1", "f2"),
_syntheticFileId("c1", "f3"),
}
assert result["indexed"] == 3
assert result["skippedDuplicate"] == 0
assert adapter.downloaded == ["/f1", "/f2", "/f3"]
def test_bootstrap_reports_duplicates_on_second_run():
adapter = _FakeDriveAdapter()
duplicateIds = {
_syntheticFileId("c1", "f1"),
_syntheticFileId("c1", "f2"),
_syntheticFileId("c1", "f3"),
}
knowledge = _FakeKnowledgeService(duplicateIds=duplicateIds)
connection = SimpleNamespace(mandateId="m1", userId="u1")
async def _run():
return await bootstrapGdrive(
connectionId="c1",
adapter=adapter,
connection=connection,
knowledgeService=knowledge,
runExtractionFn=_fakeRunExtraction,
limits=GdriveBootstrapLimits(maxAgeDays=None),
)
result = asyncio.run(_run())
assert result["indexed"] == 0
assert result["skippedDuplicate"] == 3
def test_bootstrap_skips_files_older_than_maxAgeDays():
adapter = _FakeDriveAdapter(recent_only=False) # f2 is 400 days old
knowledge = _FakeKnowledgeService()
connection = SimpleNamespace(mandateId="m1", userId="u1")
async def _run():
return await bootstrapGdrive(
connectionId="c1",
adapter=adapter,
connection=connection,
knowledgeService=knowledge,
runExtractionFn=_fakeRunExtraction,
limits=GdriveBootstrapLimits(maxAgeDays=180),
)
result = asyncio.run(_run())
assert result["indexed"] == 2 # f1, f3
assert result["skippedPolicy"] == 1 # f2 filtered out
def test_bootstrap_passes_connection_provenance():
adapter = _FakeDriveAdapter()
knowledge = _FakeKnowledgeService()
connection = SimpleNamespace(mandateId="m1", userId="u1")
async def _run():
return await bootstrapGdrive(
connectionId="c1",
adapter=adapter,
connection=connection,
knowledgeService=knowledge,
runExtractionFn=_fakeRunExtraction,
limits=GdriveBootstrapLimits(maxAgeDays=None),
)
asyncio.run(_run())
for job in knowledge.calls:
assert job.sourceKind == "gdrive_item"
assert job.mandateId == "m1"
assert job.provenance["connectionId"] == "c1"
assert job.provenance["authority"] == "google"
assert job.provenance["service"] == "drive"
assert job.contentVersion # modifiedTime ISO string
if __name__ == "__main__":
test_bootstrap_walks_drive_and_subfolders()
test_bootstrap_reports_duplicates_on_second_run()
test_bootstrap_skips_files_older_than_maxAgeDays()
test_bootstrap_passes_connection_provenance()
print("OK — bootstrapGdrive tests passed")

View file

@ -0,0 +1,240 @@
#!/usr/bin/env python3
# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
"""Bootstrap Gmail tests with a fake googleGet + knowledge service.
Verifies:
- Default labels (INBOX + SENT) are traversed.
- Each message produces a requestIngestion call with sourceKind=gmail_message
and structured contentObjects (header / snippet / body).
- Pagination via `nextPageToken` is followed.
- historyId is forwarded as contentVersion → idempotency.
- MIME body extraction walks nested parts (multipart/alternative).
"""
import asyncio
import base64
import os
import sys
from types import SimpleNamespace
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "../../.."))
from modules.serviceCenter.services.serviceKnowledge.subConnectorSyncGmail import (
bootstrapGmail,
GmailBootstrapLimits,
_syntheticMessageId,
_buildContentObjects,
_walkPayloadForBody,
)
def _b64url(text: str) -> str:
return base64.urlsafe_b64encode(text.encode("utf-8")).decode("ascii").rstrip("=")
def _msg(mid: str, subject: str = "Hi", body: str = "Hello world", historyId: str = "h1"):
return {
"id": mid,
"threadId": f"thread-{mid}",
"historyId": historyId,
"internalDate": "1700000000000",
"snippet": body[:120],
"payload": {
"headers": [
{"name": "Subject", "value": subject},
{"name": "From", "value": "Alice <a@x.com>"},
{"name": "To", "value": "Bob <b@x.com>"},
{"name": "Date", "value": "Tue, 01 Jan 2025 10:00:00 +0000"},
],
"mimeType": "text/plain",
"body": {"data": _b64url(body), "size": len(body)},
"parts": [],
},
}
class _FakeGoogleGet:
"""Records URLs + returns the wired-up page or message response."""
def __init__(self, messages_by_label, paginated_label=None, page2=None):
self._messages = messages_by_label
self._paginated = paginated_label
self._page2 = page2 or []
self._served_first_page = set()
self.requested = []
async def __call__(self, url: str):
self.requested.append(url)
# List page: contains `/users/me/messages?labelIds=...`
if "/users/me/messages?" in url:
for label, msgs in self._messages.items():
if f"labelIds={label}" in url:
if (
label == self._paginated
and label not in self._served_first_page
):
self._served_first_page.add(label)
return {
"messages": [{"id": m["id"]} for m in msgs],
"nextPageToken": "token-2",
}
if label == self._paginated and "pageToken=token-2" in url:
return {
"messages": [{"id": m["id"]} for m in self._page2],
}
return {"messages": [{"id": m["id"]} for m in msgs]}
return {"messages": []}
# Detail fetch: /users/me/messages/{id}?format=full
if "/users/me/messages/" in url and "format=full" in url:
msgId = url.split("/users/me/messages/")[-1].split("?")[0]
for msgs in self._messages.values():
for m in msgs:
if m["id"] == msgId:
return m
for m in self._page2:
if m["id"] == msgId:
return m
return {"error": "not found"}
class _FakeKnowledgeService:
def __init__(self, duplicateIds=None):
self.calls = []
self._duplicates = duplicateIds or set()
async def requestIngestion(self, job):
self.calls.append(job)
status = "duplicate" if job.sourceId in self._duplicates else "indexed"
return SimpleNamespace(
jobId=job.sourceId, status=status, contentHash="h",
fileId=job.sourceId, index=None, error=None,
)
def test_buildContentObjects_emits_header_snippet_body():
parts = _buildContentObjects(_msg("m1", body="Hello\nWorld"), maxBodyChars=8000)
ids = [p["contentObjectId"] for p in parts]
assert ids == ["header", "snippet", "body"]
header = parts[0]["data"]
assert "Subject: Hi" in header
assert "From: Alice <a@x.com>" in header
assert "To: Bob <b@x.com>" in header
def test_walkPayloadForBody_prefers_plain_over_html():
payload = {
"mimeType": "multipart/alternative",
"parts": [
{"mimeType": "text/plain", "body": {"data": _b64url("plain body")}},
{"mimeType": "text/html", "body": {"data": _b64url("<p>html body</p>")}},
],
}
bodies = _walkPayloadForBody(payload)
assert bodies["text"] == "plain body"
assert bodies["html"] == "<p>html body</p>"
def test_walkPayloadForBody_falls_back_to_html():
payload = {
"mimeType": "multipart/alternative",
"parts": [
{"mimeType": "text/html", "body": {"data": _b64url("<p>only html</p>")}},
],
}
bodies = _walkPayloadForBody(payload)
assert bodies["text"] == ""
assert "only html" in bodies["html"]
def test_bootstrap_gmail_indexes_messages_from_inbox_and_sent():
fake_get = _FakeGoogleGet({
"INBOX": [_msg("m1"), _msg("m2")],
"SENT": [_msg("m3")],
})
knowledge = _FakeKnowledgeService()
connection = SimpleNamespace(mandateId="m1", userId="u1")
async def _run():
return await bootstrapGmail(
connectionId="c1",
adapter=SimpleNamespace(_token="t"),
connection=connection,
knowledgeService=knowledge,
limits=GmailBootstrapLimits(maxAgeDays=None),
googleGetFn=fake_get,
)
result = asyncio.run(_run())
assert result["indexed"] == 3
sourceIds = {c.sourceId for c in knowledge.calls}
assert sourceIds == {
_syntheticMessageId("c1", "m1"),
_syntheticMessageId("c1", "m2"),
_syntheticMessageId("c1", "m3"),
}
for job in knowledge.calls:
assert job.sourceKind == "gmail_message"
assert job.mimeType == "message/rfc822"
assert job.provenance["connectionId"] == "c1"
assert job.provenance["authority"] == "google"
assert job.provenance["service"] == "gmail"
assert job.contentVersion == "h1"
assert any(co["contentObjectId"] == "header" for co in job.contentObjects)
def test_bootstrap_gmail_follows_pagination():
fake_get = _FakeGoogleGet(
messages_by_label={"INBOX": [_msg("m1")], "SENT": []},
paginated_label="INBOX",
page2=[_msg("m2"), _msg("m3")],
)
knowledge = _FakeKnowledgeService()
connection = SimpleNamespace(mandateId="m1", userId="u1")
async def _run():
return await bootstrapGmail(
connectionId="c1",
adapter=SimpleNamespace(_token="t"),
connection=connection,
knowledgeService=knowledge,
limits=GmailBootstrapLimits(maxAgeDays=None),
googleGetFn=fake_get,
)
result = asyncio.run(_run())
assert result["indexed"] == 3
def test_bootstrap_gmail_reports_duplicates():
fake_get = _FakeGoogleGet({"INBOX": [_msg("m1"), _msg("m2")], "SENT": []})
duplicates = {
_syntheticMessageId("c1", "m1"),
_syntheticMessageId("c1", "m2"),
}
knowledge = _FakeKnowledgeService(duplicateIds=duplicates)
connection = SimpleNamespace(mandateId="m1", userId="u1")
async def _run():
return await bootstrapGmail(
connectionId="c1",
adapter=SimpleNamespace(_token="t"),
connection=connection,
knowledgeService=knowledge,
limits=GmailBootstrapLimits(maxAgeDays=None),
googleGetFn=fake_get,
)
result = asyncio.run(_run())
assert result["indexed"] == 0
assert result["skippedDuplicate"] == 2
if __name__ == "__main__":
test_buildContentObjects_emits_header_snippet_body()
test_walkPayloadForBody_prefers_plain_over_html()
test_walkPayloadForBody_falls_back_to_html()
test_bootstrap_gmail_indexes_messages_from_inbox_and_sent()
test_bootstrap_gmail_follows_pagination()
test_bootstrap_gmail_reports_duplicates()
print("OK — bootstrapGmail tests passed")

View file

@ -0,0 +1,190 @@
#!/usr/bin/env python3
# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
"""Bootstrap Outlook tests with a fake adapter + knowledge service.
Verifies:
- Well-known folders (inbox, sentitems) are discovered via Graph.
- Each message produces a `requestIngestion` call with sourceKind=outlook_message
and structured contentObjects (header / snippet / body).
- Pagination via `@odata.nextLink` is followed.
- changeKey is forwarded as contentVersion → idempotency.
"""
import asyncio
import os
import sys
from types import SimpleNamespace
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "../../.."))
from modules.serviceCenter.services.serviceKnowledge.subConnectorSyncOutlook import (
bootstrapOutlook,
OutlookBootstrapLimits,
_syntheticMessageId,
_buildContentObjects,
)
class _FakeOutlookAdapter:
def __init__(self, messages_by_folder, paginated_folder=None, page2=None):
self._folders = {"inbox": "INBOX-ID", "sentitems": "SENT-ID"}
self._messages = messages_by_folder
self._paginated_folder = paginated_folder
self._page2 = page2 or []
self.requested_endpoints = []
async def _graphGet(self, endpoint: str):
self.requested_endpoints.append(endpoint)
if endpoint.startswith("me/mailFolders/") and "/messages" not in endpoint:
wellKnown = endpoint.split("/")[-1]
fid = self._folders.get(wellKnown)
if not fid:
return {"error": "not found"}
return {"id": fid, "displayName": wellKnown}
# message page request: e.g. me/mailFolders/INBOX-ID/messages?...
for fid, messages in self._messages.items():
if f"me/mailFolders/{fid}/messages" in endpoint:
page = {"value": messages}
if fid == self._paginated_folder and "skiptoken" not in endpoint:
page["@odata.nextLink"] = (
"https://graph.microsoft.com/v1.0/"
f"me/mailFolders/{fid}/messages?$skiptoken=abc"
)
elif fid == self._paginated_folder and "skiptoken" in endpoint:
page = {"value": self._page2}
return page
return {"value": []}
async def browse(self, path):
return []
class _FakeKnowledgeService:
def __init__(self, duplicateIds=None):
self.calls = []
self._duplicates = duplicateIds or set()
async def requestIngestion(self, job):
self.calls.append(job)
status = "duplicate" if job.sourceId in self._duplicates else "indexed"
return SimpleNamespace(
jobId=job.sourceId, status=status, contentHash="h",
fileId=job.sourceId, index=None, error=None,
)
def _msg(mid: str, subject: str = "Hi", change: str = "ck1"):
return {
"id": mid,
"subject": subject,
"from": {"emailAddress": {"name": "Alice", "address": "a@x.com"}},
"toRecipients": [{"emailAddress": {"name": "Bob", "address": "b@x.com"}}],
"ccRecipients": [],
"receivedDateTime": "2025-01-01T10:00:00Z",
"bodyPreview": "Hello world",
"body": {"contentType": "text", "content": "Hello world\nThis is the body."},
"internetMessageId": f"<{mid}@local>",
"hasAttachments": False,
"changeKey": change,
}
def test_buildContentObjects_emits_header_snippet_body():
parts = _buildContentObjects(_msg("m1"), maxBodyChars=8000)
ids = [p["contentObjectId"] for p in parts]
assert ids == ["header", "snippet", "body"]
header = parts[0]["data"]
assert "Subject: Hi" in header
assert "From: Alice <a@x.com>" in header
assert "To: Bob <b@x.com>" in header
def test_bootstrap_outlook_indexes_messages_from_inbox_and_sent():
adapter = _FakeOutlookAdapter({
"INBOX-ID": [_msg("m1"), _msg("m2")],
"SENT-ID": [_msg("m3")],
})
knowledge = _FakeKnowledgeService()
connection = SimpleNamespace(mandateId="m1", userId="u1")
async def _run():
return await bootstrapOutlook(
connectionId="c1",
adapter=adapter,
connection=connection,
knowledgeService=knowledge,
limits=OutlookBootstrapLimits(maxAgeDays=None),
)
result = asyncio.run(_run())
assert result["indexed"] == 3
sourceIds = {c.sourceId for c in knowledge.calls}
assert sourceIds == {
_syntheticMessageId("c1", "m1"),
_syntheticMessageId("c1", "m2"),
_syntheticMessageId("c1", "m3"),
}
for job in knowledge.calls:
assert job.sourceKind == "outlook_message"
assert job.mimeType == "message/rfc822"
assert job.provenance["connectionId"] == "c1"
assert job.provenance["service"] == "outlook"
assert job.contentVersion == "ck1"
assert any(co["contentObjectId"] == "header" for co in job.contentObjects)
def test_bootstrap_outlook_follows_pagination():
adapter = _FakeOutlookAdapter(
messages_by_folder={"INBOX-ID": [_msg("m1")], "SENT-ID": []},
paginated_folder="INBOX-ID",
page2=[_msg("m2"), _msg("m3")],
)
knowledge = _FakeKnowledgeService()
connection = SimpleNamespace(mandateId="m1", userId="u1")
async def _run():
return await bootstrapOutlook(
connectionId="c1",
adapter=adapter,
connection=connection,
knowledgeService=knowledge,
limits=OutlookBootstrapLimits(maxAgeDays=None),
)
result = asyncio.run(_run())
assert result["indexed"] == 3
def test_bootstrap_outlook_reports_duplicates():
adapter = _FakeOutlookAdapter({
"INBOX-ID": [_msg("m1"), _msg("m2")],
"SENT-ID": [],
})
duplicates = {
_syntheticMessageId("c1", "m1"),
_syntheticMessageId("c1", "m2"),
}
knowledge = _FakeKnowledgeService(duplicateIds=duplicates)
connection = SimpleNamespace(mandateId="m1", userId="u1")
async def _run():
return await bootstrapOutlook(
connectionId="c1",
adapter=adapter,
connection=connection,
knowledgeService=knowledge,
limits=OutlookBootstrapLimits(maxAgeDays=None),
)
result = asyncio.run(_run())
assert result["indexed"] == 0
assert result["skippedDuplicate"] == 2
if __name__ == "__main__":
test_buildContentObjects_emits_header_snippet_body()
test_bootstrap_outlook_indexes_messages_from_inbox_and_sent()
test_bootstrap_outlook_follows_pagination()
test_bootstrap_outlook_reports_duplicates()
print("OK — bootstrapOutlook tests passed")

View file

@ -0,0 +1,209 @@
#!/usr/bin/env python3
# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
"""Bootstrap SharePoint tests with a fake adapter + knowledge service.
Verifies:
- Every discovered file triggers `requestIngestion`.
- Duplicate runs (same eTag revisions) report `skippedDuplicate`.
- Synthetic fileIds are stable across runs so idempotency works end-to-end.
"""
import asyncio
import os
import sys
from dataclasses import dataclass
from types import SimpleNamespace
from typing import Any, Dict, List, Optional
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "../../.."))
from modules.serviceCenter.services.serviceKnowledge.subConnectorSyncSharepoint import (
bootstrapSharepoint,
_syntheticFileId,
)
@dataclass
class _ExtEntry:
name: str
path: str
isFolder: bool = False
size: Optional[int] = None
mimeType: Optional[str] = None
metadata: Optional[Dict[str, Any]] = None
class _FakeSpAdapter:
"""Minimal SharepointAdapter stand-in.
Layout:
"/" → 1 site
"/sites/site-1" → 2 files (f1, f2) + 1 folder (sub)
"/sites/site-1/sub" → 1 file (f3)
"""
def __init__(self):
self.downloaded: List[str] = []
async def browse(self, path: str, filter=None, limit=None):
if path == "/":
return [
_ExtEntry(
name="Site 1",
path="/sites/site-1",
isFolder=True,
metadata={"id": "site-1"},
),
]
if path == "/sites/site-1":
return [
_ExtEntry(
name="f1.txt", path="/sites/site-1/f1.txt",
mimeType="text/plain", size=20,
metadata={"id": "f1", "revision": "etag-f1"},
),
_ExtEntry(
name="f2.txt", path="/sites/site-1/f2.txt",
mimeType="text/plain", size=20,
metadata={"id": "f2", "revision": "etag-f2"},
),
_ExtEntry(
name="sub", path="/sites/site-1/sub",
isFolder=True, metadata={"id": "sub"},
),
]
if path == "/sites/site-1/sub":
return [
_ExtEntry(
name="f3.txt", path="/sites/site-1/sub/f3.txt",
mimeType="text/plain", size=20,
metadata={"id": "f3", "revision": "etag-f3"},
),
]
return []
async def download(self, path: str) -> bytes:
self.downloaded.append(path)
return path.encode("utf-8")
class _FakeKnowledgeService:
"""Records requestIngestion calls and returns the scripted handles."""
def __init__(self, duplicateIds=None):
self.calls: List[SimpleNamespace] = []
self._duplicateIds = duplicateIds or set()
async def requestIngestion(self, job):
self.calls.append(job)
status = "duplicate" if job.sourceId in self._duplicateIds else "indexed"
return SimpleNamespace(
jobId=f"{job.sourceKind}:{job.sourceId}",
status=status,
contentHash="h",
fileId=job.sourceId,
index=None,
error=None,
)
def _fakeRunExtraction(data, name, mime, options):
"""Produce a single synthetic text part so `_toContentObjects` returns one."""
return SimpleNamespace(
parts=[
SimpleNamespace(
id="p1",
data=data.decode("utf-8") if isinstance(data, bytes) else str(data),
typeGroup="text",
label="page:1",
metadata={"pageIndex": 0},
)
]
)
def test_bootstrap_walks_sites_and_subfolders():
adapter = _FakeSpAdapter()
knowledge = _FakeKnowledgeService()
connection = SimpleNamespace(mandateId="m1", userId="u1")
async def _run():
return await bootstrapSharepoint(
connectionId="c1",
adapter=adapter,
connection=connection,
knowledgeService=knowledge,
runExtractionFn=_fakeRunExtraction,
)
result = asyncio.run(_run())
assert len(knowledge.calls) == 3
sourceIds = {c.sourceId for c in knowledge.calls}
assert sourceIds == {
_syntheticFileId("c1", "f1"),
_syntheticFileId("c1", "f2"),
_syntheticFileId("c1", "f3"),
}
assert result["indexed"] == 3
assert result["skippedDuplicate"] == 0
assert adapter.downloaded == [
"/sites/site-1/f1.txt",
"/sites/site-1/f2.txt",
"/sites/site-1/sub/f3.txt",
]
def test_bootstrap_reports_duplicates_on_second_run():
adapter = _FakeSpAdapter()
duplicateIds = {
_syntheticFileId("c1", "f1"),
_syntheticFileId("c1", "f2"),
_syntheticFileId("c1", "f3"),
}
knowledge = _FakeKnowledgeService(duplicateIds=duplicateIds)
connection = SimpleNamespace(mandateId="m1", userId="u1")
async def _run():
return await bootstrapSharepoint(
connectionId="c1",
adapter=adapter,
connection=connection,
knowledgeService=knowledge,
runExtractionFn=_fakeRunExtraction,
)
result = asyncio.run(_run())
assert result["indexed"] == 0
assert result["skippedDuplicate"] == 3
def test_bootstrap_passes_connection_provenance():
adapter = _FakeSpAdapter()
knowledge = _FakeKnowledgeService()
connection = SimpleNamespace(mandateId="m1", userId="u1")
async def _run():
return await bootstrapSharepoint(
connectionId="c1",
adapter=adapter,
connection=connection,
knowledgeService=knowledge,
runExtractionFn=_fakeRunExtraction,
)
asyncio.run(_run())
for job in knowledge.calls:
assert job.sourceKind == "sharepoint_item"
assert job.mandateId == "m1"
assert job.provenance["connectionId"] == "c1"
assert job.provenance["authority"] == "msft"
assert job.provenance["service"] == "sharepoint"
assert job.contentVersion and job.contentVersion.startswith("etag-")
if __name__ == "__main__":
test_bootstrap_walks_sites_and_subfolders()
test_bootstrap_reports_duplicates_on_second_run()
test_bootstrap_passes_connection_provenance()
print("OK — bootstrapSharepoint tests passed")

View file

@ -0,0 +1,110 @@
#!/usr/bin/env python3
# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
"""Unit tests for cleanEmailBody.
Covers: HTML→text normalisation, quoted-reply removal, signature removal,
whitespace collapse and truncation. The utility is used during Outlook
bootstrap; buggy cleaning would leak quoted threads / signatures into every
embedding.
"""
import os
import sys
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "../../.."))
from modules.serviceCenter.services.serviceKnowledge.subTextClean import (
cleanEmailBody,
)
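# For orientation, a minimal sketch of the kind of cleaner these tests target;
# the shape is assumed for illustration and is not the function imported above.
# Steps: drop script/style blocks and tags, cut quoted replies and signatures,
# collapse whitespace, then truncate with a trailing ellipsis.
import re

def _illustrativeCleanEmailBody(body, maxChars: int = 4000) -> str:
    if not body:
        return ""
    text = re.sub(r"(?is)<(script|style)[^>]*>.*?</\1>", " ", body)
    text = re.sub(r"(?s)<[^>]+>", " ", text)
    text = re.split(r"(?im)^on .+ wrote:\s*$|^am .+ schrieb .+:\s*$", text)[0]
    text = re.split(r"(?im)^--\s*$|^mit freundlichen gr", text)[0]
    text = re.sub(r"[ \t]+", " ", text)
    text = re.sub(r"\n{3,}", "\n\n", text).strip()
    return text[:maxChars] + "…" if len(text) > maxChars else text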
def test_strips_html_tags_and_scripts():
html = (
"<html><head><style>body{}</style></head>"
"<body><p>Hello <b>world</b></p>"
"<script>alert('x')</script></body></html>"
)
cleaned = cleanEmailBody(html)
assert "Hello" in cleaned
assert "world" in cleaned
assert "<" not in cleaned
assert "alert" not in cleaned
def test_strips_quoted_reply_english():
body = (
"Actual answer from me.\n\n"
"On Mon, 1 Jan 2024 at 10:00, Someone <s@x.com> wrote:\n"
"> Original question?\n"
"> Second line.\n"
)
cleaned = cleanEmailBody(body)
assert "Actual answer" in cleaned
assert "Original question" not in cleaned
assert "wrote:" not in cleaned
def test_strips_quoted_reply_german():
body = (
"Meine Antwort.\n\n"
"Am 1. Januar 2024 um 10:00 schrieb Max Muster <m@x.com>:\n"
"> Ursprüngliche Frage?\n"
)
cleaned = cleanEmailBody(body)
assert "Meine Antwort" in cleaned
assert "Ursprüngliche Frage" not in cleaned
def test_strips_signature_after_dashes():
body = (
"Kurze Nachricht.\n"
"\n"
"--\n"
"Max Muster\n"
"Vorstand, Beispiel GmbH\n"
)
cleaned = cleanEmailBody(body)
assert "Kurze Nachricht" in cleaned
assert "Beispiel GmbH" not in cleaned
def test_strips_signature_salutation_de():
body = (
"Die eigentliche Information steht hier.\n\n"
"Mit freundlichen Grüßen\n"
"Max Muster"
)
cleaned = cleanEmailBody(body)
assert "eigentliche Information" in cleaned
assert "Max Muster" not in cleaned
def test_truncate_to_max_chars():
body = "abc " * 5000
cleaned = cleanEmailBody(body, maxChars=200)
assert len(cleaned) <= 201 # includes trailing ellipsis
def test_empty_input_returns_empty_string():
assert cleanEmailBody("") == ""
assert cleanEmailBody(None) == "" # type: ignore[arg-type]
def test_collapses_whitespace():
body = "A lot of spaces\n\n\n\nand blank lines"
cleaned = cleanEmailBody(body)
assert " " not in cleaned
assert "\n\n\n" not in cleaned
if __name__ == "__main__":
test_strips_html_tags_and_scripts()
test_strips_quoted_reply_english()
test_strips_quoted_reply_german()
test_strips_signature_after_dashes()
test_strips_signature_salutation_de()
test_truncate_to_max_chars()
test_empty_input_returns_empty_string()
test_collapses_whitespace()
print("OK — cleanEmailBody tests passed")

View file

@ -0,0 +1,119 @@
#!/usr/bin/env python3
# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
"""Purge tests for KnowledgeObjects.deleteFileContentIndexByConnectionId.
Ensures that a `connection.revoked` event wipes every FileContentIndex + chunk
linked to the given connectionId while leaving entries from other connections
(or upload-files with connectionId=None) intact.
"""
import os
import sys
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "../../.."))
from modules.datamodels.datamodelKnowledge import FileContentIndex, ContentChunk
from modules.interfaces.interfaceDbKnowledge import KnowledgeObjects
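# The behaviour under test can be pictured roughly as below: a sketch under
# assumed semantics, not the method on KnowledgeObjects. Select all index rows
# for the connection, delete their chunks by fileId, delete the index rows,
# and report both counts.
def _illustrativePurgeByConnection(db, connectionId: str) -> dict:
    if not connectionId:
        return {"indexRows": 0, "chunks": 0}
    rows = db.getRecordset(FileContentIndex, {"connectionId": connectionId})
    chunkCount = 0
    for row in rows:
        for chunk in db.getRecordset(ContentChunk, {"fileId": row["id"]}):
            if db.recordDelete(ContentChunk, chunk["id"]):
                chunkCount += 1
        db.recordDelete(FileContentIndex, row["id"])
    return {"indexRows": len(rows), "chunks": chunkCount}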
class _FakeDb:
"""Minimal in-memory stand-in for ``KnowledgeObjects.db``.
Supports just the subset of APIs that deleteFileContentIndexByConnectionId
touches: getRecordset(FileContentIndex|ContentChunk, ...) + recordDelete.
"""
def __init__(self):
self.indexRows: dict = {}
self.chunks: dict = {}
def addIndex(self, row: dict) -> None:
self.indexRows[row["id"]] = row
def addChunk(self, row: dict) -> None:
self.chunks[row["id"]] = row
def getRecordset(self, modelClass, recordFilter=None, **_):
filter_ = recordFilter or {}
if modelClass is FileContentIndex:
rows = list(self.indexRows.values())
elif modelClass is ContentChunk:
rows = list(self.chunks.values())
else:
return []
def match(row):
for k, v in filter_.items():
if row.get(k) != v:
return False
return True
return [r for r in rows if match(r)]
def recordDelete(self, modelClass, recordId):
if modelClass is FileContentIndex:
return self.indexRows.pop(recordId, None) is not None
if modelClass is ContentChunk:
return self.chunks.pop(recordId, None) is not None
return False
def _buildKnowledge():
"""Instantiate KnowledgeObjects without triggering the real DB bootstrap."""
ko = KnowledgeObjects.__new__(KnowledgeObjects)
ko.currentUser = None
ko.userId = None
ko._scopeCache = {}
ko.db = _FakeDb()
return ko
def test_purge_by_connection_removes_only_matching_rows():
ko = _buildKnowledge()
ko.db.addIndex({"id": "sp1", "connectionId": "cx", "mandateId": "m1", "sourceKind": "sharepoint_item"})
ko.db.addIndex({"id": "sp2", "connectionId": "cx", "mandateId": "m1", "sourceKind": "sharepoint_item"})
ko.db.addIndex({"id": "upload", "connectionId": None, "mandateId": "m1", "sourceKind": "file"})
ko.db.addIndex({"id": "other", "connectionId": "cy", "mandateId": "m1", "sourceKind": "outlook_message"})
ko.db.addChunk({"id": "c1", "fileId": "sp1"})
ko.db.addChunk({"id": "c2", "fileId": "sp1"})
ko.db.addChunk({"id": "c3", "fileId": "sp2"})
ko.db.addChunk({"id": "c4", "fileId": "upload"})
ko.db.addChunk({"id": "c5", "fileId": "other"})
result = ko.deleteFileContentIndexByConnectionId("cx")
assert result == {"indexRows": 2, "chunks": 3}
assert "sp1" not in ko.db.indexRows
assert "sp2" not in ko.db.indexRows
assert "upload" in ko.db.indexRows
assert "other" in ko.db.indexRows
assert set(ko.db.chunks.keys()) == {"c4", "c5"}
def test_purge_with_empty_connection_id_is_a_noop():
ko = _buildKnowledge()
ko.db.addIndex({"id": "sp1", "connectionId": "cx"})
ko.db.addChunk({"id": "c1", "fileId": "sp1"})
result = ko.deleteFileContentIndexByConnectionId("")
assert result == {"indexRows": 0, "chunks": 0}
assert "sp1" in ko.db.indexRows
def test_purge_unknown_connection_returns_zero():
ko = _buildKnowledge()
ko.db.addIndex({"id": "sp1", "connectionId": "cx"})
result = ko.deleteFileContentIndexByConnectionId("nope")
assert result == {"indexRows": 0, "chunks": 0}
assert "sp1" in ko.db.indexRows
if __name__ == "__main__":
test_purge_by_connection_removes_only_matching_rows()
test_purge_with_empty_connection_id_is_a_noop()
test_purge_unknown_connection_returns_zero()
print("OK — connection-purge tests passed")

View file

@ -0,0 +1,124 @@
#!/usr/bin/env python3
# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
"""Test that runExtraction preserves per-part granularity when mergeStrategy=None.
The default MergeStrategy concatenates all text parts into a single ContentPart, which
collapses multi-page documents into one blob. This destroys RAG retrieval because every
document ends up as a single ContentChunk with a "blurred average" embedding.
Ingestion pipelines (requestIngestion callers) MUST pass mergeStrategy=None to preserve
per-page / per-section chunks.
"""
import os
import sys
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "../../.."))
from modules.datamodels.datamodelExtraction import (
ContentPart,
ExtractionOptions,
MergeStrategy,
)
from modules.serviceCenter.services.serviceExtraction.subPipeline import runExtraction
from modules.serviceCenter.services.serviceExtraction.subRegistry import (
ChunkerRegistry,
Extractor,
ExtractorRegistry,
)
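# Conceptually, the legacy merge collapses N text parts into one. A minimal
# sketch of that behaviour (an assumption for illustration, not the code in
# subPipeline) looks like this:
def _illustrativeMergeTextParts(parts):
    textParts = [p for p in parts if p.typeGroup == "text"]
    if len(textParts) <= 1:
        return parts
    merged = ContentPart(
        id="merged-text",
        parentId=None,
        label="merged",
        typeGroup="text",
        mimeType="text/plain",
        data="\n\n".join(p.data for p in textParts),
        metadata={"mergedCount": len(textParts)},
    )
    return [merged] + [p for p in parts if p.typeGroup != "text"]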
class _FakeMultiPagePdfExtractor(Extractor):
"""Emits one text ContentPart per simulated page."""
def __init__(self, pageCount: int = 10):
self.pageCount = pageCount
def detect(self, fileName: str, mimeType: str, headBytes: bytes) -> bool:
return mimeType == "application/pdf"
def getSupportedExtensions(self):
return [".pdf"]
def getSupportedMimeTypes(self):
return ["application/pdf"]
def extract(self, fileBytes: bytes, context):
return [
ContentPart(
id=f"page-{i}",
parentId=None,
label=f"page_{i + 1}",
typeGroup="text",
mimeType="text/plain",
data=f"Page {i + 1} content — distinct semantic anchor #{i}",
metadata={"pageIndex": i, "size": 64},
)
for i in range(self.pageCount)
]
def _buildRegistry(pageCount: int) -> ExtractorRegistry:
registry = ExtractorRegistry()
fake = _FakeMultiPagePdfExtractor(pageCount)
registry.register("application/pdf", fake)
registry.register("pdf", fake)
return registry
def test_default_options_merge_all_text_parts_into_one():
"""Regression safeguard: default ExtractionOptions still merges (legacy behaviour).
Non-ingestion callers (AI processing, summarization) rely on this default.
"""
registry = _buildRegistry(pageCount=5)
extracted = runExtraction(
registry, ChunkerRegistry(), b"", "sample.pdf", "application/pdf",
ExtractionOptions(),
)
textParts = [p for p in extracted.parts if p.typeGroup == "text"]
assert len(textParts) == 1, (
f"Default options should merge all text parts into one, got {len(textParts)}"
)
assert "Page 1" in textParts[0].data and "Page 5" in textParts[0].data, (
"Merged text should contain content from all pages"
)
print("test_default_options_merge_all_text_parts_into_one [PASS]")
def test_merge_none_preserves_all_text_parts():
"""Core fix: mergeStrategy=None preserves per-page granularity for RAG ingestion."""
registry = _buildRegistry(pageCount=500)
extracted = runExtraction(
registry, ChunkerRegistry(), b"", "sample.pdf", "application/pdf",
ExtractionOptions(mergeStrategy=None),
)
textParts = [p for p in extracted.parts if p.typeGroup == "text"]
assert len(textParts) == 500, (
f"mergeStrategy=None should preserve all 500 text parts, got {len(textParts)}"
)
assert textParts[0].label == "page_1"
assert textParts[-1].label == "page_500"
print("test_merge_none_preserves_all_text_parts [PASS]")
def test_explicit_merge_strategy_still_merges():
"""Callers can still opt in to merging by passing an explicit MergeStrategy."""
registry = _buildRegistry(pageCount=3)
extracted = runExtraction(
registry, ChunkerRegistry(), b"", "sample.pdf", "application/pdf",
ExtractionOptions(mergeStrategy=MergeStrategy()),
)
textParts = [p for p in extracted.parts if p.typeGroup == "text"]
assert len(textParts) == 1, (
f"Explicit MergeStrategy should merge, got {len(textParts)} parts"
)
print("test_explicit_merge_strategy_still_merges [PASS]")
if __name__ == "__main__":
test_default_options_merge_all_text_parts_into_one()
test_merge_none_preserves_all_text_parts()
test_explicit_merge_strategy_still_merges()
print("\nAll merge-strategy tests passed.")

View file

@ -0,0 +1,81 @@
#!/usr/bin/env python3
# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
"""Test that _computeIngestionHash is stable across re-extractions of the same source.
Extractors generate fresh contentObjectIds (uuid.uuid4()) per run. The ingestion
hash MUST therefore be derived from content (contentType + data + order) only
otherwise idempotency (AC4) silently fails: every re-extraction looks "new" and
triggers full re-embedding.
"""
import os
import sys
import uuid
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "../../.."))
from modules.serviceCenter.services.serviceKnowledge.mainServiceKnowledge import (
_computeIngestionHash,
)
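# The property being tested is "content-derived, id-agnostic, order-sensitive".
# A hash with that property could look like the sketch below; this is an
# assumed shape for illustration, not the implementation imported above.
import hashlib
import json

def _illustrativeIngestionHash(contentObjects) -> str:
    # contentObjectId is deliberately excluded; list order is preserved.
    canonical = [
        {"contentType": o.get("contentType"), "data": o.get("data")}
        for o in (contentObjects or [])
    ]
    payload = json.dumps(canonical, ensure_ascii=False, sort_keys=True)
    return hashlib.sha256(payload.encode("utf-8")).hexdigest()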
def _makeObjects(seed: str = "alpha"):
"""Build a synthetic contentObjects list as routeDataFiles._autoIndexFile would."""
return [
{
"contentObjectId": str(uuid.uuid4()),
"contentType": "text",
"data": f"Page 1 of {seed}",
},
{
"contentObjectId": str(uuid.uuid4()),
"contentType": "text",
"data": f"Page 2 of {seed}",
},
{
"contentObjectId": str(uuid.uuid4()),
"contentType": "binary",
"data": "<image-bytes-as-b64>",
},
]
def test_hash_stable_across_uuid_regeneration():
"""Same content + different contentObjectIds → same hash."""
a = _makeObjects("alpha")
b = _makeObjects("alpha") # identical data, fresh UUIDs
assert [o["contentObjectId"] for o in a] != [o["contentObjectId"] for o in b]
assert _computeIngestionHash(a) == _computeIngestionHash(b)
def test_hash_changes_when_data_changes():
a = _makeObjects("alpha")
b = _makeObjects("beta")
assert _computeIngestionHash(a) != _computeIngestionHash(b)
def test_hash_is_order_sensitive():
"""Reordered pages produce a different hash (different document)."""
a = _makeObjects("alpha")
b = list(reversed(a))
assert _computeIngestionHash(a) != _computeIngestionHash(b)
def test_hash_distinguishes_text_vs_binary_with_same_payload():
a = [{"contentObjectId": "x", "contentType": "text", "data": "hello"}]
b = [{"contentObjectId": "x", "contentType": "binary", "data": "hello"}]
assert _computeIngestionHash(a) != _computeIngestionHash(b)
def test_hash_handles_empty_input():
assert _computeIngestionHash([]) == _computeIngestionHash([])
if __name__ == "__main__":
test_hash_stable_across_uuid_regeneration()
test_hash_changes_when_data_changes()
test_hash_is_order_sensitive()
test_hash_distinguishes_text_vs_binary_with_same_payload()
test_hash_handles_empty_input()
print("OK — all 5 ingestion-hash stability tests passed")

View file

@ -0,0 +1,235 @@
#!/usr/bin/env python3
# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
"""Unit tests for KnowledgeIngestionConsumer event dispatch.
- `connection.established` → enqueues a `connection.bootstrap` job.
- `connection.revoked` → synchronous purge via KnowledgeObjects.
"""
import asyncio
import os
import sys
import types
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "../../.."))
from modules.serviceCenter.services.serviceKnowledge import subConnectorIngestConsumer as consumer
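# Reduced to a sketch, the dispatch exercised below looks roughly like this
# (assumed semantics for illustration, not the handlers imported above):
# "connection.established" fans out to a background bootstrap job, while
# "connection.revoked" purges synchronously; both ignore empty connection ids.
def _illustrativeDispatch(event: str, connectionId: str, enqueueFn, purgeFn):
    if not connectionId:
        return None
    if event == "connection.established":
        return enqueueFn({"connectionId": connectionId})
    if event == "connection.revoked":
        return purgeFn(connectionId)
    return None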
def _resetRegistration(monkeypatch):
"""Force the module-level guard to register fresh in each test."""
monkeypatch.setattr(consumer, "_registered", False)
def test_onConnectionEstablished_enqueues_bootstrap(monkeypatch):
startedJobs = []
async def _fakeStartJob(jobType, payload, **kwargs):
startedJobs.append({"jobType": jobType, "payload": payload, "kwargs": kwargs})
return "job-1"
monkeypatch.setattr(consumer, "startJob", _fakeStartJob)
consumer._onConnectionEstablished(
connectionId="c1", authority="msft", userId="u1"
)
# Drain pending tasks created by the consumer.
loop = asyncio.new_event_loop()
try:
asyncio.set_event_loop(loop)
# If the consumer created a Task on a closed loop, the fake startJob was
# still called synchronously via asyncio.run; in either case we check the
# recorded call.
finally:
loop.close()
assert len(startedJobs) == 1
assert startedJobs[0]["jobType"] == consumer.BOOTSTRAP_JOB_TYPE
assert startedJobs[0]["payload"]["connectionId"] == "c1"
assert startedJobs[0]["payload"]["authority"] == "msft"
assert startedJobs[0]["kwargs"]["triggeredBy"] == "u1"
def test_onConnectionEstablished_ignores_missing_id(monkeypatch):
called = []
async def _fakeStartJob(*a, **kw):
called.append(1)
return "x"
monkeypatch.setattr(consumer, "startJob", _fakeStartJob)
consumer._onConnectionEstablished(connectionId="", authority="msft")
assert called == []
def test_onConnectionRevoked_runs_sync_purge(monkeypatch):
class _FakeKnowledge:
def __init__(self):
self.calls = []
def deleteFileContentIndexByConnectionId(self, cid):
self.calls.append(cid)
return {"indexRows": 2, "chunks": 5}
fakeKnow = _FakeKnowledge()
def _fakeGetInterface(_user=None):
return fakeKnow
monkeypatch.setattr(consumer, "getKnowledgeInterface", _fakeGetInterface)
consumer._onConnectionRevoked(
connectionId="c1", authority="msft", userId="u1", reason="disconnected"
)
assert fakeKnow.calls == ["c1"]
def test_onConnectionRevoked_ignores_missing_id(monkeypatch):
seen = []
def _fakeGetInterface(_user=None):
class _K:
def deleteFileContentIndexByConnectionId(self, cid):
seen.append(cid)
return {"indexRows": 0, "chunks": 0}
return _K()
monkeypatch.setattr(consumer, "getKnowledgeInterface", _fakeGetInterface)
consumer._onConnectionRevoked(connectionId="")
assert seen == []
def test_bootstrap_job_skips_unsupported_authority(monkeypatch):
async def _run():
result = await consumer._bootstrapJobHandler(
{"payload": {"connectionId": "c1", "authority": "slack"}},
lambda *_: None,
)
return result
result = asyncio.run(_run())
assert result["skipped"] is True
assert result["authority"] == "slack"
assert result["reason"] == "unsupported_authority"
def test_bootstrap_job_dispatches_msft_parts(monkeypatch):
calls = {"sp": 0, "ol": 0}
async def _fakeSp(connectionId, progressCb=None):
calls["sp"] += 1
return {"indexed": 1}
async def _fakeOl(connectionId, progressCb=None):
calls["ol"] += 1
return {"indexed": 2}
fakeSharepoint = types.ModuleType("subConnectorSyncSharepoint")
fakeSharepoint.bootstrapSharepoint = _fakeSp
fakeOutlook = types.ModuleType("subConnectorSyncOutlook")
fakeOutlook.bootstrapOutlook = _fakeOl
monkeypatch.setitem(
sys.modules,
"modules.serviceCenter.services.serviceKnowledge.subConnectorSyncSharepoint",
fakeSharepoint,
)
monkeypatch.setitem(
sys.modules,
"modules.serviceCenter.services.serviceKnowledge.subConnectorSyncOutlook",
fakeOutlook,
)
async def _run():
return await consumer._bootstrapJobHandler(
{"payload": {"connectionId": "c1", "authority": "msft"}},
lambda *_: None,
)
result = asyncio.run(_run())
assert calls == {"sp": 1, "ol": 1}
assert result["sharepoint"] == {"indexed": 1}
assert result["outlook"] == {"indexed": 2}
def test_bootstrap_job_dispatches_google_parts(monkeypatch):
calls = {"gd": 0, "gm": 0}
async def _fakeGd(connectionId, progressCb=None):
calls["gd"] += 1
return {"indexed": 7}
async def _fakeGm(connectionId, progressCb=None):
calls["gm"] += 1
return {"indexed": 11}
fakeGdrive = types.ModuleType("subConnectorSyncGdrive")
fakeGdrive.bootstrapGdrive = _fakeGd
fakeGmail = types.ModuleType("subConnectorSyncGmail")
fakeGmail.bootstrapGmail = _fakeGm
monkeypatch.setitem(
sys.modules,
"modules.serviceCenter.services.serviceKnowledge.subConnectorSyncGdrive",
fakeGdrive,
)
monkeypatch.setitem(
sys.modules,
"modules.serviceCenter.services.serviceKnowledge.subConnectorSyncGmail",
fakeGmail,
)
async def _run():
return await consumer._bootstrapJobHandler(
{"payload": {"connectionId": "c1", "authority": "google"}},
lambda *_: None,
)
result = asyncio.run(_run())
assert calls == {"gd": 1, "gm": 1}
assert result["drive"] == {"indexed": 7}
assert result["gmail"] == {"indexed": 11}
def test_bootstrap_job_dispatches_clickup_part(monkeypatch):
calls = {"cu": 0}
async def _fakeCu(connectionId, progressCb=None):
calls["cu"] += 1
return {"indexed": 4}
fakeClickup = types.ModuleType("subConnectorSyncClickup")
fakeClickup.bootstrapClickup = _fakeCu
monkeypatch.setitem(
sys.modules,
"modules.serviceCenter.services.serviceKnowledge.subConnectorSyncClickup",
fakeClickup,
)
async def _run():
return await consumer._bootstrapJobHandler(
{"payload": {"connectionId": "c1", "authority": "clickup"}},
lambda *_: None,
)
result = asyncio.run(_run())
assert calls == {"cu": 1}
assert result["clickup"] == {"indexed": 4}
if __name__ == "__main__":
# These tests rely on pytest's monkeypatch fixture; the CLI path only points to pytest.
class _MP:
def __init__(self):
self.undos = []
def setattr(self, target, name_or_value, value=None):
if value is None:
# pytest's two-argument setattr form (string target) is not supported by this stub.
raise SystemExit("use pytest monkeypatch in CLI")
self.undos.append((target, name_or_value, getattr(target, name_or_value)))
setattr(target, name_or_value, value)
def setitem(self, mapping, key, value):
self.undos.append((mapping, key, mapping.get(key)))
mapping[key] = value
print("Run via pytest: pytest tests/unit/services/test_knowledge_ingest_consumer.py")

View file

@ -0,0 +1,298 @@
#!/usr/bin/env python3
"""Unit tests for P1d: consent gating, preference parsing, and walker behaviour.
Tests
-----
1. Bootstrap runner skips when ``knowledgeIngestionEnabled=False``.
2. ``loadConnectionPrefs`` returns safe defaults when preferences are absent.
3. ``loadConnectionPrefs`` maps all §2.6 keys correctly from a full prefs dict.
4. Gmail walker passes ``neutralize=True`` and ``mailContentDepth`` to IngestionJob.
5. Gmail walker produces only a header content-object when depth="metadata".
6. ClickUp walker skips description when scope="titles".
"""
from __future__ import annotations
import asyncio
import os
import sys
import types
import unittest
from typing import Any, Dict, Optional
from unittest.mock import AsyncMock, MagicMock, patch
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "../../.."))
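# The preference mapping exercised in TestLoadConnectionPrefs can be pictured
# roughly as below. Field names and the asserted defaults mirror the tests;
# the remaining defaults are assumptions, and this dataclass is a sketch, not
# the one imported from subConnectorPrefs.
from dataclasses import dataclass

@dataclass
class _IllustrativePrefs:
    neutralizeBeforeEmbed: bool = False
    mailContentDepth: str = "full"           # metadata | snippet | full
    mailIndexAttachments: bool = False
    filesIndexBinaries: bool = True          # default assumed
    clickupScope: str = "title_description"  # titles | title_description | with_comments
    maxAgeDays: int = 90
    gmailEnabled: bool = True
    driveEnabled: bool = True
    sharepointEnabled: bool = True           # default assumed
    outlookEnabled: bool = True              # default assumed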
# ---------------------------------------------------------------------------
# 1. Bootstrap runner consent gate
# ---------------------------------------------------------------------------
class TestBootstrapConsentGate(unittest.TestCase):
"""_bootstrapJobHandler must no-op when knowledgeIngestionEnabled is False."""
def _makeJob(self, connectionId="c-test", authority="google"):
return {"payload": {"connectionId": connectionId, "authority": authority}}
def _makeConn(self, enabled: bool):
conn = MagicMock()
conn.knowledgeIngestionEnabled = enabled
return conn
def test_skips_when_consent_disabled(self):
from modules.serviceCenter.services.serviceKnowledge import subConnectorIngestConsumer as sut
fake_root = MagicMock()
fake_root.getUserConnectionById.return_value = self._makeConn(False)
with patch("modules.interfaces.interfaceDbApp.getRootInterface", return_value=fake_root):
result = asyncio.get_event_loop().run_until_complete(
sut._bootstrapJobHandler(self._makeJob(), lambda *a: None)
)
assert result.get("skipped") is True
assert result.get("reason") == "consent_disabled"
fake_root.getUserConnectionById.assert_called_once_with("c-test")
def test_proceeds_when_consent_enabled(self):
"""When consent is enabled, the handler should call at least one walker."""
from modules.serviceCenter.services.serviceKnowledge import subConnectorIngestConsumer as sut
fake_root = MagicMock()
fake_root.getUserConnectionById.return_value = self._makeConn(True)
# Patch the inner walker so it doesn't do real I/O.
async def _fakeBootstrap(**kwargs):
return {"indexed": 0}
with (
patch("modules.interfaces.interfaceDbApp.getRootInterface", return_value=fake_root),
patch(
"modules.serviceCenter.services.serviceKnowledge.subConnectorSyncGdrive.bootstrapGdrive",
new=AsyncMock(return_value={"indexed": 0}),
),
patch(
"modules.serviceCenter.services.serviceKnowledge.subConnectorSyncGmail.bootstrapGmail",
new=AsyncMock(return_value={"indexed": 0}),
),
):
result = asyncio.get_event_loop().run_until_complete(
sut._bootstrapJobHandler(self._makeJob(authority="google"), lambda *a: None)
)
# Should not have 'skipped' at the top level.
assert result.get("skipped") is not True
assert result.get("authority") == "google"
# ---------------------------------------------------------------------------
# 2 + 3. loadConnectionPrefs
# ---------------------------------------------------------------------------
class TestLoadConnectionPrefs(unittest.TestCase):
def _makeConn(self, prefs: Optional[Dict[str, Any]]):
conn = MagicMock()
conn.knowledgePreferences = prefs
return conn
def _mockRoot(self, prefs):
root = MagicMock()
root.getUserConnectionById.return_value = self._makeConn(prefs)
return root
def test_returns_safe_defaults_when_prefs_none(self):
from modules.serviceCenter.services.serviceKnowledge.subConnectorPrefs import (
ConnectionIngestionPrefs,
loadConnectionPrefs,
)
with patch("modules.interfaces.interfaceDbApp.getRootInterface", return_value=self._mockRoot(None)):
prefs = loadConnectionPrefs("x")
assert prefs.neutralizeBeforeEmbed is False
assert prefs.mailContentDepth == "full"
assert prefs.mailIndexAttachments is False
assert prefs.maxAgeDays == 90
assert prefs.clickupScope == "title_description"
assert prefs.gmailEnabled is True
assert prefs.driveEnabled is True
def test_maps_all_keys(self):
from modules.serviceCenter.services.serviceKnowledge.subConnectorPrefs import loadConnectionPrefs
raw = {
"neutralizeBeforeEmbed": True,
"mailContentDepth": "metadata",
"mailIndexAttachments": True,
"filesIndexBinaries": False,
"clickupScope": "with_comments",
"maxAgeDays": 30,
"surfaceToggles": {
"google": {"gmail": False, "drive": True},
"msft": {"sharepoint": False, "outlook": True},
},
}
with patch("modules.interfaces.interfaceDbApp.getRootInterface", return_value=self._mockRoot(raw)):
prefs = loadConnectionPrefs("x")
assert prefs.neutralizeBeforeEmbed is True
assert prefs.mailContentDepth == "metadata"
assert prefs.mailIndexAttachments is True
assert prefs.filesIndexBinaries is False
assert prefs.clickupScope == "with_comments"
assert prefs.maxAgeDays == 30
assert prefs.gmailEnabled is False
assert prefs.driveEnabled is True
assert prefs.sharepointEnabled is False
assert prefs.outlookEnabled is True
def test_invalid_depth_falls_back_to_default(self):
from modules.serviceCenter.services.serviceKnowledge.subConnectorPrefs import loadConnectionPrefs
raw = {"mailContentDepth": "everything_please"}
with patch("modules.interfaces.interfaceDbApp.getRootInterface", return_value=self._mockRoot(raw)):
prefs = loadConnectionPrefs("x")
assert prefs.mailContentDepth == "full"
# ---------------------------------------------------------------------------
# 4. Gmail walker passes neutralize + mailContentDepth to IngestionJob
# ---------------------------------------------------------------------------
class TestGmailWalkerPrefs(unittest.TestCase):
def _make_message(self, *, subject="Test", snippet="hello", body_text="full body"):
import base64
encoded = base64.urlsafe_b64encode(body_text.encode()).decode()
return {
"id": "msg-1",
"historyId": "h-42",
"threadId": "t-1",
"snippet": snippet,
"payload": {
"mimeType": "multipart/alternative",
"headers": [
{"name": "Subject", "value": subject},
{"name": "From", "value": "alice@example.com"},
{"name": "To", "value": "bob@example.com"},
{"name": "Date", "value": "Mon, 20 Apr 2026 10:00:00 +0000"},
],
"parts": [
{
"mimeType": "text/plain",
"body": {"data": encoded},
}
],
},
}
def test_neutralize_flag_forwarded(self):
from modules.serviceCenter.services.serviceKnowledge.subConnectorSyncGmail import (
GmailBootstrapLimits,
_ingestMessage,
GmailBootstrapResult,
)
from modules.serviceCenter.services.serviceKnowledge.mainServiceKnowledge import IngestionJob
captured_jobs = []
async def fake_requestIngestion(job: IngestionJob):
captured_jobs.append(job)
return MagicMock(status="indexed", error=None)
ks = MagicMock()
ks.requestIngestion = fake_requestIngestion
limits = GmailBootstrapLimits(neutralize=True, mailContentDepth="full")
result = GmailBootstrapResult(connectionId="c-1")
asyncio.get_event_loop().run_until_complete(
_ingestMessage(
googleGetFn=AsyncMock(return_value={}),
knowledgeService=ks,
connectionId="c-1",
mandateId="",
userId="u-1",
labelId="INBOX",
message=self._make_message(),
limits=limits,
result=result,
progressCb=None,
)
)
assert len(captured_jobs) == 1
assert captured_jobs[0].neutralize is True
def test_metadata_depth_yields_only_header(self):
from modules.serviceCenter.services.serviceKnowledge.subConnectorSyncGmail import (
_buildContentObjects,
)
message = self._make_message(snippet="hi", body_text="should be excluded")
parts = _buildContentObjects(message, maxBodyChars=4000, mailContentDepth="metadata")
ids = [p["contentObjectId"] for p in parts]
assert ids == ["header"]
def test_snippet_depth_yields_header_and_snippet(self):
from modules.serviceCenter.services.serviceKnowledge.subConnectorSyncGmail import (
_buildContentObjects,
)
message = self._make_message(snippet="hi", body_text="should be excluded")
parts = _buildContentObjects(message, maxBodyChars=4000, mailContentDepth="snippet")
ids = [p["contentObjectId"] for p in parts]
assert "header" in ids
assert "snippet" in ids
assert "body" not in ids
# ---------------------------------------------------------------------------
# 5. ClickUp walker respects clickupScope="titles"
# ---------------------------------------------------------------------------
class TestClickupWalkerScope(unittest.TestCase):
def _make_task(self):
return {
"id": "task-1",
"name": "Ship feature X",
"date_updated": "1713888000000",
"description": "This should be omitted",
"text_content": "Also omitted",
"status": {"status": "open"},
"assignees": [],
"tags": [],
"list": {"name": "Backlog"},
"folder": {},
"space": {"name": "Engineering"},
}
def test_titles_scope_omits_description(self):
from modules.serviceCenter.services.serviceKnowledge.subConnectorSyncClickup import (
ClickupBootstrapLimits,
_buildContentObjects,
)
limits = ClickupBootstrapLimits(clickupScope="titles")
parts = _buildContentObjects(self._make_task(), limits)
ids = [p["contentObjectId"] for p in parts]
assert ids == ["header"]
assert "description" not in ids
def test_with_description_scope_includes_description(self):
from modules.serviceCenter.services.serviceKnowledge.subConnectorSyncClickup import (
ClickupBootstrapLimits,
_buildContentObjects,
)
limits = ClickupBootstrapLimits(clickupScope="title_description")
parts = _buildContentObjects(self._make_task(), limits)
ids = [p["contentObjectId"] for p in parts]
assert "header" in ids
assert "description" in ids
if __name__ == "__main__":
unittest.main()